Merge branch 'master' into fix-context-window-slicing

Apply cond slice fix
2026-01-29 00:06:18 +08:00 · 2025-09-11 20:23:31 -07:00 · 2025-09-09 17:45:35 -07:00
326 changed files with 17829 additions and 43219 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@ -53,16 +53,6 @@ try:
    repo.stash(ident)
 except KeyError:
    print("nothing to stash")  # noqa: T201
-except:
-    print("Could not stash, cleaning index and trying again.")  # noqa: T201
-    repo.state_cleanup()
-    repo.index.read_tree(repo.head.peel().tree)
-    repo.index.write()
-    try:
-        repo.stash(ident)
-    except KeyError:
-        print("nothing to stash.")  # noqa: T201
-
 backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S'))
 print("creating backup branch: {}".format(backup_branch_name))  # noqa: T201
 try:
@ -76,10 +66,8 @@ if branch is None:
    try:
        ref = repo.lookup_reference('refs/remotes/origin/master')
    except:
-        print("fetching.")  # noqa: T201
-        for remote in repo.remotes:
-            if remote.name == "origin":
-                remote.fetch()
+        print("pulling.")  # noqa: T201
+        pull(repo)
        ref = repo.lookup_reference('refs/remotes/origin/master')
    repo.checkout(ref)
    branch = repo.lookup_branch('master')
@ -161,4 +149,3 @@ try:
        shutil.copy(stable_update_script, stable_update_script_to)
 except:
    pass
-
--- a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
@ -1,28 +0,0 @@
-As of the time of writing this you need this driver for best results:
-https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
-
-HOW TO RUN:
-
-If you have an AMD GPU:
-
-run_amd_gpu.bat
-
-If you have memory issues you can try disabling the smart memory management by running comfyui with:
-
-run_amd_gpu_disable_smart_memory.bat
-
-IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
-
-You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors
-
-
-RECOMMENDED WAY TO UPDATE:
-To update the ComfyUI code: update\update_comfyui.bat
-
-
-TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI:
-In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
-Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.
-
-
-
--- a/.ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt
--- a/.ci/windows_nvidia_base_files/run_cpu.bat
+++ b/.ci/windows_nvidia_base_files/run_cpu.bat
--- a/.ci/windows_amd_base_files/run_amd_gpu.bat
+++ b/.ci/windows_amd_base_files/run_amd_gpu.bat
--- a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
 pause
--- a/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat
+++ b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat
@ -1,3 +0,0 @@
-..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes
-echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
-pause
--- a/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat
+++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat
@ -1,3 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
-echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
-pause
--- a/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@ -1,3 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
-echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
-pause
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -8,15 +8,13 @@ body:
        Before submitting a **Bug Report**, please ensure the following:

        - **1:** You are running the latest version of ComfyUI.
-        - **2:** You have your ComfyUI logs and relevant workflow on hand and will post them in this bug report.
+        - **2:** You have looked at the existing bug reports and made sure this isn't already reported.
        - **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing
-        `--disable-all-custom-nodes` command line argument. If you have custom nodes, try updating them to the latest version.
+        `--disable-all-custom-nodes` command line argument.
        - **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact
        steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.

-        ## Very Important
-
-        Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored.
+        If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
  - type: checkboxes
    id: custom-nodes-test
    attributes:
--- a/.github/PULL_REQUEST_TEMPLATE/api-node.md
+++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md
@ -1,21 +0,0 @@
-<!-- API_NODE_PR_CHECKLIST: do not remove -->
-
-## API Node PR Checklist
-
-### Scope
- [ ] **Is API Node Change**
-
-### Pricing & Billing
- [ ] **Need pricing update**
- [ ] **No pricing update**
-
-If **Need pricing update**:
- [ ] Metronome rate cards updated
- [ ] Auto‑billing tests updated and passing
-
-### QA
- [ ] **QA done**
- [ ] **QA not required**
-
-### Comms
- [ ] Informed **Kosinkadink**
--- a/.github/workflows/api-node-template.yml
+++ b/.github/workflows/api-node-template.yml
@ -1,58 +0,0 @@
-name: Append API Node PR template
-
-on:
-  pull_request_target:
-    types: [opened, reopened, synchronize, ready_for_review]
-    paths:
-      - 'comfy_api_nodes/**'   # only run if these files changed
-
-permissions:
-  contents: read
-  pull-requests: write
-
-jobs:
-  inject:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Ensure template exists and append to PR body
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { owner, repo } = context.repo;
-            const number = context.payload.pull_request.number;
-            const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md';
-            const marker = '<!-- API_NODE_PR_CHECKLIST: do not remove -->';
-
-            const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number });
-
-            let templateText;
-            try {
-              const res = await github.rest.repos.getContent({
-                owner,
-                repo,
-                path: templatePath,
-                ref: pr.base.ref
-              });
-              const buf = Buffer.from(res.data.content, res.data.encoding || 'base64');
-              templateText = buf.toString('utf8');
-            } catch (e) {
-              core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`);
-              return;
-            }
-
-            // Enforce the presence of the marker inside the template (for idempotence)
-            if (!templateText.includes(marker)) {
-              core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`);
-              return;
-            }
-
-            // If the PR already contains the marker, do not append again.
-            const body = pr.body || '';
-            if (body.includes(marker)) {
-              core.info('Template already present in PR body; nothing to inject.');
-              return;
-            }
-
-            const newBody = (body ? body + '\n\n' : '') + templateText + '\n';
-            await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody });
-            core.notice('API Node template appended to PR description.');
--- a/.github/workflows/release-stable-all.yml
+++ b/.github/workflows/release-stable-all.yml
@ -1,78 +0,0 @@
-name: "Release Stable All Portable Versions"
-
-on:
-  workflow_dispatch:
-    inputs:
-      git_tag:
-        description: 'Git tag'
-        required: true
-        type: string
-
-jobs:
-  release_nvidia_default:
-    permissions:
-      contents: "write"
-      packages: "write"
-      pull-requests: "read"
-    name: "Release NVIDIA Default (cu130)"
-    uses: ./.github/workflows/stable-release.yml
-    with:
-      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "cu130"
-      python_minor: "13"
-      python_patch: "9"
-      rel_name: "nvidia"
-      rel_extra_name: ""
-      test_release: true
-    secrets: inherit
-
-  release_nvidia_cu128:
-    permissions:
-      contents: "write"
-      packages: "write"
-      pull-requests: "read"
-    name: "Release NVIDIA cu128"
-    uses: ./.github/workflows/stable-release.yml
-    with:
-      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "cu128"
-      python_minor: "12"
-      python_patch: "10"
-      rel_name: "nvidia"
-      rel_extra_name: "_cu128"
-      test_release: true
-    secrets: inherit
-
-  release_nvidia_cu126:
-    permissions:
-      contents: "write"
-      packages: "write"
-      pull-requests: "read"
-    name: "Release NVIDIA cu126"
-    uses: ./.github/workflows/stable-release.yml
-    with:
-      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "cu126"
-      python_minor: "12"
-      python_patch: "10"
-      rel_name: "nvidia"
-      rel_extra_name: "_cu126"
-      test_release: true
-    secrets: inherit
-
-  release_amd_rocm:
-    permissions:
-      contents: "write"
-      packages: "write"
-      pull-requests: "read"
-    name: "Release AMD ROCm 7.1.1"
-    uses: ./.github/workflows/stable-release.yml
-    with:
-      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "rocm711"
-      python_minor: "12"
-      python_patch: "10"
-      rel_name: "amd"
-      rel_extra_name: ""
-      test_release: false
-    secrets: inherit
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -21,28 +21,3 @@ jobs:

    - name: Run Ruff
      run: ruff check .
-
-  pylint:
-    name: Run Pylint
-    runs-on: ubuntu-latest
-
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.12'
-
-    - name: Install requirements
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements.txt
-
-    - name: Install Pylint
-      run: pip install pylint
-
-    - name: Run Pylint
-      run: pylint comfy_api_nodes
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@ -2,53 +2,17 @@
 name: "Release Stable Version"

 on:
-  workflow_call:
-    inputs:
-      git_tag:
-        description: 'Git tag'
-        required: true
-        type: string
-      cache_tag:
-        description: 'Cached dependencies tag'
-        required: true
-        type: string
-        default: "cu129"
-      python_minor:
-        description: 'Python minor version'
-        required: true
-        type: string
-        default: "13"
-      python_patch:
-        description: 'Python patch version'
-        required: true
-        type: string
-        default: "6"
-      rel_name:
-        description: 'Release name'
-        required: true
-        type: string
-        default: "nvidia"
-      rel_extra_name:
-        description: 'Release extra name'
-        required: false
-        type: string
-        default: ""
-      test_release:
-        description: 'Test Release'
-        required: true
-        type: boolean
-        default: true
  workflow_dispatch:
    inputs:
      git_tag:
        description: 'Git tag'
        required: true
        type: string
-      cache_tag:
-        description: 'Cached dependencies tag'
+      cu:
+        description: 'CUDA version'
        required: true
        type: string
-        default: "cu129"
+        default: "129"
      python_minor:
        description: 'Python minor version'
        required: true
@ -59,21 +23,7 @@ on:
        required: true
        type: string
        default: "6"
-      rel_name:
-        description: 'Release name'
-        required: true
-        type: string
-        default: "nvidia"
-      rel_extra_name:
-        description: 'Release extra name'
-        required: false
-        type: string
-        default: ""
-      test_release:
-        description: 'Test Release'
-        required: true
-        type: boolean
-        default: true
+

 jobs:
  package_comfy_windows:
@ -92,15 +42,15 @@ jobs:
        id: cache
        with:
          path: |
-            ${{ inputs.cache_tag }}_python_deps.tar
+            cu${{ inputs.cu }}_python_deps.tar
            update_comfyui_and_python_dependencies.bat
-          key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }}
+          key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
      - shell: bash
        run: |
-          mv ${{ inputs.cache_tag }}_python_deps.tar ../
+          mv cu${{ inputs.cu }}_python_deps.tar ../
          mv update_comfyui_and_python_dependencies.bat ../
          cd ..
-          tar xf ${{ inputs.cache_tag }}_python_deps.tar
+          tar xf cu${{ inputs.cu }}_python_deps.tar
          pwd
          ls

@ -115,19 +65,12 @@ jobs:
          echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
          curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
          ./python.exe get-pip.py
-          ./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/*
-
-          grep comfy ../ComfyUI/requirements.txt > ./requirements_comfyui.txt
-          ./python.exe -s -m pip install -r requirements_comfyui.txt
-          rm requirements_comfyui.txt
-
+          ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
          sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth

-          if test -f ./Lib/site-packages/torch/lib/dnnl.lib; then
-            rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
-            rm ./Lib/site-packages/torch/lib/libprotoc.lib
-            rm ./Lib/site-packages/torch/lib/libprotobuf.lib
-          fi
+          rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
+          rm ./Lib/site-packages/torch/lib/libprotoc.lib
+          rm ./Lib/site-packages/torch/lib/libprotobuf.lib

          cd ..

@ -142,18 +85,14 @@ jobs:

          mkdir update
          cp -r ComfyUI/.ci/update_windows/* ./update/
-          cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
+          cp -r ComfyUI/.ci/windows_base_files/* ./
          cp ../update_comfyui_and_python_dependencies.bat ./update/

          cd ..

          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
-          mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z
+          mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z

-      - shell: bash
-        if: ${{ inputs.test_release }}
-        run: |
-          cd ..
          cd ComfyUI_windows_portable
          python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu

@ -162,9 +101,10 @@ jobs:
          ls

      - name: Upload binaries to release
-        uses: softprops/action-gh-release@v2
+        uses: svenstaro/upload-release-action@v2
        with:
-          files: ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z
-          tag_name: ${{ inputs.git_tag }}
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ComfyUI_windows_portable_nvidia.7z
+          tag: ${{ inputs.git_tag }}
+          overwrite: true
          draft: true
-          overwrite_files: true
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@ -5,7 +5,6 @@ on:
  push:
    branches:
      - master
-      - release/**
    paths-ignore:
      - 'app/**'
      - 'input/**'
@ -22,15 +21,14 @@ jobs:
      fail-fast: false
      matrix:
        # os: [macos, linux, windows]
-        # os: [macos, linux]
-        os: [linux]
-        python_version: ["3.10", "3.11", "3.12"]
+        os: [macos, linux]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
        cuda_version: ["12.1"]
        torch_version: ["stable"]
        include:
-          # - os: macos
-          #   runner_label: [self-hosted, macOS]
-          #   flags: "--use-pytorch-cross-attention"
+          - os: macos
+            runner_label: [self-hosted, macOS]
+            flags: "--use-pytorch-cross-attention"
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
@ -75,15 +73,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # os: [macos, linux]
-        os: [linux]
+        os: [macos, linux]
        python_version: ["3.11"]
        cuda_version: ["12.1"]
        torch_version: ["nightly"]
        include:
-          # - os: macos
-          #   runner_label: [self-hosted, macOS]
-          #   flags: "--use-pytorch-cross-attention"
+          - os: macos
+            runner_label: [self-hosted, macOS]
+            flags: "--use-pytorch-cross-attention"
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
--- a/.github/workflows/test-execution.yml
+++ b/.github/workflows/test-execution.yml
@ -2,9 +2,9 @@ name: Execution Tests

 on:
  push:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]
  pull_request:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]

 jobs:
  test:
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@ -2,9 +2,9 @@ name: Test server launches without errors

 on:
  push:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]
  pull_request:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]

 jobs:
  test:
@ -32,9 +32,7 @@ jobs:
      working-directory: ComfyUI
    - name: Check for unhandled exceptions in server log
      run: |
-        grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': True, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" console_output.log | grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': False, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" > console_output_filtered.log
-        cat console_output_filtered.log
-        if grep -qE "Exception|Error" console_output_filtered.log; then
+        if grep -qE "Exception|Error" console_output.log; then
          echo "Unhandled exception/error found in server log."
          exit 1
        fi
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@ -2,15 +2,15 @@ name: Unit Tests

 on:
  push:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]
  pull_request:
-    branches: [ main, master, release/** ]
+    branches: [ main, master ]

 jobs:
  test:
    strategy:
      matrix:
-        os: [ubuntu-latest, windows-2022, macos-latest]
+        os: [ubuntu-latest, windows-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    continue-on-error: true
    steps:
--- a/.github/workflows/update-version.yml
+++ b/.github/workflows/update-version.yml
@ -6,7 +6,6 @@ on:
      - "pyproject.toml"
    branches:
      - master
-      - release/**

 jobs:
  update-version:
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@ -17,7 +17,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "130"
+        default: "129"

      python_minor:
        description: 'python minor version'
@ -29,7 +29,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "9"
+        default: "6"
 #  push:
 #    branches:
 #      - master
@ -56,8 +56,7 @@ jobs:
            ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
            pause" > update_comfyui_and_python_dependencies.bat

-            grep -v comfyui requirements.txt > requirements_nocomfyui.txt
-            python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir
+            python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
            python -m pip install --no-cache-dir ./temp_wheel_dir/*
            echo installed basic
            ls -lah temp_wheel_dir
--- a/.github/workflows/windows_release_dependencies_manual.yml
+++ b/.github/workflows/windows_release_dependencies_manual.yml
@ -1,64 +0,0 @@
-name: "Windows Release dependencies Manual"
-
-on:
-  workflow_dispatch:
-    inputs:
-      torch_dependencies:
-        description: 'torch dependencies'
-        required: false
-        type: string
-        default: "torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128"
-      cache_tag:
-        description: 'Cached dependencies tag'
-        required: true
-        type: string
-        default: "cu128"
-
-      python_minor:
-        description: 'python minor version'
-        required: true
-        type: string
-        default: "12"
-
-      python_patch:
-        description: 'python patch version'
-        required: true
-        type: string
-        default: "10"
-
-jobs:
-  build_dependencies:
-    runs-on: windows-latest
-    steps:
-        - uses: actions/checkout@v4
-        - uses: actions/setup-python@v5
-          with:
-            python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }}
-
-        - shell: bash
-          run: |
-            echo "@echo off
-            call update_comfyui.bat nopause
-            echo -
-            echo This will try to update pytorch and all python dependencies.
-            echo -
-            echo If you just want to update normally, close this and run update_comfyui.bat instead.
-            echo -
-            pause
-            ..\python_embeded\python.exe -s -m pip install --upgrade ${{ inputs.torch_dependencies }} -r ../ComfyUI/requirements.txt pygit2
-            pause" > update_comfyui_and_python_dependencies.bat
-
-            grep -v comfyui requirements.txt > requirements_nocomfyui.txt
-            python -m pip wheel --no-cache-dir ${{ inputs.torch_dependencies }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir
-            python -m pip install --no-cache-dir ./temp_wheel_dir/*
-            echo installed basic
-            ls -lah temp_wheel_dir
-            mv temp_wheel_dir ${{ inputs.cache_tag }}_python_deps
-            tar cf ${{ inputs.cache_tag }}_python_deps.tar ${{ inputs.cache_tag }}_python_deps
-
-        - uses: actions/cache/save@v4
-          with:
-            path: |
-              ${{ inputs.cache_tag }}_python_deps.tar
-              update_comfyui_and_python_dependencies.bat
-            key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }}
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@ -68,7 +68,7 @@ jobs:

            mkdir update
            cp -r ComfyUI/.ci/update_windows/* ./update/
-            cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./
+            cp -r ComfyUI/.ci/windows_base_files/* ./
            cp -r ComfyUI/.ci/windows_nightly_base_files/* ./

            echo "call update_comfyui.bat nopause
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@ -81,7 +81,7 @@ jobs:

            mkdir update
            cp -r ComfyUI/.ci/update_windows/* ./update/
-            cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./
+            cp -r ComfyUI/.ci/windows_base_files/* ./
            cp ../update_comfyui_and_python_dependencies.bat ./update/

            cd ..
--- a/25
+++ b/25
@ -1,2 +1,25 @@
 # Admins
-* @comfyanonymous @kosinkadink @guill
+* @comfyanonymous
+
+# Note: Github teams syntax cannot be used here as the repo is not owned by Comfy-Org.
+# Inlined the team members for now.
+
+# Maintainers
+*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+
+# Python web server
+/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
+/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
+/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
+
+# Node developers
+/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
+/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
+/comfy_api_nodes/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
--- a/QUANTIZATION.md
+++ b/QUANTIZATION.md
@ -1,168 +0,0 @@
-# The Comfy guide to Quantization
-
-
-## How does quantization work?
-
-Quantization aims to map a high-precision value x_f to a lower-precision format with minimal loss in accuracy. These smaller formats then serve to reduce the model's memory footprint and increase throughput by using specialized hardware.
-
-When simply converting a value from FP16 to FP8 using the round-nearest method we might hit two issues:
- The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values
- The original values are concentrated in a small range (e.g. -1,1) leaving many FP8-bits "unused"
-
-By using a scaling factor, we aim to map these values into the quantized-dtype range, making use of the full spectrum. One of the easiest and most common approaches is per-tensor absolute-maximum scaling.
-
-```
-absmax = max(abs(tensor))
-scale = absmax / max_dynamic_range_low_precision
-
-# Quantization
-tensor_q = (tensor / scale).to(low_precision_dtype)
-
-# De-Quantization
-tensor_dq = tensor_q.to(fp16) * scale
-
-tensor_dq ~ tensor
-```
-
-Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes.
-
-
-## Quantization in Comfy
-
-```
-QuantizedTensor (torch.Tensor subclass)
-  ↓ __torch_dispatch__
-Two-Level Registry (generic + layout handlers)
-  ↓
-MixedPrecisionOps + Metadata Detection
-```
-
-### Representation
-
-To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor, the `QuantizedTensor` class found in `comfy/quant_ops.py`.
-
-A `Layout` class defines how a specific quantization format behaves:
- Required parameters
- Quantize method
- De-Quantize method
-
-```python
-from comfy.quant_ops import QuantizedLayout
-
-class MyLayout(QuantizedLayout):
-    @classmethod
-    def quantize(cls, tensor, **kwargs):
-        # Convert to quantized format
-        qdata = ...
-        params = {'scale': ..., 'orig_dtype': tensor.dtype}
-        return qdata, params
-    
-    @staticmethod
-    def dequantize(qdata, scale, orig_dtype, **kwargs):
-        return qdata.to(orig_dtype) * scale
-```
-
-To then run operations using these QuantizedTensors we use two registry systems to define supported operations. 
-The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`).
-
-The second registry is layout-specific and allows implementing fast paths for operations like nn.Linear.
-```python
-from comfy.quant_ops import register_layout_op
-
-@register_layout_op(torch.ops.aten.linear.default, MyLayout)
-def my_linear(func, args, kwargs):
-    # Extract tensors, call optimized kernel
-    ...
-```
-When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation.
-For any unsupported operation, QuantizedTensor will fall back to calling `dequantize` and dispatching to the high-precision implementation.
-
-
-### Mixed Precision
-
-The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how.
-
-**Architecture:**
-
-```python
-class MixedPrecisionOps(disable_weight_init):
-    _layer_quant_config = {}  # Maps layer names to quantization configs
-    _compute_dtype = torch.bfloat16  # Default compute / dequantize precision
-```
-
-**Key mechanism:**
-
-The custom `Linear._load_from_state_dict()` method inspects each layer during model loading:
- If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype`
- If the layer name **is** in `_layer_quant_config`: 
-  - Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`)
-  - Load associated quantization parameters (scales, block_size, etc.)
-
-**Why it's needed:**
-
-Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality.
-
-The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode.
-
-
-## Checkpoint Format
-
-Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme.
-
-The quantized checkpoint will contain the same layers as the original checkpoint but:
- The weights are stored as quantized values, sometimes using a different storage datatype. E.g. uint8 container for fp8.
- For each quantized weight a number of additional scaling parameters are stored alongside depending on the recipe.
- We store a metadata.json in the metadata of the final safetensor containing the `_quantization_metadata` describing which layers are quantized and what layout has been used.
-
-### Scaling Parameters details
-We define 4 possible scaling parameters that should cover most recipes in the near-future:
- **weight_scale**: quantization scalers for the weights
- **weight_scale_2**: global scalers in the context of double scaling
- **pre_quant_scale**: scalers used for smoothing salient weights
- **input_scale**: quantization scalers for the activations
-
-| Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale |
-|--------|---------------|--------------|----------------|-----------------|-------------|
-| float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) |
-
-You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS).
-
-### Quantization Metadata
-
-The metadata stored alongside the checkpoint contains:
- **format_version**: String to define a version of the standard
- **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`. 
-
-Example:
-```json
-{
-  "_quantization_metadata": {
-    "format_version": "1.0",
-    "layers": {
-      "model.layers.0.mlp.up_proj": "float8_e4m3fn",
-      "model.layers.0.mlp.down_proj": "float8_e4m3fn",
-      "model.layers.1.mlp.up_proj": "float8_e4m3fn"
-    }
-  }
-}
-```
-
-
-## Creating Quantized Checkpoints
-
-To create compatible checkpoints, use any quantization tool provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`.
-
-### Weight Quantization
-
-Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter.
-
-### Calibration (for Activation Quantization)
-
-Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we use **post-training quantization (PTQ) calibration**:
-
-1. **Collect statistics**: Run inference on N representative samples
-2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer
-3. **Compute scales**: Derive `input_scale` from collected statistics
-4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
-
-The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
--- a/README.md
+++ b/README.md
@ -66,9 +66,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
   - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
-   - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
-   - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
-   - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
 - Image Editing Models
   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@ -81,7 +78,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
   - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
-   - [Hunyuan Video 1.5](https://docs.comfy.org/tutorials/video/hunyuan/hunyuan-video-1-5)
 - Audio Models
   - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
   - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
@ -115,14 +111,10 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git

 ## Release Process

-ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
+ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:

 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new stable version (e.g., v0.7.0) roughly every week.
-   - Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
-   - Minor versions will be used for releases off the master branch.
-   - Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
-   - Commits outside of the stable release tags may be very unstable and break many custom nodes.
+   - Releases a new stable version (e.g., v0.7.0)
   - Serves as the foundation for the desktop release

 2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
@ -179,20 +171,10 @@ There is a portable standalone build for Windows that should work for running on

 ### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)

-Simply download, extract with [7-Zip](https://7-zip.org) or with the windows explorer on recent windows versions and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints but many of the larger models have multiple files. Make sure to follow the instructions to know which subfolder to put them in ComfyUI\models\
+Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints

 If you have trouble extracting it, right click the file -> properties -> unblock

-Update your Nvidia drivers if it doesn't start.
-
-#### Alternative Downloads:
-
-[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
-
-[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z).
-
-[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
-
 #### How do I share models between another UI and ComfyUI?

 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
@ -208,13 +190,7 @@ comfy install

 ## Manual Install (Windows, Linux)

-Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies.
-
-Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12
-
-torch 2.4 and above is supported but some features might only work on newer versions. We generally recommend using the latest major version of pytorch unless it is less than 2 weeks old.
-
-### Instructions:
+Python 3.13 is very well supported. If you have trouble with some custom node dependencies, you can try Python 3.12 instead.

 Git clone this repo.

@ -223,36 +199,18 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
 Put your VAE in: models/vae


-### AMD GPUs (Linux)
-
+### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

 ```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4```

-This is the command to install the nightly with ROCm 7.0 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.4 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1```
-
-
-### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only.
-
-These have less hardware support than the builds above but they work on windows. You also need to install the pytorch version specific to your hardware.
-
-RDNA 3 (RX 7000 series):
-
-```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx110X-dgpu/```
-
-RDNA 3.5 (Strix halo/Ryzen AI Max+ 365):
-
-```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx1151/```
-
-RDNA 4 (RX 9000 series):
-
-```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all/```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4```

 ### Intel GPUs (Windows and Linux)

-Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
+(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)

 1. To install PyTorch xpu, use the following command:

@ -262,15 +220,19 @@ This is the command to install the Pytorch xpu nightly which might have some per

 ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```

+(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.
+
+1. Visit the [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) page for more information.
+
 ### NVIDIA

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129```

 This is the command to install pytorch nightly instead which might have performance improvements.

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129```

 #### Troubleshooting

@ -301,6 +263,12 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

 > **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux).

+#### DirectML (AMD Cards on Windows)
+
+This is very poorly supported and is not recommended. Some unofficial builds of PyTorch ROCm for Windows exist and will give you a much better experience than DirectML. This README will be updated once official PyTorch ROCm builds for Windows are released.
+
+```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
+
 #### Ascend NPUs

 For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
@ -325,32 +293,6 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step
 1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
 2. Launch ComfyUI by running `python main.py`

-
-## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
-
-**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
-
-### Setup
-
-1. Install the manager dependencies:
-   ```bash
-   pip install -r manager_requirements.txt
-   ```
-
-2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
-   ```bash
-   python main.py --enable-manager
-   ```
-
-### Command Line Options
-
-| Flag | Description |
-|------|-------------|
-| `--enable-manager` | Enable ComfyUI-Manager |
-| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
-| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
-
-
 # Running

 ```python main.py```
--- a/alembic_db/versions/0001_assets.py
+++ b/alembic_db/versions/0001_assets.py
@ -1,174 +0,0 @@
-"""
-Initial assets schema
-Revision ID: 0001_assets
-Revises: None
-Create Date: 2025-12-10 00:00:00
-"""
-
-from alembic import op
-import sqlalchemy as sa
-
-revision = "0001_assets"
-down_revision = None
-branch_labels = None
-depends_on = None
-
-
-def upgrade() -> None:
-    # ASSETS: content identity
-    op.create_table(
-        "assets",
-        sa.Column("id", sa.String(length=36), primary_key=True),
-        sa.Column("hash", sa.String(length=256), nullable=True),
-        sa.Column("size_bytes", sa.BigInteger(), nullable=False, server_default="0"),
-        sa.Column("mime_type", sa.String(length=255), nullable=True),
-        sa.Column("created_at", sa.DateTime(timezone=False), nullable=False),
-        sa.CheckConstraint("size_bytes >= 0", name="ck_assets_size_nonneg"),
-    )
-    op.create_index("uq_assets_hash", "assets", ["hash"], unique=True)
-    op.create_index("ix_assets_mime_type", "assets", ["mime_type"])
-
-    # ASSETS_INFO: user-visible references
-    op.create_table(
-        "assets_info",
-        sa.Column("id", sa.String(length=36), primary_key=True),
-        sa.Column("owner_id", sa.String(length=128), nullable=False, server_default=""),
-        sa.Column("name", sa.String(length=512), nullable=False),
-        sa.Column("asset_id", sa.String(length=36), sa.ForeignKey("assets.id", ondelete="RESTRICT"), nullable=False),
-        sa.Column("preview_id", sa.String(length=36), sa.ForeignKey("assets.id", ondelete="SET NULL"), nullable=True),
-        sa.Column("user_metadata", sa.JSON(), nullable=True),
-        sa.Column("created_at", sa.DateTime(timezone=False), nullable=False),
-        sa.Column("updated_at", sa.DateTime(timezone=False), nullable=False),
-        sa.Column("last_access_time", sa.DateTime(timezone=False), nullable=False),
-        sa.UniqueConstraint("asset_id", "owner_id", "name", name="uq_assets_info_asset_owner_name"),
-    )
-    op.create_index("ix_assets_info_owner_id", "assets_info", ["owner_id"])
-    op.create_index("ix_assets_info_asset_id", "assets_info", ["asset_id"])
-    op.create_index("ix_assets_info_name", "assets_info", ["name"])
-    op.create_index("ix_assets_info_created_at", "assets_info", ["created_at"])
-    op.create_index("ix_assets_info_last_access_time", "assets_info", ["last_access_time"])
-    op.create_index("ix_assets_info_owner_name", "assets_info", ["owner_id", "name"])
-
-    # TAGS: normalized tag vocabulary
-    op.create_table(
-        "tags",
-        sa.Column("name", sa.String(length=512), primary_key=True),
-        sa.Column("tag_type", sa.String(length=32), nullable=False, server_default="user"),
-        sa.CheckConstraint("name = lower(name)", name="ck_tags_lowercase"),
-    )
-    op.create_index("ix_tags_tag_type", "tags", ["tag_type"])
-
-    # ASSET_INFO_TAGS: many-to-many for tags on AssetInfo
-    op.create_table(
-        "asset_info_tags",
-        sa.Column("asset_info_id", sa.String(length=36), sa.ForeignKey("assets_info.id", ondelete="CASCADE"), nullable=False),
-        sa.Column("tag_name", sa.String(length=512), sa.ForeignKey("tags.name", ondelete="RESTRICT"), nullable=False),
-        sa.Column("origin", sa.String(length=32), nullable=False, server_default="manual"),
-        sa.Column("added_at", sa.DateTime(timezone=False), nullable=False),
-        sa.PrimaryKeyConstraint("asset_info_id", "tag_name", name="pk_asset_info_tags"),
-    )
-    op.create_index("ix_asset_info_tags_tag_name", "asset_info_tags", ["tag_name"])
-    op.create_index("ix_asset_info_tags_asset_info_id", "asset_info_tags", ["asset_info_id"])
-
-    # ASSET_CACHE_STATE: N:1 local cache rows per Asset
-    op.create_table(
-        "asset_cache_state",
-        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-        sa.Column("asset_id", sa.String(length=36), sa.ForeignKey("assets.id", ondelete="CASCADE"), nullable=False),
-        sa.Column("file_path", sa.Text(), nullable=False),  # absolute local path to cached file
-        sa.Column("mtime_ns", sa.BigInteger(), nullable=True),
-        sa.Column("needs_verify", sa.Boolean(), nullable=False, server_default=sa.text("false")),
-        sa.CheckConstraint("(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_acs_mtime_nonneg"),
-        sa.UniqueConstraint("file_path", name="uq_asset_cache_state_file_path"),
-    )
-    op.create_index("ix_asset_cache_state_file_path", "asset_cache_state", ["file_path"])
-    op.create_index("ix_asset_cache_state_asset_id", "asset_cache_state", ["asset_id"])
-
-    # ASSET_INFO_META: typed KV projection of user_metadata for filtering/sorting
-    op.create_table(
-        "asset_info_meta",
-        sa.Column("asset_info_id", sa.String(length=36), sa.ForeignKey("assets_info.id", ondelete="CASCADE"), nullable=False),
-        sa.Column("key", sa.String(length=256), nullable=False),
-        sa.Column("ordinal", sa.Integer(), nullable=False, server_default="0"),
-        sa.Column("val_str", sa.String(length=2048), nullable=True),
-        sa.Column("val_num", sa.Numeric(38, 10), nullable=True),
-        sa.Column("val_bool", sa.Boolean(), nullable=True),
-        sa.Column("val_json", sa.JSON(), nullable=True),
-        sa.PrimaryKeyConstraint("asset_info_id", "key", "ordinal", name="pk_asset_info_meta"),
-    )
-    op.create_index("ix_asset_info_meta_key", "asset_info_meta", ["key"])
-    op.create_index("ix_asset_info_meta_key_val_str", "asset_info_meta", ["key", "val_str"])
-    op.create_index("ix_asset_info_meta_key_val_num", "asset_info_meta", ["key", "val_num"])
-    op.create_index("ix_asset_info_meta_key_val_bool", "asset_info_meta", ["key", "val_bool"])
-
-    # Tags vocabulary
-    tags_table = sa.table(
-        "tags",
-        sa.column("name", sa.String(length=512)),
-        sa.column("tag_type", sa.String()),
-    )
-    op.bulk_insert(
-        tags_table,
-        [
-            {"name": "models", "tag_type": "system"},
-            {"name": "input", "tag_type": "system"},
-            {"name": "output", "tag_type": "system"},
-
-            {"name": "configs", "tag_type": "system"},
-            {"name": "checkpoints", "tag_type": "system"},
-            {"name": "loras", "tag_type": "system"},
-            {"name": "vae", "tag_type": "system"},
-            {"name": "text_encoders", "tag_type": "system"},
-            {"name": "diffusion_models", "tag_type": "system"},
-            {"name": "clip_vision", "tag_type": "system"},
-            {"name": "style_models", "tag_type": "system"},
-            {"name": "embeddings", "tag_type": "system"},
-            {"name": "diffusers", "tag_type": "system"},
-            {"name": "vae_approx", "tag_type": "system"},
-            {"name": "controlnet", "tag_type": "system"},
-            {"name": "gligen", "tag_type": "system"},
-            {"name": "upscale_models", "tag_type": "system"},
-            {"name": "hypernetworks", "tag_type": "system"},
-            {"name": "photomaker", "tag_type": "system"},
-            {"name": "classifiers", "tag_type": "system"},
-
-            {"name": "encoder", "tag_type": "system"},
-            {"name": "decoder", "tag_type": "system"},
-
-            {"name": "missing", "tag_type": "system"},
-            {"name": "rescan", "tag_type": "system"},
-        ],
-    )
-
-
-def downgrade() -> None:
-    op.drop_index("ix_asset_info_meta_key_val_bool", table_name="asset_info_meta")
-    op.drop_index("ix_asset_info_meta_key_val_num", table_name="asset_info_meta")
-    op.drop_index("ix_asset_info_meta_key_val_str", table_name="asset_info_meta")
-    op.drop_index("ix_asset_info_meta_key", table_name="asset_info_meta")
-    op.drop_table("asset_info_meta")
-
-    op.drop_index("ix_asset_cache_state_asset_id", table_name="asset_cache_state")
-    op.drop_index("ix_asset_cache_state_file_path", table_name="asset_cache_state")
-    op.drop_constraint("uq_asset_cache_state_file_path", table_name="asset_cache_state")
-    op.drop_table("asset_cache_state")
-
-    op.drop_index("ix_asset_info_tags_asset_info_id", table_name="asset_info_tags")
-    op.drop_index("ix_asset_info_tags_tag_name", table_name="asset_info_tags")
-    op.drop_table("asset_info_tags")
-
-    op.drop_index("ix_tags_tag_type", table_name="tags")
-    op.drop_table("tags")
-
-    op.drop_constraint("uq_assets_info_asset_owner_name", table_name="assets_info")
-    op.drop_index("ix_assets_info_owner_name", table_name="assets_info")
-    op.drop_index("ix_assets_info_last_access_time", table_name="assets_info")
-    op.drop_index("ix_assets_info_created_at", table_name="assets_info")
-    op.drop_index("ix_assets_info_name", table_name="assets_info")
-    op.drop_index("ix_assets_info_asset_id", table_name="assets_info")
-    op.drop_index("ix_assets_info_owner_id", table_name="assets_info")
-    op.drop_table("assets_info")
-
-    op.drop_index("uq_assets_hash", table_name="assets")
-    op.drop_index("ix_assets_mime_type", table_name="assets")
-    op.drop_table("assets")
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@ -58,13 +58,8 @@ class InternalRoutes:
                return web.json_response({"error": "Invalid directory type"}, status=400)

            directory = get_directory_by_type(directory_type)
-
-            def is_visible_file(entry: os.DirEntry) -> bool:
-                """Filter out hidden files (e.g., .DS_Store on macOS)."""
-                return entry.is_file() and not entry.name.startswith('.')
-
            sorted_files = sorted(
-                (entry for entry in os.scandir(directory) if is_visible_file(entry)),
+                (entry for entry in os.scandir(directory) if entry.is_file()),
                key=lambda entry: -entry.stat().st_mtime
            )
            return web.json_response([entry.name for entry in sorted_files], status=200)
--- a/app/assets/api/routes.py
+++ b/app/assets/api/routes.py
@ -1,102 +0,0 @@
-import logging
-import uuid
-from aiohttp import web
-
-from pydantic import ValidationError
-
-import app.assets.manager as manager
-from app import user_manager
-from app.assets.api import schemas_in
-from app.assets.helpers import get_query_dict
-
-ROUTES = web.RouteTableDef()
-USER_MANAGER: user_manager.UserManager | None = None
-
-# UUID regex (canonical hyphenated form, case-insensitive)
-UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
-
-def register_assets_system(app: web.Application, user_manager_instance: user_manager.UserManager) -> None:
-    global USER_MANAGER
-    USER_MANAGER = user_manager_instance
-    app.add_routes(ROUTES)
-
-def _error_response(status: int, code: str, message: str, details: dict | None = None) -> web.Response:
-    return web.json_response({"error": {"code": code, "message": message, "details": details or {}}}, status=status)
-
-
-def _validation_error_response(code: str, ve: ValidationError) -> web.Response:
-    return _error_response(400, code, "Validation failed.", {"errors": ve.json()})
-
-
-@ROUTES.get("/api/assets")
-async def list_assets(request: web.Request) -> web.Response:
-    """
-    GET request to list assets.
-    """
-    query_dict = get_query_dict(request)
-    try:
-        q = schemas_in.ListAssetsQuery.model_validate(query_dict)
-    except ValidationError as ve:
-        return _validation_error_response("INVALID_QUERY", ve)
-
-    payload = manager.list_assets(
-        include_tags=q.include_tags,
-        exclude_tags=q.exclude_tags,
-        name_contains=q.name_contains,
-        metadata_filter=q.metadata_filter,
-        limit=q.limit,
-        offset=q.offset,
-        sort=q.sort,
-        order=q.order,
-        owner_id=USER_MANAGER.get_request_user_id(request),
-    )
-    return web.json_response(payload.model_dump(mode="json"))
-
-
-@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}")
-async def get_asset(request: web.Request) -> web.Response:
-    """
-    GET request to get an asset's info as JSON.
-    """
-    asset_info_id = str(uuid.UUID(request.match_info["id"]))
-    try:
-        result = manager.get_asset(
-            asset_info_id=asset_info_id,
-            owner_id=USER_MANAGER.get_request_user_id(request),
-        )
-    except ValueError as e:
-        return _error_response(404, "ASSET_NOT_FOUND", str(e), {"id": asset_info_id})
-    except Exception:
-        logging.exception(
-            "get_asset failed for asset_info_id=%s, owner_id=%s",
-            asset_info_id,
-            USER_MANAGER.get_request_user_id(request),
-        )
-        return _error_response(500, "INTERNAL", "Unexpected server error.")
-    return web.json_response(result.model_dump(mode="json"), status=200)
-
-
-@ROUTES.get("/api/tags")
-async def get_tags(request: web.Request) -> web.Response:
-    """
-    GET request to list all tags based on query parameters.
-    """
-    query_map = dict(request.rel_url.query)
-
-    try:
-        query = schemas_in.TagsListQuery.model_validate(query_map)
-    except ValidationError as e:
-        return web.json_response(
-            {"error": {"code": "INVALID_QUERY", "message": "Invalid query parameters", "details": e.errors()}},
-            status=400,
-        )
-
-    result = manager.list_tags(
-        prefix=query.prefix,
-        limit=query.limit,
-        offset=query.offset,
-        order=query.order,
-        include_zero=query.include_zero,
-        owner_id=USER_MANAGER.get_request_user_id(request),
-    )
-    return web.json_response(result.model_dump(mode="json"))
--- a/app/assets/api/schemas_in.py
+++ b/app/assets/api/schemas_in.py
@ -1,94 +0,0 @@
-import json
-import uuid
-from typing import Any, Literal
-
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    conint,
-    field_validator,
-)
-
-
-class ListAssetsQuery(BaseModel):
-    include_tags: list[str] = Field(default_factory=list)
-    exclude_tags: list[str] = Field(default_factory=list)
-    name_contains: str | None = None
-
-    # Accept either a JSON string (query param) or a dict
-    metadata_filter: dict[str, Any] | None = None
-
-    limit: conint(ge=1, le=500) = 20
-    offset: conint(ge=0) = 0
-
-    sort: Literal["name", "created_at", "updated_at", "size", "last_access_time"] = "created_at"
-    order: Literal["asc", "desc"] = "desc"
-
-    @field_validator("include_tags", "exclude_tags", mode="before")
-    @classmethod
-    def _split_csv_tags(cls, v):
-        # Accept "a,b,c" or ["a","b"] (we are liberal in what we accept)
-        if v is None:
-            return []
-        if isinstance(v, str):
-            return [t.strip() for t in v.split(",") if t.strip()]
-        if isinstance(v, list):
-            out: list[str] = []
-            for item in v:
-                if isinstance(item, str):
-                    out.extend([t.strip() for t in item.split(",") if t.strip()])
-            return out
-        return v
-
-    @field_validator("metadata_filter", mode="before")
-    @classmethod
-    def _parse_metadata_json(cls, v):
-        if v is None or isinstance(v, dict):
-            return v
-        if isinstance(v, str) and v.strip():
-            try:
-                parsed = json.loads(v)
-            except Exception as e:
-                raise ValueError(f"metadata_filter must be JSON: {e}") from e
-            if not isinstance(parsed, dict):
-                raise ValueError("metadata_filter must be a JSON object")
-            return parsed
-        return None
-
-
-class TagsListQuery(BaseModel):
-    model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)
-
-    prefix: str | None = Field(None, min_length=1, max_length=256)
-    limit: int = Field(100, ge=1, le=1000)
-    offset: int = Field(0, ge=0, le=10_000_000)
-    order: Literal["count_desc", "name_asc"] = "count_desc"
-    include_zero: bool = True
-
-    @field_validator("prefix")
-    @classmethod
-    def normalize_prefix(cls, v: str | None) -> str | None:
-        if v is None:
-            return v
-        v = v.strip()
-        return v.lower() or None
-
-
-class SetPreviewBody(BaseModel):
-    """Set or clear the preview for an AssetInfo. Provide an Asset.id or null."""
-    preview_id: str | None = None
-
-    @field_validator("preview_id", mode="before")
-    @classmethod
-    def _norm_uuid(cls, v):
-        if v is None:
-            return None
-        s = str(v).strip()
-        if not s:
-            return None
-        try:
-            uuid.UUID(s)
-        except Exception:
-            raise ValueError("preview_id must be a UUID")
-        return s
--- a/app/assets/api/schemas_out.py
+++ b/app/assets/api/schemas_out.py
@ -1,60 +0,0 @@
-from datetime import datetime
-from typing import Any
-
-from pydantic import BaseModel, ConfigDict, Field, field_serializer
-
-
-class AssetSummary(BaseModel):
-    id: str
-    name: str
-    asset_hash: str | None = None
-    size: int | None = None
-    mime_type: str | None = None
-    tags: list[str] = Field(default_factory=list)
-    preview_url: str | None = None
-    created_at: datetime | None = None
-    updated_at: datetime | None = None
-    last_access_time: datetime | None = None
-
-    model_config = ConfigDict(from_attributes=True)
-
-    @field_serializer("created_at", "updated_at", "last_access_time")
-    def _ser_dt(self, v: datetime | None, _info):
-        return v.isoformat() if v else None
-
-
-class AssetsList(BaseModel):
-    assets: list[AssetSummary]
-    total: int
-    has_more: bool
-
-
-class AssetDetail(BaseModel):
-    id: str
-    name: str
-    asset_hash: str | None = None
-    size: int | None = None
-    mime_type: str | None = None
-    tags: list[str] = Field(default_factory=list)
-    user_metadata: dict[str, Any] = Field(default_factory=dict)
-    preview_id: str | None = None
-    created_at: datetime | None = None
-    last_access_time: datetime | None = None
-
-    model_config = ConfigDict(from_attributes=True)
-
-    @field_serializer("created_at", "last_access_time")
-    def _ser_dt(self, v: datetime | None, _info):
-        return v.isoformat() if v else None
-
-
-class TagUsage(BaseModel):
-    name: str
-    count: int
-    type: str
-
-
-class TagsList(BaseModel):
-    tags: list[TagUsage] = Field(default_factory=list)
-    total: int
-    has_more: bool
--- a/app/assets/database/bulk_ops.py
+++ b/app/assets/database/bulk_ops.py
@ -1,188 +0,0 @@
-import os
-import uuid
-import sqlalchemy
-from typing import Iterable
-from sqlalchemy.orm import Session
-from sqlalchemy.dialects import sqlite
-
-from app.assets.helpers import utcnow
-from app.assets.database.models import Asset, AssetCacheState, AssetInfo, AssetInfoTag, AssetInfoMeta
-
-MAX_BIND_PARAMS = 800
-
-def _chunk_rows(rows: list[dict], cols_per_row: int, max_bind_params: int) -> Iterable[list[dict]]:
-    if not rows:
-        return []
-    rows_per_stmt = max(1, max_bind_params // max(1, cols_per_row))
-    for i in range(0, len(rows), rows_per_stmt):
-        yield rows[i:i + rows_per_stmt]
-
-def _iter_chunks(seq, n: int):
-    for i in range(0, len(seq), n):
-        yield seq[i:i + n]
-
-def _rows_per_stmt(cols: int) -> int:
-    return max(1, MAX_BIND_PARAMS // max(1, cols))
-
-
-def seed_from_paths_batch(
-    session: Session,
-    *,
-    specs: list[dict],
-    owner_id: str = "",
-) -> dict:
-    """Each spec is a dict with keys:
-      - abs_path: str
-      - size_bytes: int
-      - mtime_ns: int
-      - info_name: str
-      - tags: list[str]
-      - fname: Optional[str]
-    """
-    if not specs:
-        return {"inserted_infos": 0, "won_states": 0, "lost_states": 0}
-
-    now = utcnow()
-    asset_rows: list[dict] = []
-    state_rows: list[dict] = []
-    path_to_asset: dict[str, str] = {}
-    asset_to_info: dict[str, dict] = {}  # asset_id -> prepared info row
-    path_list: list[str] = []
-
-    for sp in specs:
-        ap = os.path.abspath(sp["abs_path"])
-        aid = str(uuid.uuid4())
-        iid = str(uuid.uuid4())
-        path_list.append(ap)
-        path_to_asset[ap] = aid
-
-        asset_rows.append(
-            {
-                "id": aid,
-                "hash": None,
-                "size_bytes": sp["size_bytes"],
-                "mime_type": None,
-                "created_at": now,
-            }
-        )
-        state_rows.append(
-            {
-                "asset_id": aid,
-                "file_path": ap,
-                "mtime_ns": sp["mtime_ns"],
-            }
-        )
-        asset_to_info[aid] = {
-            "id": iid,
-            "owner_id": owner_id,
-            "name": sp["info_name"],
-            "asset_id": aid,
-            "preview_id": None,
-            "user_metadata": {"filename": sp["fname"]} if sp["fname"] else None,
-            "created_at": now,
-            "updated_at": now,
-            "last_access_time": now,
-            "_tags": sp["tags"],
-            "_filename": sp["fname"],
-        }
-
-    # insert all seed Assets (hash=NULL)
-    ins_asset = sqlite.insert(Asset)
-    for chunk in _iter_chunks(asset_rows, _rows_per_stmt(5)):
-        session.execute(ins_asset, chunk)
-
-    # try to claim AssetCacheState (file_path)
-    winners_by_path: set[str] = set()
-    ins_state = (
-        sqlite.insert(AssetCacheState)
-        .on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
-        .returning(AssetCacheState.file_path)
-    )
-    for chunk in _iter_chunks(state_rows, _rows_per_stmt(3)):
-        winners_by_path.update((session.execute(ins_state, chunk)).scalars().all())
-
-    all_paths_set = set(path_list)
-    losers_by_path = all_paths_set - winners_by_path
-    lost_assets = [path_to_asset[p] for p in losers_by_path]
-    if lost_assets:  # losers get their Asset removed
-        for id_chunk in _iter_chunks(lost_assets, MAX_BIND_PARAMS):
-            session.execute(sqlalchemy.delete(Asset).where(Asset.id.in_(id_chunk)))
-
-    if not winners_by_path:
-        return {"inserted_infos": 0, "won_states": 0, "lost_states": len(losers_by_path)}
-
-    # insert AssetInfo only for winners
-    winner_info_rows = [asset_to_info[path_to_asset[p]] for p in winners_by_path]
-    ins_info = (
-        sqlite.insert(AssetInfo)
-        .on_conflict_do_nothing(index_elements=[AssetInfo.asset_id, AssetInfo.owner_id, AssetInfo.name])
-        .returning(AssetInfo.id)
-    )
-
-    inserted_info_ids: set[str] = set()
-    for chunk in _iter_chunks(winner_info_rows, _rows_per_stmt(9)):
-        inserted_info_ids.update((session.execute(ins_info, chunk)).scalars().all())
-
-    # build and insert tag + meta rows for the AssetInfo
-    tag_rows: list[dict] = []
-    meta_rows: list[dict] = []
-    if inserted_info_ids:
-        for row in winner_info_rows:
-            iid = row["id"]
-            if iid not in inserted_info_ids:
-                continue
-            for t in row["_tags"]:
-                tag_rows.append({
-                    "asset_info_id": iid,
-                    "tag_name": t,
-                    "origin": "automatic",
-                    "added_at": now,
-                })
-            if row["_filename"]:
-                meta_rows.append(
-                    {
-                        "asset_info_id": iid,
-                        "key": "filename",
-                        "ordinal": 0,
-                        "val_str": row["_filename"],
-                        "val_num": None,
-                        "val_bool": None,
-                        "val_json": None,
-                    }
-                )
-
-    bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=meta_rows, max_bind_params=MAX_BIND_PARAMS)
-    return {
-        "inserted_infos": len(inserted_info_ids),
-        "won_states": len(winners_by_path),
-        "lost_states": len(losers_by_path),
-    }
-
-
-def bulk_insert_tags_and_meta(
-    session: Session,
-    *,
-    tag_rows: list[dict],
-    meta_rows: list[dict],
-    max_bind_params: int,
-) -> None:
-    """Batch insert into asset_info_tags and asset_info_meta with ON CONFLICT DO NOTHING.
-    - tag_rows keys: asset_info_id, tag_name, origin, added_at
-    - meta_rows keys: asset_info_id, key, ordinal, val_str, val_num, val_bool, val_json
-    """
-    if tag_rows:
-        ins_links = (
-            sqlite.insert(AssetInfoTag)
-            .on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
-        )
-        for chunk in _chunk_rows(tag_rows, cols_per_row=4, max_bind_params=max_bind_params):
-            session.execute(ins_links, chunk)
-    if meta_rows:
-        ins_meta = (
-            sqlite.insert(AssetInfoMeta)
-            .on_conflict_do_nothing(
-                index_elements=[AssetInfoMeta.asset_info_id, AssetInfoMeta.key, AssetInfoMeta.ordinal]
-            )
-        )
-        for chunk in _chunk_rows(meta_rows, cols_per_row=7, max_bind_params=max_bind_params):
-            session.execute(ins_meta, chunk)
--- a/app/assets/database/models.py
+++ b/app/assets/database/models.py
@ -1,233 +0,0 @@
-from __future__ import annotations
-
-import uuid
-from datetime import datetime
-
-from typing import Any
-from sqlalchemy import (
-    JSON,
-    BigInteger,
-    Boolean,
-    CheckConstraint,
-    DateTime,
-    ForeignKey,
-    Index,
-    Integer,
-    Numeric,
-    String,
-    Text,
-    UniqueConstraint,
-)
-from sqlalchemy.orm import Mapped, foreign, mapped_column, relationship
-
-from app.assets.helpers import utcnow
-from app.database.models import to_dict, Base
-
-
-class Asset(Base):
-    __tablename__ = "assets"
-
-    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
-    hash: Mapped[str | None] = mapped_column(String(256), nullable=True)
-    size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
-    mime_type: Mapped[str | None] = mapped_column(String(255))
-    created_at: Mapped[datetime] = mapped_column(
-        DateTime(timezone=False), nullable=False, default=utcnow
-    )
-
-    infos: Mapped[list[AssetInfo]] = relationship(
-        "AssetInfo",
-        back_populates="asset",
-        primaryjoin=lambda: Asset.id == foreign(AssetInfo.asset_id),
-        foreign_keys=lambda: [AssetInfo.asset_id],
-        cascade="all,delete-orphan",
-        passive_deletes=True,
-    )
-
-    preview_of: Mapped[list[AssetInfo]] = relationship(
-        "AssetInfo",
-        back_populates="preview_asset",
-        primaryjoin=lambda: Asset.id == foreign(AssetInfo.preview_id),
-        foreign_keys=lambda: [AssetInfo.preview_id],
-        viewonly=True,
-    )
-
-    cache_states: Mapped[list[AssetCacheState]] = relationship(
-        back_populates="asset",
-        cascade="all, delete-orphan",
-        passive_deletes=True,
-    )
-
-    __table_args__ = (
-        Index("uq_assets_hash", "hash", unique=True),
-        Index("ix_assets_mime_type", "mime_type"),
-        CheckConstraint("size_bytes >= 0", name="ck_assets_size_nonneg"),
-    )
-
-    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        return to_dict(self, include_none=include_none)
-
-    def __repr__(self) -> str:
-        return f"<Asset id={self.id} hash={(self.hash or '')[:12]}>"
-
-
-class AssetCacheState(Base):
-    __tablename__ = "asset_cache_state"
-
-    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
-    asset_id: Mapped[str] = mapped_column(String(36), ForeignKey("assets.id", ondelete="CASCADE"), nullable=False)
-    file_path: Mapped[str] = mapped_column(Text, nullable=False)
-    mtime_ns: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
-    needs_verify: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
-
-    asset: Mapped[Asset] = relationship(back_populates="cache_states")
-
-    __table_args__ = (
-        Index("ix_asset_cache_state_file_path", "file_path"),
-        Index("ix_asset_cache_state_asset_id", "asset_id"),
-        CheckConstraint("(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_acs_mtime_nonneg"),
-        UniqueConstraint("file_path", name="uq_asset_cache_state_file_path"),
-    )
-
-    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        return to_dict(self, include_none=include_none)
-
-    def __repr__(self) -> str:
-        return f"<AssetCacheState id={self.id} asset_id={self.asset_id} path={self.file_path!r}>"
-
-
-class AssetInfo(Base):
-    __tablename__ = "assets_info"
-
-    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
-    owner_id: Mapped[str] = mapped_column(String(128), nullable=False, default="")
-    name: Mapped[str] = mapped_column(String(512), nullable=False)
-    asset_id: Mapped[str] = mapped_column(String(36), ForeignKey("assets.id", ondelete="RESTRICT"), nullable=False)
-    preview_id: Mapped[str | None] = mapped_column(String(36), ForeignKey("assets.id", ondelete="SET NULL"))
-    user_metadata: Mapped[dict[str, Any] | None] = mapped_column(JSON(none_as_null=True))
-    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
-    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
-    last_access_time: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
-
-    asset: Mapped[Asset] = relationship(
-        "Asset",
-        back_populates="infos",
-        foreign_keys=[asset_id],
-        lazy="selectin",
-    )
-    preview_asset: Mapped[Asset | None] = relationship(
-        "Asset",
-        back_populates="preview_of",
-        foreign_keys=[preview_id],
-    )
-
-    metadata_entries: Mapped[list[AssetInfoMeta]] = relationship(
-        back_populates="asset_info",
-        cascade="all,delete-orphan",
-        passive_deletes=True,
-    )
-
-    tag_links: Mapped[list[AssetInfoTag]] = relationship(
-        back_populates="asset_info",
-        cascade="all,delete-orphan",
-        passive_deletes=True,
-        overlaps="tags,asset_infos",
-    )
-
-    tags: Mapped[list[Tag]] = relationship(
-        secondary="asset_info_tags",
-        back_populates="asset_infos",
-        lazy="selectin",
-        viewonly=True,
-        overlaps="tag_links,asset_info_links,asset_infos,tag",
-    )
-
-    __table_args__ = (
-        UniqueConstraint("asset_id", "owner_id", "name", name="uq_assets_info_asset_owner_name"),
-        Index("ix_assets_info_owner_name", "owner_id", "name"),
-        Index("ix_assets_info_owner_id", "owner_id"),
-        Index("ix_assets_info_asset_id", "asset_id"),
-        Index("ix_assets_info_name", "name"),
-        Index("ix_assets_info_created_at", "created_at"),
-        Index("ix_assets_info_last_access_time", "last_access_time"),
-    )
-
-    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        data = to_dict(self, include_none=include_none)
-        data["tags"] = [t.name for t in self.tags]
-        return data
-
-    def __repr__(self) -> str:
-        return f"<AssetInfo id={self.id} name={self.name!r} asset_id={self.asset_id}>"
-
-
-class AssetInfoMeta(Base):
-    __tablename__ = "asset_info_meta"
-
-    asset_info_id: Mapped[str] = mapped_column(
-        String(36), ForeignKey("assets_info.id", ondelete="CASCADE"), primary_key=True
-    )
-    key: Mapped[str] = mapped_column(String(256), primary_key=True)
-    ordinal: Mapped[int] = mapped_column(Integer, primary_key=True, default=0)
-
-    val_str: Mapped[str | None] = mapped_column(String(2048), nullable=True)
-    val_num: Mapped[float | None] = mapped_column(Numeric(38, 10), nullable=True)
-    val_bool: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
-    val_json: Mapped[Any | None] = mapped_column(JSON(none_as_null=True), nullable=True)
-
-    asset_info: Mapped[AssetInfo] = relationship(back_populates="metadata_entries")
-
-    __table_args__ = (
-        Index("ix_asset_info_meta_key", "key"),
-        Index("ix_asset_info_meta_key_val_str", "key", "val_str"),
-        Index("ix_asset_info_meta_key_val_num", "key", "val_num"),
-        Index("ix_asset_info_meta_key_val_bool", "key", "val_bool"),
-    )
-
-
-class AssetInfoTag(Base):
-    __tablename__ = "asset_info_tags"
-
-    asset_info_id: Mapped[str] = mapped_column(
-        String(36), ForeignKey("assets_info.id", ondelete="CASCADE"), primary_key=True
-    )
-    tag_name: Mapped[str] = mapped_column(
-        String(512), ForeignKey("tags.name", ondelete="RESTRICT"), primary_key=True
-    )
-    origin: Mapped[str] = mapped_column(String(32), nullable=False, default="manual")
-    added_at: Mapped[datetime] = mapped_column(
-        DateTime(timezone=False), nullable=False, default=utcnow
-    )
-
-    asset_info: Mapped[AssetInfo] = relationship(back_populates="tag_links")
-    tag: Mapped[Tag] = relationship(back_populates="asset_info_links")
-
-    __table_args__ = (
-        Index("ix_asset_info_tags_tag_name", "tag_name"),
-        Index("ix_asset_info_tags_asset_info_id", "asset_info_id"),
-    )
-
-
-class Tag(Base):
-    __tablename__ = "tags"
-
-    name: Mapped[str] = mapped_column(String(512), primary_key=True)
-    tag_type: Mapped[str] = mapped_column(String(32), nullable=False, default="user")
-
-    asset_info_links: Mapped[list[AssetInfoTag]] = relationship(
-        back_populates="tag",
-        overlaps="asset_infos,tags",
-    )
-    asset_infos: Mapped[list[AssetInfo]] = relationship(
-        secondary="asset_info_tags",
-        back_populates="tags",
-        viewonly=True,
-        overlaps="asset_info_links,tag_links,tags,asset_info",
-    )
-
-    __table_args__ = (
-        Index("ix_tags_tag_type", "tag_type"),
-    )
-
-    def __repr__(self) -> str:
-        return f"<Tag {self.name}>"
--- a/app/assets/database/queries.py
+++ b/app/assets/database/queries.py
@ -1,267 +0,0 @@
-import sqlalchemy as sa
-from collections import defaultdict
-from sqlalchemy import select, exists, func
-from sqlalchemy.orm import Session, contains_eager, noload
-from app.assets.database.models import Asset, AssetInfo, AssetInfoMeta, AssetInfoTag, Tag
-from app.assets.helpers import escape_like_prefix, normalize_tags
-from typing import Sequence
-
-
-def visible_owner_clause(owner_id: str) -> sa.sql.ClauseElement:
-    """Build owner visibility predicate for reads. Owner-less rows are visible to everyone."""
-    owner_id = (owner_id or "").strip()
-    if owner_id == "":
-        return AssetInfo.owner_id == ""
-    return AssetInfo.owner_id.in_(["", owner_id])
-
-
-def apply_tag_filters(
-    stmt: sa.sql.Select,
-    include_tags: Sequence[str] | None = None,
-    exclude_tags: Sequence[str] | None = None,
-) -> sa.sql.Select:
-    """include_tags: every tag must be present; exclude_tags: none may be present."""
-    include_tags = normalize_tags(include_tags)
-    exclude_tags = normalize_tags(exclude_tags)
-
-    if include_tags:
-        for tag_name in include_tags:
-            stmt = stmt.where(
-                exists().where(
-                    (AssetInfoTag.asset_info_id == AssetInfo.id)
-                    & (AssetInfoTag.tag_name == tag_name)
-                )
-            )
-
-    if exclude_tags:
-        stmt = stmt.where(
-            ~exists().where(
-                (AssetInfoTag.asset_info_id == AssetInfo.id)
-                & (AssetInfoTag.tag_name.in_(exclude_tags))
-            )
-        )
-    return stmt
-
-def apply_metadata_filter(
-    stmt: sa.sql.Select,
-    metadata_filter: dict | None = None,
-) -> sa.sql.Select:
-    """Apply filters using asset_info_meta projection table."""
-    if not metadata_filter:
-        return stmt
-
-    def _exists_for_pred(key: str, *preds) -> sa.sql.ClauseElement:
-        return sa.exists().where(
-            AssetInfoMeta.asset_info_id == AssetInfo.id,
-            AssetInfoMeta.key == key,
-            *preds,
-        )
-
-    def _exists_clause_for_value(key: str, value) -> sa.sql.ClauseElement:
-        if value is None:
-            no_row_for_key = sa.not_(
-                sa.exists().where(
-                    AssetInfoMeta.asset_info_id == AssetInfo.id,
-                    AssetInfoMeta.key == key,
-                )
-            )
-            null_row = _exists_for_pred(
-                key,
-                AssetInfoMeta.val_json.is_(None),
-                AssetInfoMeta.val_str.is_(None),
-                AssetInfoMeta.val_num.is_(None),
-                AssetInfoMeta.val_bool.is_(None),
-            )
-            return sa.or_(no_row_for_key, null_row)
-
-        if isinstance(value, bool):
-            return _exists_for_pred(key, AssetInfoMeta.val_bool == bool(value))
-        if isinstance(value, (int, float)):
-            from decimal import Decimal
-            num = value if isinstance(value, Decimal) else Decimal(str(value))
-            return _exists_for_pred(key, AssetInfoMeta.val_num == num)
-        if isinstance(value, str):
-            return _exists_for_pred(key, AssetInfoMeta.val_str == value)
-        return _exists_for_pred(key, AssetInfoMeta.val_json == value)
-
-    for k, v in metadata_filter.items():
-        if isinstance(v, list):
-            ors = [_exists_clause_for_value(k, elem) for elem in v]
-            if ors:
-                stmt = stmt.where(sa.or_(*ors))
-        else:
-            stmt = stmt.where(_exists_clause_for_value(k, v))
-    return stmt
-
-
-def asset_exists_by_hash(session: Session, asset_hash: str) -> bool:
-    """
-    Check if an asset with a given hash exists in database.
-    """
-    row = (
-        session.execute(
-            select(sa.literal(True)).select_from(Asset).where(Asset.hash == asset_hash).limit(1)
-        )
-    ).first()
-    return row is not None
-
-def get_asset_info_by_id(session: Session, asset_info_id: str) -> AssetInfo | None:
-    return session.get(AssetInfo, asset_info_id)
-
-def list_asset_infos_page(
-    session: Session,
-    owner_id: str = "",
-    include_tags: Sequence[str] | None = None,
-    exclude_tags: Sequence[str] | None = None,
-    name_contains: str | None = None,
-    metadata_filter: dict | None = None,
-    limit: int = 20,
-    offset: int = 0,
-    sort: str = "created_at",
-    order: str = "desc",
-) -> tuple[list[AssetInfo], dict[str, list[str]], int]:
-    base = (
-        select(AssetInfo)
-        .join(Asset, Asset.id == AssetInfo.asset_id)
-        .options(contains_eager(AssetInfo.asset), noload(AssetInfo.tags))
-        .where(visible_owner_clause(owner_id))
-    )
-
-    if name_contains:
-        escaped, esc = escape_like_prefix(name_contains)
-        base = base.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
-
-    base = apply_tag_filters(base, include_tags, exclude_tags)
-    base = apply_metadata_filter(base, metadata_filter)
-
-    sort = (sort or "created_at").lower()
-    order = (order or "desc").lower()
-    sort_map = {
-        "name": AssetInfo.name,
-        "created_at": AssetInfo.created_at,
-        "updated_at": AssetInfo.updated_at,
-        "last_access_time": AssetInfo.last_access_time,
-        "size": Asset.size_bytes,
-    }
-    sort_col = sort_map.get(sort, AssetInfo.created_at)
-    sort_exp = sort_col.desc() if order == "desc" else sort_col.asc()
-
-    base = base.order_by(sort_exp).limit(limit).offset(offset)
-
-    count_stmt = (
-        select(sa.func.count())
-        .select_from(AssetInfo)
-        .join(Asset, Asset.id == AssetInfo.asset_id)
-        .where(visible_owner_clause(owner_id))
-    )
-    if name_contains:
-        escaped, esc = escape_like_prefix(name_contains)
-        count_stmt = count_stmt.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
-    count_stmt = apply_tag_filters(count_stmt, include_tags, exclude_tags)
-    count_stmt = apply_metadata_filter(count_stmt, metadata_filter)
-
-    total = int((session.execute(count_stmt)).scalar_one() or 0)
-
-    infos = (session.execute(base)).unique().scalars().all()
-
-    id_list: list[str] = [i.id for i in infos]
-    tag_map: dict[str, list[str]] = defaultdict(list)
-    if id_list:
-        rows = session.execute(
-            select(AssetInfoTag.asset_info_id, Tag.name)
-            .join(Tag, Tag.name == AssetInfoTag.tag_name)
-            .where(AssetInfoTag.asset_info_id.in_(id_list))
-        )
-        for aid, tag_name in rows.all():
-            tag_map[aid].append(tag_name)
-
-    return infos, tag_map, total
-
-def fetch_asset_info_asset_and_tags(
-    session: Session,
-    asset_info_id: str,
-    owner_id: str = "",
-) -> tuple[AssetInfo, Asset, list[str]] | None:
-    stmt = (
-        select(AssetInfo, Asset, Tag.name)
-        .join(Asset, Asset.id == AssetInfo.asset_id)
-        .join(AssetInfoTag, AssetInfoTag.asset_info_id == AssetInfo.id, isouter=True)
-        .join(Tag, Tag.name == AssetInfoTag.tag_name, isouter=True)
-        .where(
-            AssetInfo.id == asset_info_id,
-            visible_owner_clause(owner_id),
-        )
-        .options(noload(AssetInfo.tags))
-        .order_by(Tag.name.asc())
-    )
-
-    rows = (session.execute(stmt)).all()
-    if not rows:
-        return None
-
-    first_info, first_asset, _ = rows[0]
-    tags: list[str] = []
-    seen: set[str] = set()
-    for _info, _asset, tag_name in rows:
-        if tag_name and tag_name not in seen:
-            seen.add(tag_name)
-            tags.append(tag_name)
-    return first_info, first_asset, tags
-
-def list_tags_with_usage(
-    session: Session,
-    prefix: str | None = None,
-    limit: int = 100,
-    offset: int = 0,
-    include_zero: bool = True,
-    order: str = "count_desc",
-    owner_id: str = "",
-) -> tuple[list[tuple[str, str, int]], int]:
-    counts_sq = (
-        select(
-            AssetInfoTag.tag_name.label("tag_name"),
-            func.count(AssetInfoTag.asset_info_id).label("cnt"),
-        )
-        .select_from(AssetInfoTag)
-        .join(AssetInfo, AssetInfo.id == AssetInfoTag.asset_info_id)
-        .where(visible_owner_clause(owner_id))
-        .group_by(AssetInfoTag.tag_name)
-        .subquery()
-    )
-
-    q = (
-        select(
-            Tag.name,
-            Tag.tag_type,
-            func.coalesce(counts_sq.c.cnt, 0).label("count"),
-        )
-        .select_from(Tag)
-        .join(counts_sq, counts_sq.c.tag_name == Tag.name, isouter=True)
-    )
-
-    if prefix:
-        escaped, esc = escape_like_prefix(prefix.strip().lower())
-        q = q.where(Tag.name.like(escaped + "%", escape=esc))
-
-    if not include_zero:
-        q = q.where(func.coalesce(counts_sq.c.cnt, 0) > 0)
-
-    if order == "name_asc":
-        q = q.order_by(Tag.name.asc())
-    else:
-        q = q.order_by(func.coalesce(counts_sq.c.cnt, 0).desc(), Tag.name.asc())
-
-    total_q = select(func.count()).select_from(Tag)
-    if prefix:
-        escaped, esc = escape_like_prefix(prefix.strip().lower())
-        total_q = total_q.where(Tag.name.like(escaped + "%", escape=esc))
-    if not include_zero:
-        total_q = total_q.where(
-            Tag.name.in_(select(AssetInfoTag.tag_name).group_by(AssetInfoTag.tag_name))
-        )
-
-    rows = (session.execute(q.limit(limit).offset(offset))).all()
-    total = (session.execute(total_q)).scalar_one()
-
-    rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
-    return rows_norm, int(total or 0)
--- a/app/assets/database/tags.py
+++ b/app/assets/database/tags.py
@ -1,62 +0,0 @@
-from typing import Iterable
-
-import sqlalchemy
-from sqlalchemy.orm import Session
-from sqlalchemy.dialects import sqlite
-
-from app.assets.helpers import normalize_tags, utcnow
-from app.assets.database.models import Tag, AssetInfoTag, AssetInfo
-
-
-def ensure_tags_exist(session: Session, names: Iterable[str], tag_type: str = "user") -> None:
-    wanted = normalize_tags(list(names))
-    if not wanted:
-        return
-    rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
-    ins = (
-            sqlite.insert(Tag)
-            .values(rows)
-            .on_conflict_do_nothing(index_elements=[Tag.name])
-        )
-    return session.execute(ins)
-
-def add_missing_tag_for_asset_id(
-    session: Session,
-    *,
-    asset_id: str,
-    origin: str = "automatic",
-) -> None:
-    select_rows = (
-        sqlalchemy.select(
-            AssetInfo.id.label("asset_info_id"),
-            sqlalchemy.literal("missing").label("tag_name"),
-            sqlalchemy.literal(origin).label("origin"),
-            sqlalchemy.literal(utcnow()).label("added_at"),
-        )
-        .where(AssetInfo.asset_id == asset_id)
-        .where(
-            sqlalchemy.not_(
-                sqlalchemy.exists().where((AssetInfoTag.asset_info_id == AssetInfo.id) & (AssetInfoTag.tag_name == "missing"))
-            )
-        )
-    )
-    session.execute(
-        sqlite.insert(AssetInfoTag)
-        .from_select(
-            ["asset_info_id", "tag_name", "origin", "added_at"],
-            select_rows,
-        )
-        .on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
-    )
-
-def remove_missing_tag_for_asset_id(
-    session: Session,
-    *,
-    asset_id: str,
-) -> None:
-    session.execute(
-        sqlalchemy.delete(AssetInfoTag).where(
-            AssetInfoTag.asset_info_id.in_(sqlalchemy.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)),
-            AssetInfoTag.tag_name == "missing",
-        )
-    )
--- a/app/assets/hashing.py
+++ b/app/assets/hashing.py
@ -1,75 +0,0 @@
-from blake3 import blake3
-from typing import IO
-import os
-import asyncio
-
-
-DEFAULT_CHUNK = 8 * 1024 *1024 # 8MB
-
-# NOTE: this allows hashing different representations of a file-like object
-def blake3_hash(
-    fp: str | IO[bytes],
-    chunk_size: int = DEFAULT_CHUNK,
-) -> str:
-    """
-    Returns a BLAKE3 hex digest for ``fp``, which may be:
-      - a filename (str/bytes) or PathLike
-      - an open binary file object
-    If ``fp`` is a file object, it must be opened in **binary** mode and support
-    ``read``, ``seek``, and ``tell``. The function will seek to the start before
-    reading and will attempt to restore the original position afterward.
-    """
-    # duck typing to check if input is a file-like object
-    if hasattr(fp, "read"):
-        return _hash_file_obj(fp, chunk_size)
-
-    with open(os.fspath(fp), "rb") as f:
-        return _hash_file_obj(f, chunk_size)
-
-
-async def blake3_hash_async(
-    fp: str | IO[bytes],
-    chunk_size: int = DEFAULT_CHUNK,
-) -> str:
-    """Async wrapper for ``blake3_hash_sync``.
-    Uses a worker thread so the event loop remains responsive.
-    """
-    # If it is a path, open inside the worker thread to keep I/O off the loop.
-    if hasattr(fp, "read"):
-        return await asyncio.to_thread(blake3_hash, fp, chunk_size)
-
-    def _worker() -> str:
-        with open(os.fspath(fp), "rb") as f:
-            return _hash_file_obj(f, chunk_size)
-
-    return await asyncio.to_thread(_worker)
-
-
-def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
-    """
-    Hash an already-open binary file object by streaming in chunks.
-    - Seeks to the beginning before reading (if supported).
-    - Restores the original position afterward (if tell/seek are supported).
-    """
-    if chunk_size <= 0:
-        chunk_size = DEFAULT_CHUNK
-
-    # in case file object is already open and not at the beginning, track so can be restored after hashing
-    orig_pos = file_obj.tell()
-
-    try:
-        # seek to the beginning before reading
-        if orig_pos != 0:
-            file_obj.seek(0)
-
-        h = blake3()
-        while True:
-            chunk = file_obj.read(chunk_size)
-            if not chunk:
-                break
-            h.update(chunk)
-        return h.hexdigest()
-    finally:
-        # restore original position in file object, if needed
-        if orig_pos != 0:
-            file_obj.seek(orig_pos)
--- a/app/assets/helpers.py
+++ b/app/assets/helpers.py
@ -1,217 +0,0 @@
-import contextlib
-import os
-from aiohttp import web
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Literal, Any
-
-import folder_paths
-
-
-RootType = Literal["models", "input", "output"]
-ALLOWED_ROOTS: tuple[RootType, ...] = ("models", "input", "output")
-
-def get_query_dict(request: web.Request) -> dict[str, Any]:
-    """
-    Gets a dictionary of query parameters from the request.
-
-    'request.query' is a MultiMapping[str], needs to be converted to a dictionary to be validated by Pydantic.
-    """
-    query_dict = {
-        key: request.query.getall(key) if len(request.query.getall(key)) > 1 else request.query.get(key)
-        for key in request.query.keys()
-    }
-    return query_dict
-
-def list_tree(base_dir: str) -> list[str]:
-    out: list[str] = []
-    base_abs = os.path.abspath(base_dir)
-    if not os.path.isdir(base_abs):
-        return out
-    for dirpath, _subdirs, filenames in os.walk(base_abs, topdown=True, followlinks=False):
-        for name in filenames:
-            out.append(os.path.abspath(os.path.join(dirpath, name)))
-    return out
-
-def prefixes_for_root(root: RootType) -> list[str]:
-    if root == "models":
-        bases: list[str] = []
-        for _bucket, paths in get_comfy_models_folders():
-            bases.extend(paths)
-        return [os.path.abspath(p) for p in bases]
-    if root == "input":
-        return [os.path.abspath(folder_paths.get_input_directory())]
-    if root == "output":
-        return [os.path.abspath(folder_paths.get_output_directory())]
-    return []
-
-def escape_like_prefix(s: str, escape: str = "!") -> tuple[str, str]:
-    """Escapes %, _ and the escape char itself in a LIKE prefix.
-    Returns (escaped_prefix, escape_char). Caller should append '%' and pass escape=escape_char to .like().
-    """
-    s = s.replace(escape, escape + escape)  # escape the escape char first
-    s = s.replace("%", escape + "%").replace("_", escape + "_")  # escape LIKE wildcards
-    return s, escape
-
-def fast_asset_file_check(
-    *,
-    mtime_db: int | None,
-    size_db: int | None,
-    stat_result: os.stat_result,
-) -> bool:
-    if mtime_db is None:
-        return False
-    actual_mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))
-    if int(mtime_db) != int(actual_mtime_ns):
-        return False
-    sz = int(size_db or 0)
-    if sz > 0:
-        return int(stat_result.st_size) == sz
-    return True
-
-def utcnow() -> datetime:
-    """Naive UTC timestamp (no tzinfo). We always treat DB datetimes as UTC."""
-    return datetime.now(timezone.utc).replace(tzinfo=None)
-
-def get_comfy_models_folders() -> list[tuple[str, list[str]]]:
-    """Build a list of (folder_name, base_paths[]) categories that are configured for model locations.
-
-    We trust `folder_paths.folder_names_and_paths` and include a category if
-    *any* of its base paths lies under the Comfy `models_dir`.
-    """
-    targets: list[tuple[str, list[str]]] = []
-    models_root = os.path.abspath(folder_paths.models_dir)
-    for name, values in folder_paths.folder_names_and_paths.items():
-        paths, _exts = values[0], values[1]  # NOTE: this prevents nodepacks that hackily edit folder_... from breaking ComfyUI
-        if any(os.path.abspath(p).startswith(models_root + os.sep) for p in paths):
-            targets.append((name, paths))
-    return targets
-
-def compute_relative_filename(file_path: str) -> str | None:
-    """
-    Return the model's path relative to the last well-known folder (the model category),
-    using forward slashes, eg:
-      /.../models/checkpoints/flux/123/flux.safetensors -> "flux/123/flux.safetensors"
-      /.../models/text_encoders/clip_g.safetensors -> "clip_g.safetensors"
-
-    For non-model paths, returns None.
-    NOTE: this is a temporary helper, used only for initializing metadata["filename"] field.
-    """
-    try:
-        root_category, rel_path = get_relative_to_root_category_path_of_asset(file_path)
-    except ValueError:
-        return None
-
-    p = Path(rel_path)
-    parts = [seg for seg in p.parts if seg not in (".", "..", p.anchor)]
-    if not parts:
-        return None
-
-    if root_category == "models":
-        # parts[0] is the category ("checkpoints", "vae", etc) – drop it
-        inside = parts[1:] if len(parts) > 1 else [parts[0]]
-        return "/".join(inside)
-    return "/".join(parts)  # input/output: keep all parts
-
-
-def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal["input", "output", "models"], str]:
-    """Given an absolute or relative file path, determine which root category the path belongs to:
-      - 'input' if the file resides under `folder_paths.get_input_directory()`
-      - 'output' if the file resides under `folder_paths.get_output_directory()`
-      - 'models' if the file resides under any base path of categories returned by `get_comfy_models_folders()`
-
-    Returns:
-        (root_category, relative_path_inside_that_root)
-        For 'models', the relative path is prefixed with the category name:
-            e.g. ('models', 'vae/test/sub/ae.safetensors')
-
-    Raises:
-        ValueError: if the path does not belong to input, output, or configured model bases.
-    """
-    fp_abs = os.path.abspath(file_path)
-
-    def _is_within(child: str, parent: str) -> bool:
-        try:
-            return os.path.commonpath([child, parent]) == parent
-        except Exception:
-            return False
-
-    def _rel(child: str, parent: str) -> str:
-        return os.path.relpath(os.path.join(os.sep, os.path.relpath(child, parent)), os.sep)
-
-    # 1) input
-    input_base = os.path.abspath(folder_paths.get_input_directory())
-    if _is_within(fp_abs, input_base):
-        return "input", _rel(fp_abs, input_base)
-
-    # 2) output
-    output_base = os.path.abspath(folder_paths.get_output_directory())
-    if _is_within(fp_abs, output_base):
-        return "output", _rel(fp_abs, output_base)
-
-    # 3) models (check deepest matching base to avoid ambiguity)
-    best: tuple[int, str, str] | None = None  # (base_len, bucket, rel_inside_bucket)
-    for bucket, bases in get_comfy_models_folders():
-        for b in bases:
-            base_abs = os.path.abspath(b)
-            if not _is_within(fp_abs, base_abs):
-                continue
-            cand = (len(base_abs), bucket, _rel(fp_abs, base_abs))
-            if best is None or cand[0] > best[0]:
-                best = cand
-
-    if best is not None:
-        _, bucket, rel_inside = best
-        combined = os.path.join(bucket, rel_inside)
-        return "models", os.path.relpath(os.path.join(os.sep, combined), os.sep)
-
-    raise ValueError(f"Path is not within input, output, or configured model bases: {file_path}")
-
-def get_name_and_tags_from_asset_path(file_path: str) -> tuple[str, list[str]]:
-    """Return a tuple (name, tags) derived from a filesystem path.
-
-    Semantics:
-      - Root category is determined by `get_relative_to_root_category_path_of_asset`.
-      - The returned `name` is the base filename with extension from the relative path.
-      - The returned `tags` are:
-            [root_category] + parent folders of the relative path (in order)
-        For 'models', this means:
-            file '/.../ModelsDir/vae/test_tag/ae.safetensors'
-            -> root_category='models', some_path='vae/test_tag/ae.safetensors'
-            -> name='ae.safetensors', tags=['models', 'vae', 'test_tag']
-
-    Raises:
-        ValueError: if the path does not belong to input, output, or configured model bases.
-    """
-    root_category, some_path = get_relative_to_root_category_path_of_asset(file_path)
-    p = Path(some_path)
-    parent_parts = [part for part in p.parent.parts if part not in (".", "..", p.anchor)]
-    return p.name, list(dict.fromkeys(normalize_tags([root_category, *parent_parts])))
-
-def normalize_tags(tags: list[str] | None) -> list[str]:
-    """
-    Normalize a list of tags by:
-      - Stripping whitespace and converting to lowercase.
-      - Removing duplicates.
-    """
-    return [t.strip().lower() for t in (tags or []) if (t or "").strip()]
-
-def collect_models_files() -> list[str]:
-    out: list[str] = []
-    for folder_name, bases in get_comfy_models_folders():
-        rel_files = folder_paths.get_filename_list(folder_name) or []
-        for rel_path in rel_files:
-            abs_path = folder_paths.get_full_path(folder_name, rel_path)
-            if not abs_path:
-                continue
-            abs_path = os.path.abspath(abs_path)
-            allowed = False
-            for b in bases:
-                base_abs = os.path.abspath(b)
-                with contextlib.suppress(Exception):
-                    if os.path.commonpath([abs_path, base_abs]) == base_abs:
-                        allowed = True
-                        break
-            if allowed:
-                out.append(abs_path)
-    return out
--- a/app/assets/manager.py
+++ b/app/assets/manager.py
@ -1,123 +0,0 @@
-from typing import Sequence
-
-from app.database.db import create_session
-from app.assets.api import schemas_out
-from app.assets.database.queries import (
-    asset_exists_by_hash,
-    fetch_asset_info_asset_and_tags,
-    list_asset_infos_page,
-    list_tags_with_usage,
-)
-
-
-def _safe_sort_field(requested: str | None) -> str:
-    if not requested:
-        return "created_at"
-    v = requested.lower()
-    if v in {"name", "created_at", "updated_at", "size", "last_access_time"}:
-        return v
-    return "created_at"
-
-
-def asset_exists(asset_hash: str) -> bool:
-    with create_session() as session:
-        return asset_exists_by_hash(session, asset_hash=asset_hash)
-
-def list_assets(
-    include_tags: Sequence[str] | None = None,
-    exclude_tags: Sequence[str] | None = None,
-    name_contains: str | None = None,
-    metadata_filter: dict | None = None,
-    limit: int = 20,
-    offset: int = 0,
-    sort: str = "created_at",
-    order: str = "desc",
-    owner_id: str = "",
-) -> schemas_out.AssetsList:
-    sort = _safe_sort_field(sort)
-    order = "desc" if (order or "desc").lower() not in {"asc", "desc"} else order.lower()
-
-    with create_session() as session:
-        infos, tag_map, total = list_asset_infos_page(
-            session,
-            owner_id=owner_id,
-            include_tags=include_tags,
-            exclude_tags=exclude_tags,
-            name_contains=name_contains,
-            metadata_filter=metadata_filter,
-            limit=limit,
-            offset=offset,
-            sort=sort,
-            order=order,
-        )
-
-    summaries: list[schemas_out.AssetSummary] = []
-    for info in infos:
-        asset = info.asset
-        tags = tag_map.get(info.id, [])
-        summaries.append(
-            schemas_out.AssetSummary(
-                id=info.id,
-                name=info.name,
-                asset_hash=asset.hash if asset else None,
-                size=int(asset.size_bytes) if asset else None,
-                mime_type=asset.mime_type if asset else None,
-                tags=tags,
-                preview_url=f"/api/assets/{info.id}/content",
-                created_at=info.created_at,
-                updated_at=info.updated_at,
-                last_access_time=info.last_access_time,
-            )
-        )
-
-    return schemas_out.AssetsList(
-        assets=summaries,
-        total=total,
-        has_more=(offset + len(summaries)) < total,
-    )
-
-def get_asset(asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail:
-    with create_session() as session:
-        res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
-        if not res:
-            raise ValueError(f"AssetInfo {asset_info_id} not found")
-        info, asset, tag_names = res
-        preview_id = info.preview_id
-
-    return schemas_out.AssetDetail(
-        id=info.id,
-        name=info.name,
-        asset_hash=asset.hash if asset else None,
-        size=int(asset.size_bytes) if asset and asset.size_bytes is not None else None,
-        mime_type=asset.mime_type if asset else None,
-        tags=tag_names,
-        user_metadata=info.user_metadata or {},
-        preview_id=preview_id,
-        created_at=info.created_at,
-        last_access_time=info.last_access_time,
-    )
-
-def list_tags(
-    prefix: str | None = None,
-    limit: int = 100,
-    offset: int = 0,
-    order: str = "count_desc",
-    include_zero: bool = True,
-    owner_id: str = "",
-) -> schemas_out.TagsList:
-    limit = max(1, min(1000, limit))
-    offset = max(0, offset)
-
-    with create_session() as session:
-        rows, total = list_tags_with_usage(
-            session,
-            prefix=prefix,
-            limit=limit,
-            offset=offset,
-            include_zero=include_zero,
-            order=order,
-            owner_id=owner_id,
-        )
-
-    tags = [schemas_out.TagUsage(name=name, count=count, type=tag_type) for (name, tag_type, count) in rows]
-    return schemas_out.TagsList(tags=tags, total=total, has_more=(offset + len(tags)) < total)
--- a/app/assets/scanner.py
+++ b/app/assets/scanner.py
@ -1,229 +0,0 @@
-import contextlib
-import time
-import logging
-import os
-import sqlalchemy
-
-import folder_paths
-from app.database.db import create_session, dependencies_available
-from app.assets.helpers import (
-    collect_models_files, compute_relative_filename, fast_asset_file_check, get_name_and_tags_from_asset_path,
-    list_tree,prefixes_for_root, escape_like_prefix,
-    RootType
-)
-from app.assets.database.tags import add_missing_tag_for_asset_id, ensure_tags_exist, remove_missing_tag_for_asset_id
-from app.assets.database.bulk_ops import seed_from_paths_batch
-from app.assets.database.models import Asset, AssetCacheState, AssetInfo
-
-
-def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> None:
-    """
-    Scan the given roots and seed the assets into the database.
-    """
-    if not dependencies_available():
-        if enable_logging:
-            logging.warning("Database dependencies not available, skipping assets scan")
-        return
-    t_start = time.perf_counter()
-    created = 0
-    skipped_existing = 0
-    paths: list[str] = []
-    try:
-        existing_paths: set[str] = set()
-        for r in roots:
-            try:
-                survivors: set[str] = _fast_db_consistency_pass(r, collect_existing_paths=True, update_missing_tags=True)
-                if survivors:
-                    existing_paths.update(survivors)
-            except Exception as e:
-                logging.exception("fast DB scan failed for %s: %s", r, e)
-
-        if "models" in roots:
-            paths.extend(collect_models_files())
-        if "input" in roots:
-            paths.extend(list_tree(folder_paths.get_input_directory()))
-        if "output" in roots:
-            paths.extend(list_tree(folder_paths.get_output_directory()))
-
-        specs: list[dict] = []
-        tag_pool: set[str] = set()
-        for p in paths:
-            abs_p = os.path.abspath(p)
-            if abs_p in existing_paths:
-                skipped_existing += 1
-                continue
-            try:
-                stat_p = os.stat(abs_p, follow_symlinks=False)
-            except OSError:
-                continue
-            # skip empty files
-            if not stat_p.st_size:
-                continue
-            name, tags = get_name_and_tags_from_asset_path(abs_p)
-            specs.append(
-                {
-                    "abs_path": abs_p,
-                    "size_bytes": stat_p.st_size,
-                    "mtime_ns": getattr(stat_p, "st_mtime_ns", int(stat_p.st_mtime * 1_000_000_000)),
-                    "info_name": name,
-                    "tags": tags,
-                    "fname": compute_relative_filename(abs_p),
-                }
-            )
-            for t in tags:
-                tag_pool.add(t)
-        # if no file specs, nothing to do
-        if not specs:
-            return
-        with create_session() as sess:
-            if tag_pool:
-                ensure_tags_exist(sess, tag_pool, tag_type="user")
-
-            result = seed_from_paths_batch(sess, specs=specs, owner_id="")
-            created += result["inserted_infos"]
-            sess.commit()
-    finally:
-        if enable_logging:
-            logging.info(
-                "Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, total_seen=%d)",
-                roots,
-                time.perf_counter() - t_start,
-                created,
-                skipped_existing,
-                len(paths),
-            )
-
-
-def _fast_db_consistency_pass(
-    root: RootType,
-    *,
-    collect_existing_paths: bool = False,
-    update_missing_tags: bool = False,
-) -> set[str] | None:
-    """Fast DB+FS pass for a root:
-      - Toggle needs_verify per state using fast check
-      - For hashed assets with at least one fast-ok state in this root: delete stale missing states
-      - For seed assets with all states missing: delete Asset and its AssetInfos
-      - Optionally add/remove 'missing' tags based on fast-ok in this root
-      - Optionally return surviving absolute paths
-    """
-    prefixes = prefixes_for_root(root)
-    if not prefixes:
-        return set() if collect_existing_paths else None
-
-    conds = []
-    for p in prefixes:
-        base = os.path.abspath(p)
-        if not base.endswith(os.sep):
-            base += os.sep
-        escaped, esc = escape_like_prefix(base)
-        conds.append(AssetCacheState.file_path.like(escaped + "%", escape=esc))
-
-    with create_session() as sess:
-        rows = (
-            sess.execute(
-                sqlalchemy.select(
-                    AssetCacheState.id,
-                    AssetCacheState.file_path,
-                    AssetCacheState.mtime_ns,
-                    AssetCacheState.needs_verify,
-                    AssetCacheState.asset_id,
-                    Asset.hash,
-                    Asset.size_bytes,
-                )
-                .join(Asset, Asset.id == AssetCacheState.asset_id)
-                .where(sqlalchemy.or_(*conds))
-                .order_by(AssetCacheState.asset_id.asc(), AssetCacheState.id.asc())
-            )
-        ).all()
-
-        by_asset: dict[str, dict] = {}
-        for sid, fp, mtime_db, needs_verify, aid, a_hash, a_size in rows:
-            acc = by_asset.get(aid)
-            if acc is None:
-                acc = {"hash": a_hash, "size_db": int(a_size or 0), "states": []}
-                by_asset[aid] = acc
-
-            fast_ok = False
-            try:
-                exists = True
-                fast_ok = fast_asset_file_check(
-                    mtime_db=mtime_db,
-                    size_db=acc["size_db"],
-                    stat_result=os.stat(fp, follow_symlinks=True),
-                )
-            except FileNotFoundError:
-                exists = False
-            except OSError:
-                exists = False
-
-            acc["states"].append({
-                "sid": sid,
-                "fp": fp,
-                "exists": exists,
-                "fast_ok": fast_ok,
-                "needs_verify": bool(needs_verify),
-            })
-
-        to_set_verify: list[int] = []
-        to_clear_verify: list[int] = []
-        stale_state_ids: list[int] = []
-        survivors: set[str] = set()
-
-        for aid, acc in by_asset.items():
-            a_hash = acc["hash"]
-            states = acc["states"]
-            any_fast_ok = any(s["fast_ok"] for s in states)
-            all_missing = all(not s["exists"] for s in states)
-
-            for s in states:
-                if not s["exists"]:
-                    continue
-                if s["fast_ok"] and s["needs_verify"]:
-                    to_clear_verify.append(s["sid"])
-                if not s["fast_ok"] and not s["needs_verify"]:
-                    to_set_verify.append(s["sid"])
-
-            if a_hash is None:
-                if states and all_missing:  # remove seed Asset completely, if no valid AssetCache exists
-                    sess.execute(sqlalchemy.delete(AssetInfo).where(AssetInfo.asset_id == aid))
-                    asset = sess.get(Asset, aid)
-                    if asset:
-                        sess.delete(asset)
-                else:
-                    for s in states:
-                        if s["exists"]:
-                            survivors.add(os.path.abspath(s["fp"]))
-                continue
-
-            if any_fast_ok:  # if Asset has at least one valid AssetCache record, remove any invalid AssetCache records
-                for s in states:
-                    if not s["exists"]:
-                        stale_state_ids.append(s["sid"])
-                if update_missing_tags:
-                    with contextlib.suppress(Exception):
-                        remove_missing_tag_for_asset_id(sess, asset_id=aid)
-            elif update_missing_tags:
-                with contextlib.suppress(Exception):
-                    add_missing_tag_for_asset_id(sess, asset_id=aid, origin="automatic")
-
-            for s in states:
-                if s["exists"]:
-                    survivors.add(os.path.abspath(s["fp"]))
-
-        if stale_state_ids:
-            sess.execute(sqlalchemy.delete(AssetCacheState).where(AssetCacheState.id.in_(stale_state_ids)))
-        if to_set_verify:
-            sess.execute(
-                sqlalchemy.update(AssetCacheState)
-                .where(AssetCacheState.id.in_(to_set_verify))
-                .values(needs_verify=True)
-            )
-        if to_clear_verify:
-            sess.execute(
-                sqlalchemy.update(AssetCacheState)
-                .where(AssetCacheState.id.in_(to_clear_verify))
-                .values(needs_verify=False)
-            )
-        sess.commit()
-        return survivors if collect_existing_paths else None
--- a/app/database/models.py
+++ b/app/database/models.py
@ -1,21 +1,14 @@
-from typing import Any
-from datetime import datetime
-from sqlalchemy.orm import DeclarativeBase
+from sqlalchemy.orm import declarative_base

-class Base(DeclarativeBase):
-    pass
+Base = declarative_base()

-def to_dict(obj: Any, include_none: bool = False) -> dict[str, Any]:
+
+def to_dict(obj):
    fields = obj.__table__.columns.keys()
-    out: dict[str, Any] = {}
-    for field in fields:
-        val = getattr(obj, field)
-        if val is None and not include_none:
-            continue
-        if isinstance(val, datetime):
-            out[field] = val.isoformat()
-        else:
-            out[field] = val
-    return out
+    return {
+        field: (val.to_dict() if hasattr(val, "to_dict") else val)
+        for field in fields
+        if (val := getattr(obj, field))
+    }

 # TODO: Define models here
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -10,8 +10,7 @@ import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import Dict, TypedDict, Optional
-from aiohttp import web
+from typing import TypedDict, Optional
 from importlib.metadata import version

 import requests
@ -43,7 +42,6 @@ def get_installed_frontend_version():
    frontend_version_str = version("comfyui-frontend-package")
    return frontend_version_str

-
 def get_required_frontend_version():
    """Get the required frontend version from requirements.txt."""
    try:
@ -65,7 +63,6 @@ def get_required_frontend_version():
        logging.error(f"Error reading requirements.txt: {e}")
        return None

-
 def check_frontend_version():
    """Check if the frontend version is up to date."""

@ -206,37 +203,6 @@ class FrontendManager:
        """Get the required frontend package version."""
        return get_required_frontend_version()

-    @classmethod
-    def get_installed_templates_version(cls) -> str:
-        """Get the currently installed workflow templates package version."""
-        try:
-            templates_version_str = version("comfyui-workflow-templates")
-            return templates_version_str
-        except Exception:
-            return None
-
-    @classmethod
-    def get_required_templates_version(cls) -> str:
-        """Get the required workflow templates version from requirements.txt."""
-        try:
-            with open(requirements_path, "r", encoding="utf-8") as f:
-                for line in f:
-                    line = line.strip()
-                    if line.startswith("comfyui-workflow-templates=="):
-                        version_str = line.split("==")[-1]
-                        if not is_valid_version(version_str):
-                            logging.error(f"Invalid templates version format in requirements.txt: {version_str}")
-                            return None
-                        return version_str
-                logging.error("comfyui-workflow-templates not found in requirements.txt")
-                return None
-        except FileNotFoundError:
-            logging.error("requirements.txt not found. Cannot determine required templates version.")
-            return None
-        except Exception as e:
-            logging.error(f"Error reading requirements.txt: {e}")
-            return None
-
    @classmethod
    def default_frontend_path(cls) -> str:
        try:
@ -258,54 +224,7 @@ comfyui-frontend-package is not installed.
            sys.exit(-1)

    @classmethod
-    def template_asset_map(cls) -> Optional[Dict[str, str]]:
-        """Return a mapping of template asset names to their absolute paths."""
-        try:
-            from comfyui_workflow_templates import (
-                get_asset_path,
-                iter_templates,
-            )
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-workflow-templates is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-            return None
-
-        try:
-            template_entries = list(iter_templates())
-        except Exception as exc:
-            logging.error(f"Failed to enumerate workflow templates: {exc}")
-            return None
-
-        asset_map: Dict[str, str] = {}
-        try:
-            for entry in template_entries:
-                for asset in entry.assets:
-                    asset_map[asset.filename] = get_asset_path(
-                        entry.template_id, asset.filename
-                    )
-        except Exception as exc:
-            logging.error(f"Failed to resolve template asset paths: {exc}")
-            return None
-
-        if not asset_map:
-            logging.error("No workflow template assets found. Did the packages install correctly?")
-            return None
-
-        return asset_map
-
-
-    @classmethod
-    def legacy_templates_path(cls) -> Optional[str]:
-        """Return the legacy templates directory shipped inside the meta package."""
+    def templates_path(cls) -> str:
        try:
            import comfyui_workflow_templates

@ -324,7 +243,6 @@ comfyui-workflow-templates is not installed.
 ********** ERROR ***********
 """.strip()
            )
-            return None

    @classmethod
    def embedded_docs_path(cls) -> str:
@ -441,17 +359,3 @@ comfyui-workflow-templates is not installed.
            logging.info("Falling back to the default frontend.")
            check_frontend_version()
            return cls.default_frontend_path()
-    @classmethod
-    def template_asset_handler(cls):
-        assets = cls.template_asset_map()
-        if not assets:
-            return None
-
-        async def serve_template(request: web.Request) -> web.StreamResponse:
-            rel_path = request.match_info.get("path", "")
-            target = assets.get(rel_path)
-            if target is None:
-                raise web.HTTPNotFound()
-            return web.FileResponse(target)
-
-        return serve_template
--- a/app/model_manager.py
+++ b/app/model_manager.py
@ -44,7 +44,7 @@ class ModelFileManager:
        @routes.get("/experiment/models/{folder}")
        async def get_all_models(request):
            folder = request.match_info.get("folder", None)
-            if folder not in folder_paths.folder_names_and_paths:
+            if not folder in folder_paths.folder_names_and_paths:
                return web.Response(status=404)
            files = self.get_model_file_list(folder)
            return web.json_response(files)
@ -55,7 +55,7 @@ class ModelFileManager:
            path_index = int(request.match_info.get("path_index", None))
            filename = request.match_info.get("filename", None)

-            if folder_name not in folder_paths.folder_names_and_paths:
+            if not folder_name in folder_paths.folder_names_and_paths:
                return web.Response(status=404)

            folders = folder_paths.folder_names_and_paths[folder_name]
--- a/app/subgraph_manager.py
+++ b/app/subgraph_manager.py
@ -1,112 +0,0 @@
-from __future__ import annotations
-
-from typing import TypedDict
-import os
-import folder_paths
-import glob
-from aiohttp import web
-import hashlib
-
-
-class Source:
-    custom_node = "custom_node"
-
-class SubgraphEntry(TypedDict):
-    source: str
-    """
-    Source of subgraph - custom_nodes vs templates.
-    """
-    path: str
-    """
-    Relative path of the subgraph file.
-    For custom nodes, will be the relative directory like <custom_node_dir>/subgraphs/<name>.json
-    """
-    name: str
-    """
-    Name of subgraph file.
-    """
-    info: CustomNodeSubgraphEntryInfo
-    """
-    Additional info about subgraph; in the case of custom_nodes, will contain nodepack name
-    """
-    data: str
-
-class CustomNodeSubgraphEntryInfo(TypedDict):
-    node_pack: str
-    """Node pack name."""
-
-class SubgraphManager:
-    def __init__(self):
-        self.cached_custom_node_subgraphs: dict[SubgraphEntry] | None = None
-
-    async def load_entry_data(self, entry: SubgraphEntry):
-        with open(entry['path'], 'r') as f:
-            entry['data'] = f.read()
-        return entry
-
-    async def sanitize_entry(self, entry: SubgraphEntry | None, remove_data=False) -> SubgraphEntry | None:
-        if entry is None:
-            return None
-        entry = entry.copy()
-        entry.pop('path', None)
-        if remove_data:
-            entry.pop('data', None)
-        return entry
-
-    async def sanitize_entries(self, entries: dict[str, SubgraphEntry], remove_data=False) -> dict[str, SubgraphEntry]:
-        entries = entries.copy()
-        for key in list(entries.keys()):
-            entries[key] = await self.sanitize_entry(entries[key], remove_data)
-        return entries
-
-    async def get_custom_node_subgraphs(self, loadedModules, force_reload=False):
-        # if not forced to reload and cached, return cache
-        if not force_reload and self.cached_custom_node_subgraphs is not None:
-            return self.cached_custom_node_subgraphs
-        # Load subgraphs from custom nodes
-        subfolder = "subgraphs"
-        subgraphs_dict: dict[SubgraphEntry] = {}
-
-        for folder in folder_paths.get_folder_paths("custom_nodes"):
-            pattern = os.path.join(folder, f"*/{subfolder}/*.json")
-            matched_files = glob.glob(pattern)
-            for file in matched_files:
-                # replace backslashes with forward slashes
-                file = file.replace('\\', '/')
-                info: CustomNodeSubgraphEntryInfo = {
-                    "node_pack": "custom_nodes." + file.split('/')[-3]
-                }
-                source = Source.custom_node
-                # hash source + path to make sure id will be as unique as possible, but
-                # reproducible across backend reloads
-                id = hashlib.sha256(f"{source}{file}".encode()).hexdigest()
-                entry: SubgraphEntry = {
-                    "source": Source.custom_node,
-                    "name": os.path.splitext(os.path.basename(file))[0],
-                    "path": file,
-                    "info": info,
-                }
-                subgraphs_dict[id] = entry
-        self.cached_custom_node_subgraphs = subgraphs_dict
-        return subgraphs_dict
-
-    async def get_custom_node_subgraph(self, id: str, loadedModules):
-        subgraphs = await self.get_custom_node_subgraphs(loadedModules)
-        entry: SubgraphEntry = subgraphs.get(id, None)
-        if entry is not None and entry.get('data', None) is None:
-            await self.load_entry_data(entry)
-        return entry
-
-    def add_routes(self, routes, loadedModules):
-        @routes.get("/global_subgraphs")
-        async def get_global_subgraphs(request):
-            subgraphs_dict = await self.get_custom_node_subgraphs(loadedModules)
-            # NOTE: we may want to include other sources of global subgraphs such as templates in the future;
-            # that's the reasoning for the current implementation
-            return web.json_response(await self.sanitize_entries(subgraphs_dict, remove_data=True))
-
-        @routes.get("/global_subgraphs/{id}")
-        async def get_global_subgraph(request):
-            id = request.match_info.get("id", None)
-            subgraph = await self.get_custom_node_subgraph(id, loadedModules)
-            return web.json_response(await self.sanitize_entry(subgraph))
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -59,9 +59,6 @@ class UserManager():
        user = "default"
        if args.multi_user and "comfy-user" in request.headers:
            user = request.headers["comfy-user"]
-            # Block System Users (use same error message to prevent probing)
-            if user.startswith(folder_paths.SYSTEM_USER_PREFIX):
-                raise KeyError("Unknown user: " + user)

        if user not in self.users:
            raise KeyError("Unknown user: " + user)
@ -69,16 +66,15 @@ class UserManager():
        return user

    def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
+        user_directory = folder_paths.get_user_directory()
+
        if type == "userdata":
-            root_dir = folder_paths.get_user_directory()
+            root_dir = user_directory
        else:
            raise KeyError("Unknown filepath type:" + type)

        user = self.get_request_user_id(request)
-        user_root = folder_paths.get_public_user_directory(user)
-        if user_root is None:
-            return None
-        path = user_root
+        path = user_root = os.path.abspath(os.path.join(root_dir, user))

        # prevent leaving /{type}
        if os.path.commonpath((root_dir, user_root)) != root_dir:
@ -105,11 +101,7 @@ class UserManager():
        name = name.strip()
        if not name:
            raise ValueError("username not provided")
-        if name.startswith(folder_paths.SYSTEM_USER_PREFIX):
-            raise ValueError("System User prefix not allowed")
        user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
-        if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX):
-            raise ValueError("System User prefix not allowed")
        user_id = user_id + "_" + str(uuid.uuid4())

        self.users[user_id] = name
@ -140,10 +132,7 @@ class UserManager():
            if username in self.users.values():
                return web.json_response({"error": "Duplicate username."}, status=400)

-            try:
-                user_id = self.add_user(username)
-            except ValueError as e:
-                return web.json_response({"error": str(e)}, status=400)
+            user_id = self.add_user(username)
            return web.json_response(user_id)

        @routes.get("/userdata")
@ -435,7 +424,7 @@ class UserManager():
                return source

            dest = get_user_data_path(request, check_exists=False, param="dest")
-            if not isinstance(dest, str):
+            if not isinstance(source, str):
                return dest

            overwrite = request.query.get("overwrite", 'true') != "false"
--- a/comfy/audio_encoders/audio_encoders.py
+++ b/comfy/audio_encoders/audio_encoders.py
@ -1,5 +1,4 @@
 from .wav2vec2 import Wav2Vec2Model
-from .whisper import WhisperLargeV3
 import comfy.model_management
 import comfy.ops
 import comfy.utils
@ -12,18 +11,7 @@ class AudioEncoderModel():
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        model_type = config.pop("model_type")
-        model_config = dict(config)
-        model_config.update({
-            "dtype": self.dtype,
-            "device": offload_device,
-            "operations": comfy.ops.manual_cast
-        })
-
-        if model_type == "wav2vec2":
-            self.model = Wav2Vec2Model(**model_config)
-        elif model_type == "whisper3":
-            self.model = WhisperLargeV3(**model_config)
+        self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast)
        self.model.eval()
        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
        self.model_sample_rate = 16000
@ -41,51 +29,14 @@ class AudioEncoderModel():
        outputs = {}
        outputs["encoded_audio"] = out
        outputs["encoded_audio_all_layers"] = all_layers
-        outputs["audio_samples"] = audio.shape[2]
        return outputs


 def load_audio_encoder_from_sd(sd, prefix=""):
+    audio_encoder = AudioEncoderModel(None)
    sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""})
-    if "encoder.layer_norm.bias" in sd: #wav2vec2
-        embed_dim = sd["encoder.layer_norm.bias"].shape[0]
-        if embed_dim == 1024:# large
-            config = {
-                "model_type": "wav2vec2",
-                "embed_dim": 1024,
-                "num_heads": 16,
-                "num_layers": 24,
-                "conv_norm": True,
-                "conv_bias": True,
-                "do_normalize": True,
-                "do_stable_layer_norm": True
-                }
-        elif embed_dim == 768: # base
-            config = {
-                "model_type": "wav2vec2",
-                "embed_dim": 768,
-                "num_heads": 12,
-                "num_layers": 12,
-                "conv_norm": False,
-                "conv_bias": False,
-                "do_normalize": False, # chinese-wav2vec2-base has this False
-                "do_stable_layer_norm": False
-            }
-        else:
-            raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim))
-    elif "model.encoder.embed_positions.weight" in sd:
-        sd = comfy.utils.state_dict_prefix_replace(sd, {"model.": ""})
-        config = {
-            "model_type": "whisper3",
-        }
-    else:
-        raise RuntimeError("ERROR: audio encoder not supported.")
-
-    audio_encoder = AudioEncoderModel(config)
    m, u = audio_encoder.load_sd(sd)
    if len(m) > 0:
        logging.warning("missing audio encoder: {}".format(m))
-    if len(u) > 0:
-        logging.warning("unexpected audio encoder: {}".format(u))

    return audio_encoder
--- a/comfy/audio_encoders/wav2vec2.py
+++ b/comfy/audio_encoders/wav2vec2.py
@ -13,49 +13,19 @@ class LayerNormConv(nn.Module):
        x = self.conv(x)
        return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1))

-class LayerGroupNormConv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
-        self.layer_norm = operations.GroupNorm(num_groups=out_channels, num_channels=out_channels, affine=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return torch.nn.functional.gelu(self.layer_norm(x))
-
-class ConvNoNorm(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return torch.nn.functional.gelu(x)
-

 class ConvFeatureEncoder(nn.Module):
-    def __init__(self, conv_dim, conv_bias=False, conv_norm=True, dtype=None, device=None, operations=None):
+    def __init__(self, conv_dim, dtype=None, device=None, operations=None):
        super().__init__()
-        if conv_norm:
-            self.conv_layers = nn.ModuleList([
-                LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-            ])
-        else:
-            self.conv_layers = nn.ModuleList([
-                LayerGroupNormConv(1, conv_dim, kernel_size=10, stride=5, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-                ConvNoNorm(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations),
-            ])
+        self.conv_layers = nn.ModuleList([
+            LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+        ])

    def forward(self, x):
        x = x.unsqueeze(1)
@ -106,7 +76,6 @@ class TransformerEncoder(nn.Module):
        num_heads=12,
        num_layers=12,
        mlp_ratio=4.0,
-        do_stable_layer_norm=True,
        dtype=None, device=None, operations=None
    ):
        super().__init__()
@ -117,25 +86,20 @@ class TransformerEncoder(nn.Module):
                embed_dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
-                do_stable_layer_norm=do_stable_layer_norm,
                device=device, dtype=dtype, operations=operations
            )
            for _ in range(num_layers)
        ])

        self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype)
-        self.do_stable_layer_norm = do_stable_layer_norm

    def forward(self, x, mask=None):
        x = x + self.pos_conv_embed(x)
        all_x = ()
-        if not self.do_stable_layer_norm:
-            x = self.layer_norm(x)
        for layer in self.layers:
            all_x += (x,)
            x = layer(x, mask)
-        if self.do_stable_layer_norm:
-            x = self.layer_norm(x)
+        x = self.layer_norm(x)
        all_x += (x,)
        return x, all_x

@ -181,7 +145,6 @@ class TransformerEncoderLayer(nn.Module):
        embed_dim=768,
        num_heads=12,
        mlp_ratio=4.0,
-        do_stable_layer_norm=True,
        dtype=None, device=None, operations=None
    ):
        super().__init__()
@ -191,19 +154,15 @@ class TransformerEncoderLayer(nn.Module):
        self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
        self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
-        self.do_stable_layer_norm = do_stable_layer_norm

    def forward(self, x, mask=None):
        residual = x
-        if self.do_stable_layer_norm:
-            x = self.layer_norm(x)
+        x = self.layer_norm(x)
        x = self.attention(x, mask=mask)
        x = residual + x
-        if not self.do_stable_layer_norm:
-            x = self.layer_norm(x)
-            return self.final_layer_norm(x + self.feed_forward(x))
-        else:
-            return x + self.feed_forward(self.final_layer_norm(x))
+
+        x = x + self.feed_forward(self.final_layer_norm(x))
+        return x


 class Wav2Vec2Model(nn.Module):
@ -215,38 +174,34 @@ class Wav2Vec2Model(nn.Module):
        final_dim=256,
        num_heads=16,
        num_layers=24,
-        conv_norm=True,
-        conv_bias=True,
-        do_normalize=True,
-        do_stable_layer_norm=True,
        dtype=None, device=None, operations=None
    ):
        super().__init__()

        conv_dim = 512
-        self.feature_extractor = ConvFeatureEncoder(conv_dim, conv_norm=conv_norm, conv_bias=conv_bias, device=device, dtype=dtype, operations=operations)
+        self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations)
        self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations)

        self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype))
-        self.do_normalize = do_normalize

        self.encoder = TransformerEncoder(
            embed_dim=embed_dim,
            num_heads=num_heads,
            num_layers=num_layers,
-            do_stable_layer_norm=do_stable_layer_norm,
            device=device, dtype=dtype, operations=operations
        )

    def forward(self, x, mask_time_indices=None, return_dict=False):
+
        x = torch.mean(x, dim=1)

-        if self.do_normalize:
-            x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)
+        x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)

        features = self.feature_extractor(x)
        features = self.feature_projection(features)
+
        batch_size, seq_len, _ = features.shape

        x, all_x = self.encoder(features)
+
        return x, all_x
--- a/comfy/audio_encoders/whisper.py
+++ b/comfy/audio_encoders/whisper.py
@ -1,186 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchaudio
-from typing import Optional
-from comfy.ldm.modules.attention import optimized_attention_masked
-import comfy.ops
-
-class WhisperFeatureExtractor(nn.Module):
-    def __init__(self, n_mels=128, device=None):
-        super().__init__()
-        self.sample_rate = 16000
-        self.n_fft = 400
-        self.hop_length = 160
-        self.n_mels = n_mels
-        self.chunk_length = 30
-        self.n_samples = 480000
-
-        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
-            sample_rate=self.sample_rate,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            f_min=0,
-            f_max=8000,
-            norm="slaney",
-            mel_scale="slaney",
-        ).to(device)
-
-    def __call__(self, audio):
-        audio = torch.mean(audio, dim=1)
-        batch_size = audio.shape[0]
-        processed_audio = []
-
-        for i in range(batch_size):
-            aud = audio[i]
-            if aud.shape[0] > self.n_samples:
-                aud = aud[:self.n_samples]
-            elif aud.shape[0] < self.n_samples:
-                aud = F.pad(aud, (0, self.n_samples - aud.shape[0]))
-            processed_audio.append(aud)
-
-        audio = torch.stack(processed_audio)
-
-        mel_spec = self.mel_spectrogram(audio.to(self.mel_spectrogram.spectrogram.window.device))[:, :, :-1].to(audio.device)
-
-        log_mel_spec = torch.clamp(mel_spec, min=1e-10).log10()
-        log_mel_spec = torch.maximum(log_mel_spec, log_mel_spec.max() - 8.0)
-        log_mel_spec = (log_mel_spec + 4.0) / 4.0
-
-        return log_mel_spec
-
-
-class MultiHeadAttention(nn.Module):
-    def __init__(self, d_model: int, n_heads: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        assert d_model % n_heads == 0
-
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.d_k = d_model // n_heads
-
-        self.q_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device)
-        self.k_proj = operations.Linear(d_model, d_model, bias=False, dtype=dtype, device=device)
-        self.v_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device)
-        self.out_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device)
-
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        batch_size, seq_len, _ = query.shape
-
-        q = self.q_proj(query)
-        k = self.k_proj(key)
-        v = self.v_proj(value)
-
-        attn_output = optimized_attention_masked(q, k, v, self.n_heads, mask)
-        attn_output = self.out_proj(attn_output)
-
-        return attn_output
-
-
-class EncoderLayer(nn.Module):
-    def __init__(self, d_model: int, n_heads: int, d_ff: int, dtype=None, device=None, operations=None):
-        super().__init__()
-
-        self.self_attn = MultiHeadAttention(d_model, n_heads, dtype=dtype, device=device, operations=operations)
-        self.self_attn_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device)
-
-        self.fc1 = operations.Linear(d_model, d_ff, dtype=dtype, device=device)
-        self.fc2 = operations.Linear(d_ff, d_model, dtype=dtype, device=device)
-        self.final_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-        residual = x
-        x = self.self_attn_layer_norm(x)
-        x = self.self_attn(x, x, x, attention_mask)
-        x = residual + x
-
-        residual = x
-        x = self.final_layer_norm(x)
-        x = self.fc1(x)
-        x = F.gelu(x)
-        x = self.fc2(x)
-        x = residual + x
-
-        return x
-
-
-class AudioEncoder(nn.Module):
-    def __init__(
-        self,
-        n_mels: int = 128,
-        n_ctx: int = 1500,
-        n_state: int = 1280,
-        n_head: int = 20,
-        n_layer: int = 32,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-
-        self.conv1 = operations.Conv1d(n_mels, n_state, kernel_size=3, padding=1, dtype=dtype, device=device)
-        self.conv2 = operations.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1, dtype=dtype, device=device)
-
-        self.embed_positions = operations.Embedding(n_ctx, n_state, dtype=dtype, device=device)
-
-        self.layers = nn.ModuleList([
-            EncoderLayer(n_state, n_head, n_state * 4, dtype=dtype, device=device, operations=operations)
-            for _ in range(n_layer)
-        ])
-
-        self.layer_norm = operations.LayerNorm(n_state, dtype=dtype, device=device)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = F.gelu(self.conv1(x))
-        x = F.gelu(self.conv2(x))
-
-        x = x.transpose(1, 2)
-
-        x = x + comfy.ops.cast_to_input(self.embed_positions.weight[:, :x.shape[1]], x)
-
-        all_x = ()
-        for layer in self.layers:
-            all_x += (x,)
-            x = layer(x)
-
-        x = self.layer_norm(x)
-        all_x += (x,)
-        return x, all_x
-
-
-class WhisperLargeV3(nn.Module):
-    def __init__(
-        self,
-        n_mels: int = 128,
-        n_audio_ctx: int = 1500,
-        n_audio_state: int = 1280,
-        n_audio_head: int = 20,
-        n_audio_layer: int = 32,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-
-        self.feature_extractor = WhisperFeatureExtractor(n_mels=n_mels, device=device)
-
-        self.encoder = AudioEncoder(
-            n_mels, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer,
-            dtype=dtype, device=device, operations=operations
-        )
-
-    def forward(self, audio):
-        mel = self.feature_extractor(audio)
-        x, all_x = self.encoder(mel)
-        return x, all_x
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@ -413,8 +413,7 @@ class ControlNet(nn.Module):
        out_middle = []

        if self.num_classes is not None:
-            if y is None:
-                raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?")
+            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

        h = x
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -97,13 +97,6 @@ class LatentPreviewMethod(enum.Enum):
    Latent2RGB = "latent2rgb"
    TAESD = "taesd"

-    @classmethod
-    def from_string(cls, value: str):
-        for member in cls:
-            if member.value == value:
-                return member
-        return None
-
 parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)

 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
@ -112,7 +105,6 @@ cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -128,12 +120,6 @@ upcast.add_argument("--force-upcast-attention", action="store_true", help="Force
 upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")


-parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
-manager_group = parser.add_mutually_exclusive_group()
-manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
-manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
-
-
 vram_group = parser.add_mutually_exclusive_group()
 vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
 vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
@ -144,8 +130,7 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e

 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")

-parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
-parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
+parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")

 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")

@ -160,9 +145,7 @@ class PerformanceFeature(enum.Enum):
    CublasOps = "cublas_ops"
    AutoTune = "autotune"

-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
-
-parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")
@ -174,14 +157,13 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
 parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.")
+parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")

 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
 parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")

-
 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"

@ -231,7 +213,6 @@ database_default_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "user", "comfyui.db")
 )
 parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
-parser.add_argument("--disable-assets-autoscan", action="store_true", help="Disable asset scanning on startup for database synchronization.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@ -2,25 +2,6 @@ import torch
 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.ops

-def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
-    image = image[:, :, :, :3] if image.shape[3] > 3 else image
-    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
-    std = torch.tensor(std, device=image.device, dtype=image.dtype)
-    image = image.movedim(-1, 1)
-    if not (image.shape[2] == size and image.shape[3] == size):
-        if crop:
-            scale = (size / min(image.shape[2], image.shape[3]))
-            scale_size = (round(scale * image.shape[2]), round(scale * image.shape[3]))
-        else:
-            scale_size = (size, size)
-
-        image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
-        h = (image.shape[2] - size)//2
-        w = (image.shape[3] - size)//2
-        image = image[:,:,h:h+size,w:w+size]
-    image = torch.clip((255. * image), 0, 255).round() / 255.0
-    return (image - mean.view([3,1,1])) / std.view([3,1,1])
-
 class CLIPAttention(torch.nn.Module):
    def __init__(self, embed_dim, heads, dtype, device, operations):
        super().__init__()
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -1,5 +1,6 @@
 from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
 import os
+import torch
 import json
 import logging

@ -16,7 +17,24 @@ class Output:
    def __setitem__(self, key, item):
        setattr(self, key, item)

-clip_preprocess = comfy.clip_model.clip_preprocess  # Prevent some stuff from breaking, TODO: remove eventually
+def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
+    image = image[:, :, :, :3] if image.shape[3] > 3 else image
+    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
+    std = torch.tensor(std, device=image.device, dtype=image.dtype)
+    image = image.movedim(-1, 1)
+    if not (image.shape[2] == size and image.shape[3] == size):
+        if crop:
+            scale = (size / min(image.shape[2], image.shape[3]))
+            scale_size = (round(scale * image.shape[2]), round(scale * image.shape[3]))
+        else:
+            scale_size = (size, size)
+
+        image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
+        h = (image.shape[2] - size)//2
+        w = (image.shape[3] - size)//2
+        image = image[:,:,h:h+size,w:w+size]
+    image = torch.clip((255. * image), 0, 255).round() / 255.0
+    return (image - mean.view([3,1,1])) / std.view([3,1,1])

 IMAGE_ENCODERS = {
    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
@ -55,7 +73,7 @@ class ClipVisionModel():

    def encode_image(self, image, crop=True):
        comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
        out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)

        outputs = Output()
--- a/comfy/context_windows.py
+++ b/comfy/context_windows.py
@ -51,43 +51,32 @@ class ContextHandlerABC(ABC):


 class IndexListContextWindow(ContextWindowABC):
-    def __init__(self, index_list: list[int], dim: int=0, total_frames: int=0):
+    def __init__(self, index_list: list[int], dim: int=0):
        self.index_list = index_list
        self.context_length = len(index_list)
        self.dim = dim
-        self.total_frames = total_frames
-        self.center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames)

-    def get_tensor(self, full: torch.Tensor, device=None, dim=None, retain_index_list=[]) -> torch.Tensor:
+    def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
         if dim is None:
             dim = self.dim
         if dim == 0 and full.shape[dim] == 1:
             return full
-        idx = tuple([slice(None)] * dim + [self.index_list])
-        window = full[idx]
-        if retain_index_list:
-            idx = tuple([slice(None)] * dim + [retain_index_list])
-            window[idx] = full[idx]
-        return window.to(device)
+        idx = tuple([slice(None)] * dim + [self.index_list])  # tuple, not list: non-tuple sequences for multi-dim indexing are deprecated in PyTorch
+        return full[idx].to(device)

    def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
        if dim is None:
            dim = self.dim
-        idx = tuple([slice(None)] * dim + [self.index_list])
+        idx = [slice(None)] * dim + [self.index_list]
        full[idx] += to_add
        return full

-    def get_region_index(self, num_regions: int) -> int:
-        region_idx = int(self.center_ratio * num_regions)
-        return min(max(region_idx, 0), num_regions - 1)
-

 class IndexListCallbacks:
    EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
    COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results"
    EXECUTE_START = "execute_start"
    EXECUTE_CLEANUP = "execute_cleanup"
-    RESIZE_COND_ITEM = "resize_cond_item"

    def init_callbacks(self):
        return {}
@ -105,8 +94,7 @@ class ContextFuseMethod:

 ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
 class IndexListContextHandler(ContextHandlerABC):
-    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
-                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
+    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0):
        self.context_schedule = context_schedule
        self.fuse_method = fuse_method
        self.context_length = context_length
@ -115,18 +103,13 @@ class IndexListContextHandler(ContextHandlerABC):
        self.closed_loop = closed_loop
        self.dim = dim
        self._step = 0
-        self.freenoise = freenoise
-        self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
-        self.split_conds_to_windows = split_conds_to_windows

        self.callbacks = {}

    def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
        # for now, assume first dim is batch - should have stored on BaseModel in actual implementation
        if x_in.size(self.dim) > self.context_length:
-            logging.info(f"Using context windows {self.context_length} with overlap {self.context_overlap} for {x_in.size(self.dim)} frames.")
-            if self.cond_retain_index_list:
-                logging.info(f"Retaining original cond for indexes: {self.cond_retain_index_list}")
+            logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.")
            return True
        return False

@ -140,11 +123,6 @@ class IndexListContextHandler(ContextHandlerABC):
            return None
        # reuse or resize cond items to match context requirements
        resized_cond = []
-        # if multiple conds, split based on primary region
-        if self.split_conds_to_windows and len(cond_in) > 1:
-            region = window.get_region_index(len(cond_in))
-            logging.info(f"Splitting conds to windows; using region {region} for window {window.index_list[0]}-{window.index_list[-1]} with center ratio {window.center_ratio:.3f}")
-            cond_in = [cond_in[region]]
        # cond object is a list containing a dict - outer list is irrelevant, so just loop through it
        for actual_cond in cond_in:
            resized_actual_cond = actual_cond.copy()
@ -167,38 +145,15 @@ class IndexListContextHandler(ContextHandlerABC):
                        new_cond_item = cond_item.copy()
                        # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
                        for cond_key, cond_value in new_cond_item.items():
-                            # Allow callbacks to handle custom conditioning items
-                            handled = False
-                            for callback in comfy.patcher_extension.get_all_callbacks(
-                                IndexListCallbacks.RESIZE_COND_ITEM, self.callbacks
-                            ):
-                                result = callback(cond_key, cond_value, window, x_in, device, new_cond_item)
-                                if result is not None:
-                                    new_cond_item[cond_key] = result
-                                    handled = True
-                                    break
-                            if handled:
-                                continue
                            if isinstance(cond_value, torch.Tensor):
                                if (self.dim < cond_value.ndim and cond_value(self.dim) == x_in.size(self.dim)) or \
                                   (cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)):
                                    new_cond_item[cond_key] = window.get_tensor(cond_value, device)
-                            # Handle audio_embed (temporal dim is 1)
-                            elif cond_key == "audio_embed" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-                                audio_cond = cond_value.cond
-                                if audio_cond.ndim > 1 and audio_cond.size(1) == x_in.size(self.dim):
-                                    new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(audio_cond, device, dim=1))
-                            # Handle vace_context (temporal dim is 3)
-                            elif cond_key == "vace_context" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-                                vace_cond = cond_value.cond
-                                if vace_cond.ndim >= 4 and vace_cond.size(3) == x_in.size(self.dim):
-                                    sliced_vace = window.get_tensor(vace_cond, device, dim=3, retain_index_list=self.cond_retain_index_list)
-                                    new_cond_item[cond_key] = cond_value._copy_with(sliced_vace)
                            # if has cond that is a Tensor, check if needs to be subset
                            elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
                                if  (self.dim < cond_value.cond.ndim and cond_value.cond.size(self.dim) == x_in.size(self.dim)) or \
                                    (cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim)):
-                                    new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device, retain_index_list=self.cond_retain_index_list))
+                                    new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
                            elif cond_key == "num_video_frames": # for SVD
                                new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
                                new_cond_item[cond_key].cond = window.context_length
@ -211,7 +166,7 @@ class IndexListContextHandler(ContextHandlerABC):
        return resized_cond

    def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
-        mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep[0], rtol=0.0001)
+        mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001)
        matches = torch.nonzero(mask)
        if torch.numel(matches) == 0:
            raise Exception("No sample_sigmas matched current timestep; something went wrong.")
@ -220,7 +175,7 @@ class IndexListContextHandler(ContextHandlerABC):
    def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
        full_length = x_in.size(self.dim) # TODO: choose dim based on model
        context_windows = self.context_schedule.func(full_length, self, model_options)
-        context_windows = [IndexListContextWindow(window, dim=self.dim, total_frames=full_length) for window in context_windows]
+        context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows]
        return context_windows

    def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
@ -297,8 +252,8 @@ class IndexListContextHandler(ContextHandlerABC):
                    prev_weight = (bias_total / (bias_total + bias))
                    new_weight = (bias / (bias_total + bias))
                    # account for dims of tensors
-                    idx_window = tuple([slice(None)] * self.dim + [idx])
-                    pos_window = tuple([slice(None)] * self.dim + [pos])
+                    idx_window = [slice(None)] * self.dim + [idx]
+                    pos_window = [slice(None)] * self.dim + [pos]
                    # apply new values
                    conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
                    biases_final[i][idx] = bias_total + bias
@ -334,28 +289,6 @@ def create_prepare_sampling_wrapper(model: ModelPatcher):
    )


-def _sampler_sample_wrapper(executor, guider, sigmas, extra_args, callback, noise, *args, **kwargs):
-    model_options = extra_args.get("model_options", None)
-    if model_options is None:
-        raise Exception("model_options not found in sampler_sample_wrapper; this should never happen, something went wrong.")
-    handler: IndexListContextHandler = model_options.get("context_handler", None)
-    if handler is None:
-        raise Exception("context_handler not found in sampler_sample_wrapper; this should never happen, something went wrong.")
-    if not handler.freenoise:
-        return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
-    noise = apply_freenoise(noise, handler.dim, handler.context_length, handler.context_overlap, extra_args["seed"])
-
-    return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
-
-
-def create_sampler_sample_wrapper(model: ModelPatcher):
-    model.add_wrapper_with_key(
-        comfy.patcher_extension.WrappersMP.SAMPLER_SAMPLE,
-        "ContextWindows_sampler_sample",
-        _sampler_sample_wrapper
-    )
-
-
 def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
    total_dims = len(x_in.shape)
    weights_tensor = torch.Tensor(weights).to(device=device)
@ -607,29 +540,3 @@ def shift_window_to_end(window: list[int], num_frames: int):
    for i in range(len(window)):
        # 2) add end_delta to each val to slide windows to end
        window[i] = window[i] + end_delta
-
-
-# https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved/blob/90fb1331201a4b29488089e4fbffc0d82cc6d0a9/animatediff/sample_settings.py#L465
-def apply_freenoise(noise: torch.Tensor, dim: int, context_length: int, context_overlap: int, seed: int):
-    logging.info("Context windows: Applying FreeNoise")
-    generator = torch.Generator(device='cpu').manual_seed(seed)
-    latent_video_length = noise.shape[dim]
-    delta = context_length - context_overlap
-
-    for start_idx in range(0, latent_video_length - context_length, delta):
-        place_idx = start_idx + context_length
-
-        actual_delta = min(delta, latent_video_length - place_idx)
-        if actual_delta <= 0:
-            break
-
-        list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx
-
-        source_slice = [slice(None)] * noise.ndim
-        source_slice[dim] = list_idx
-        target_slice = [slice(None)] * noise.ndim
-        target_slice[dim] = slice(place_idx, place_idx + actual_delta)
-
-        noise[tuple(target_slice)] = noise[tuple(source_slice)]
-
-    return noise
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -310,13 +310,11 @@ class ControlLoraOps:
            self.bias = None

        def forward(self, input):
-            weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True)
+            weight, bias = comfy.ops.cast_bias_weight(self, input)
            if self.up is not None:
-                x = torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
+                return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
            else:
-                x = torch.nn.functional.linear(input, weight, bias)
-            comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream)
-            return x
+                return torch.nn.functional.linear(input, weight, bias)

    class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp):
        def __init__(
@ -352,13 +350,12 @@ class ControlLoraOps:


        def forward(self, input):
-            weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True)
+            weight, bias = comfy.ops.cast_bias_weight(self, input)
            if self.up is not None:
-                x = torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
+                return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
            else:
-                x = torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
-            comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream)
-            return x
+                return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
+

 class ControlLora(ControlNet):
    def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@ -527,8 +527,7 @@ class HookKeyframeGroup:
                        if self._current_keyframe.get_effective_guarantee_steps(max_sigma) > 0:
                            break
                    # if eval_c is outside the percent range, stop looking further
-                    else:
-                        break
+                    else: break
        # update steps current context is used
        self._current_used_steps += 1
        # update current timestep this was performed on
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -74,9 +74,6 @@ def get_ancestral_step(sigma_from, sigma_to, eta=1.):

 def default_noise_sampler(x, seed=None):
    if seed is not None:
-        if x.device == torch.device("cpu"):
-            seed += 1
-
        generator = torch.Generator(device=x.device)
        generator.manual_seed(seed)
    else:
@ -89,24 +86,24 @@ class BatchedBrownianTree:
    """A wrapper around torchsde.BrownianTree that enables batches of entropy."""

    def __init__(self, x, t0, t1, seed=None, **kwargs):
-        self.cpu_tree = kwargs.pop("cpu", True)
+        self.cpu_tree = True
+        if "cpu" in kwargs:
+            self.cpu_tree = kwargs.pop("cpu")
        t0, t1, self.sign = self.sort(t0, t1)
-        w0 = kwargs.pop('w0', None)
-        if w0 is None:
-            w0 = torch.zeros_like(x)
-        self.batched = False
+        w0 = kwargs.get('w0', torch.zeros_like(x))
        if seed is None:
-            seed = (torch.randint(0, 2 ** 63 - 1, ()).item(),)
-        elif isinstance(seed, (tuple, list)):
-            if len(seed) != x.shape[0]:
-                raise ValueError("Passing a list or tuple of seeds to BatchedBrownianTree requires a length matching the batch size.")
-            self.batched = True
+            seed = torch.randint(0, 2 ** 63 - 1, []).item()
+        self.batched = True
+        try:
+            assert len(seed) == x.shape[0]
            w0 = w0[0]
-        else:
-            seed = (seed,)
+        except TypeError:
+            seed = [seed]
+            self.batched = False
        if self.cpu_tree:
-            t0, w0, t1 = t0.detach().cpu(), w0.detach().cpu(), t1.detach().cpu()
-        self.trees = tuple(torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed)
+            self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed]
+        else:
+            self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]

    @staticmethod
    def sort(a, b):
@ -114,10 +111,11 @@ class BatchedBrownianTree:

    def __call__(self, t0, t1):
        t0, t1, sign = self.sort(t0, t1)
-        device, dtype = t0.device, t0.dtype
        if self.cpu_tree:
-            t0, t1 = t0.detach().cpu().float(), t1.detach().cpu().float()
-        w = torch.stack([tree(t0, t1) for tree in self.trees]).to(device=device, dtype=dtype) * (self.sign * sign)
+            w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * (self.sign * sign)
+        else:
+            w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
+
        return w if self.batched else w[0]


@ -1560,13 +1558,10 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None


@torch.no_grad()
-def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5, solver_type="phi_1"):
+def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
    """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2.
    arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023)
    """
-    if solver_type not in {"phi_1", "phi_2"}:
-        raise ValueError("solver_type must be 'phi_1' or 'phi_2'")
-
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@ -1606,14 +1601,8 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
        denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)

        # Step 2
-        if solver_type == "phi_1":
-            denoised_d = torch.lerp(denoised, denoised_2, fac)
-            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
-        elif solver_type == "phi_2":
-            b2 = ei_h_phi_2(-h_eta) / r
-            b1 = ei_h_phi_1(-h_eta) - b2
-            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b2 * denoised_2)
-
+        denoised_d = torch.lerp(denoised, denoised_2, fac)
+        x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
        if inject_noise:
            segment_factor = (r - 1) * h * eta
            sde_noise = sde_noise * segment_factor.exp()
@ -1621,17 +1610,6 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
            x = x + sde_noise * sigmas[i + 1] * s_noise
    return x

-@torch.no_grad()
-def sample_exp_heun_2_x0(model, x, sigmas, extra_args=None, callback=None, disable=None, solver_type="phi_2"):
-    """Deterministic exponential Heun second order method in data prediction (x0) and logSNR time."""
-    return sample_seeds_2(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None, r=1.0, solver_type=solver_type)
-
-
-@torch.no_grad()
-def sample_exp_heun_2_x0_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type="phi_2"):
-    """Stochastic exponential Heun second order method in data prediction (x0) and logSNR time."""
-    return sample_seeds_2(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=1.0, solver_type=solver_type)
-

@torch.no_grad()
 def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
@ -1779,7 +1757,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
        # Predictor
        if sigmas[i + 1] == 0:
            # Denoising step
-            x_pred = denoised
+            x = denoised
        else:
            tau_t = tau_func(sigmas[i + 1])
            curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1]
@ -1800,7 +1778,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
            if tau_t > 0 and s_noise > 0:
                noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise
                x_pred = x_pred + noise
-    return x_pred
+    return x


@torch.no_grad()
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -6,7 +6,6 @@ class LatentFormat:
    latent_dimensions = 2
    latent_rgb_factors = None
    latent_rgb_factors_bias = None
-    latent_rgb_factors_reshape = None
    taesd_decoder_name = None

    def process_in(self, latent):
@ -179,54 +178,6 @@ class Flux(SD3):
    def process_out(self, latent):
        return (latent / self.scale_factor) + self.shift_factor

-class Flux2(LatentFormat):
-    latent_channels = 128
-
-    def __init__(self):
-        self.latent_rgb_factors =[
-            [0.0058, 0.0113, 0.0073],
-            [0.0495, 0.0443, 0.0836],
-            [-0.0099, 0.0096, 0.0644],
-            [0.2144, 0.3009, 0.3652],
-            [0.0166, -0.0039, -0.0054],
-            [0.0157, 0.0103, -0.0160],
-            [-0.0398, 0.0902, -0.0235],
-            [-0.0052, 0.0095, 0.0109],
-            [-0.3527, -0.2712, -0.1666],
-            [-0.0301, -0.0356, -0.0180],
-            [-0.0107, 0.0078, 0.0013],
-            [0.0746, 0.0090, -0.0941],
-            [0.0156, 0.0169, 0.0070],
-            [-0.0034, -0.0040, -0.0114],
-            [0.0032, 0.0181, 0.0080],
-            [-0.0939, -0.0008, 0.0186],
-            [0.0018, 0.0043, 0.0104],
-            [0.0284, 0.0056, -0.0127],
-            [-0.0024, -0.0022, -0.0030],
-            [0.1207, -0.0026, 0.0065],
-            [0.0128, 0.0101, 0.0142],
-            [0.0137, -0.0072, -0.0007],
-            [0.0095, 0.0092, -0.0059],
-            [0.0000, -0.0077, -0.0049],
-            [-0.0465, -0.0204, -0.0312],
-            [0.0095, 0.0012, -0.0066],
-            [0.0290, -0.0034, 0.0025],
-            [0.0220, 0.0169, -0.0048],
-            [-0.0332, -0.0457, -0.0468],
-            [-0.0085, 0.0389, 0.0609],
-            [-0.0076, 0.0003, -0.0043],
-            [-0.0111, -0.0460, -0.0614],
-        ]
-
-        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
-        self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
-
-    def process_in(self, latent):
-        return latent
-
-    def process_out(self, latent):
-        return latent
-
 class Mochi(LatentFormat):
    latent_channels = 12
    latent_dimensions = 3
@ -407,11 +358,6 @@ class LTXV(LatentFormat):

        self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]

-class LTXAV(LTXV):
-    def __init__(self):
-        self.latent_rgb_factors = None
-        self.latent_rgb_factors_bias = None
-
 class HunyuanVideo(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
@ -436,7 +382,6 @@ class HunyuanVideo(LatentFormat):
    ]

    latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
-    taesd_decoder_name = "taehv"

 class Cosmos1CV8x8x8(LatentFormat):
    latent_channels = 16
@ -500,7 +445,7 @@ class Wan21(LatentFormat):
        ]).view(1, self.latent_channels, 1, 1, 1)


-        self.taesd_decoder_name = "lighttaew2_1"
+        self.taesd_decoder_name = None #TODO

    def process_in(self, latent):
        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
@ -571,7 +516,6 @@ class Wan22(Wan21):

    def __init__(self):
        self.scale_factor = 1.0
-        self.taesd_decoder_name = "lighttaew2_2"
        self.latents_mean = torch.tensor([
                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
@ -662,72 +606,6 @@ class HunyuanImage21(LatentFormat):

    latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206]

-class HunyuanImage21Refiner(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 3
-    scale_factor = 1.03682
-
-    def process_in(self, latent):
-        out = latent * self.scale_factor
-        out = torch.cat((out[:, :, :1], out), dim=2)
-        out = out.permute(0, 2, 1, 3, 4)
-        b, f_times_2, c, h, w = out.shape
-        out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
-        out = out.permute(0, 2, 1, 3, 4).contiguous()
-        return out
-
-    def process_out(self, latent):
-        z = latent / self.scale_factor
-        z = z.permute(0, 2, 1, 3, 4)
-        b, f, c, h, w = z.shape
-        z = z.reshape(b, f, 2, c // 2, h, w)
-        z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
-        z = z.permute(0, 2, 1, 3, 4)
-        z = z[:, :, 1:]
-        return z
-
-class HunyuanVideo15(LatentFormat):
-    latent_rgb_factors = [
-        [ 0.0568, -0.0521, -0.0131],
-        [ 0.0014,  0.0735,  0.0326],
-        [ 0.0186,  0.0531, -0.0138],
-        [-0.0031,  0.0051,  0.0288],
-        [ 0.0110,  0.0556,  0.0432],
-        [-0.0041, -0.0023, -0.0485],
-        [ 0.0530,  0.0413,  0.0253],
-        [ 0.0283,  0.0251,  0.0339],
-        [ 0.0277, -0.0372, -0.0093],
-        [ 0.0393,  0.0944,  0.1131],
-        [ 0.0020,  0.0251,  0.0037],
-        [-0.0017,  0.0012,  0.0234],
-        [ 0.0468,  0.0436,  0.0203],
-        [ 0.0354,  0.0439, -0.0233],
-        [ 0.0090,  0.0123,  0.0346],
-        [ 0.0382,  0.0029,  0.0217],
-        [ 0.0261, -0.0300,  0.0030],
-        [-0.0088, -0.0220, -0.0283],
-        [-0.0272, -0.0121, -0.0363],
-        [-0.0664, -0.0622,  0.0144],
-        [ 0.0414,  0.0479,  0.0529],
-        [ 0.0355,  0.0612, -0.0247],
-        [ 0.0147,  0.0264,  0.0174],
-        [ 0.0438,  0.0038,  0.0542],
-        [ 0.0431, -0.0573, -0.0033],
-        [-0.0162, -0.0211, -0.0406],
-        [-0.0487, -0.0295, -0.0393],
-        [ 0.0005, -0.0109,  0.0253],
-        [ 0.0296,  0.0591,  0.0353],
-        [ 0.0119,  0.0181, -0.0306],
-        [-0.0085, -0.0362,  0.0229],
-        [ 0.0005, -0.0106,  0.0242]
-    ]
-
-    latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644]
-    latent_channels = 32
-    latent_dimensions = 3
-    scale_factor = 1.03682
-    taesd_decoder_name = "lighttaehy1_5"
-
 class Hunyuan3Dv2(LatentFormat):
    latent_channels = 64
    latent_dimensions = 1
@ -746,20 +624,3 @@ class Hunyuan3Dv2mini(LatentFormat):
 class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2
-
-class ChromaRadiance(LatentFormat):
-    latent_channels = 3
-
-    def __init__(self):
-        self.latent_rgb_factors = [
-            # R    G    B
-            [ 1.0, 0.0, 0.0 ],
-            [ 0.0, 1.0, 0.0 ],
-            [ 0.0, 0.0, 1.0 ]
-        ]
-
-    def process_in(self, latent):
-        return latent
-
-    def process_out(self, latent):
-        return latent
--- a/comfy/ldm/ace/attention.py
+++ b/comfy/ldm/ace/attention.py
@ -133,7 +133,6 @@ class Attention(nn.Module):
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
-        transformer_options={},
        **cross_attention_kwargs,
    ) -> torch.Tensor:
        return self.processor(
@ -141,7 +140,6 @@ class Attention(nn.Module):
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
-            transformer_options=transformer_options,
            **cross_attention_kwargs,
        )

@ -368,7 +366,6 @@ class CustomerAttnProcessor2_0:
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        transformer_options={},
        *args,
        **kwargs,
    ) -> torch.Tensor:
@ -436,7 +433,7 @@ class CustomerAttnProcessor2_0:

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        hidden_states = optimized_attention(
-            query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True, transformer_options=transformer_options,
+            query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
        ).to(query.dtype)

        # linear proj
@ -700,7 +697,6 @@ class LinearTransformerBlock(nn.Module):
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        temb: torch.FloatTensor = None,
-        transformer_options={},
    ):

        N = hidden_states.shape[0]
@ -724,7 +720,6 @@ class LinearTransformerBlock(nn.Module):
                encoder_attention_mask=encoder_attention_mask,
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-                transformer_options=transformer_options,
            )
        else:
            attn_output, _ = self.attn(
@ -734,7 +729,6 @@ class LinearTransformerBlock(nn.Module):
                encoder_attention_mask=None,
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=None,
-                transformer_options=transformer_options,
            )

        if self.use_adaln_single:
@ -749,7 +743,6 @@ class LinearTransformerBlock(nn.Module):
                encoder_attention_mask=encoder_attention_mask,
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-                transformer_options=transformer_options,
            )
            hidden_states = attn_output + hidden_states

--- a/comfy/ldm/ace/model.py
+++ b/comfy/ldm/ace/model.py
@ -314,7 +314,6 @@ class ACEStepTransformer2DModel(nn.Module):
        output_length: int = 0,
        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-        transformer_options={},
    ):
        embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
        temb = self.t_block(embedded_timestep)
@ -340,7 +339,6 @@ class ACEStepTransformer2DModel(nn.Module):
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
                temb=temb,
-                transformer_options=transformer_options,
            )

        output = self.final_layer(hidden_states, embedded_timestep, output_length)
@ -395,7 +393,6 @@ class ACEStepTransformer2DModel(nn.Module):

        output_length = hidden_states.shape[-1]

-        transformer_options = kwargs.get("transformer_options", {})
        output = self.decode(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
@ -405,7 +402,6 @@ class ACEStepTransformer2DModel(nn.Module):
            output_length=output_length,
            block_controlnet_hidden_states=block_controlnet_hidden_states,
            controlnet_scale=controlnet_scale,
-            transformer_options=transformer_options,
        )

        return output
--- a/comfy/ldm/ace/vae/music_dcae_pipeline.py
+++ b/comfy/ldm/ace/vae/music_dcae_pipeline.py
@ -23,6 +23,8 @@ class MusicDCAE(torch.nn.Module):
        else:
            self.source_sample_rate = source_sample_rate

+        # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
+
        self.transform = transforms.Compose([
            transforms.Normalize(0.5, 0.5),
        ])
@ -35,6 +37,10 @@ class MusicDCAE(torch.nn.Module):
        self.scale_factor = 0.1786
        self.shift_factor = -1.9091

+    def load_audio(self, audio_path):
+        audio, sr = torchaudio.load(audio_path)
+        return audio, sr
+
    def forward_mel(self, audios):
        mels = []
        for i in range(len(audios)):
@ -67,8 +73,10 @@ class MusicDCAE(torch.nn.Module):
            latent = self.dcae.encoder(mel.unsqueeze(0))
            latents.append(latent)
        latents = torch.cat(latents, dim=0)
+        # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
        latents = (latents - self.shift_factor) * self.scale_factor
        return latents
+        # return latents, latent_lengths

    @torch.no_grad()
    def decode(self, latents, audio_lengths=None, sr=None):
@ -83,7 +91,9 @@ class MusicDCAE(torch.nn.Module):
            wav = self.vocoder.decode(mels[0]).squeeze(1)

            if sr is not None:
+                # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
                wav = torchaudio.functional.resample(wav, 44100, sr)
+                # wav = resampler(wav)
            else:
                sr = 44100
            pred_wavs.append(wav)
@ -91,6 +101,7 @@ class MusicDCAE(torch.nn.Module):
        if audio_lengths is not None:
            pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
        return torch.stack(pred_wavs)
+        # return sr, pred_wavs

    def forward(self, audios, audio_lengths=None, sr=None):
        latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@ -298,8 +298,7 @@ class Attention(nn.Module):
        mask = None,
        context_mask = None,
        rotary_pos_emb = None,
-        causal = None,
-        transformer_options={},
+        causal = None
    ):
        h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None

@ -364,7 +363,7 @@ class Attention(nn.Module):
            heads_per_kv_head = h // kv_h
            k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))

-        out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
+        out = optimized_attention(q, k, v, h, skip_reshape=True)
        out = self.to_out(out)

        if mask is not None:
@ -489,8 +488,7 @@ class TransformerBlock(nn.Module):
        global_cond=None,
        mask = None,
        context_mask = None,
-        rotary_pos_emb = None,
-        transformer_options={}
+        rotary_pos_emb = None
    ):
        if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:

@ -500,12 +498,12 @@ class TransformerBlock(nn.Module):
            residual = x
            x = self.pre_norm(x)
            x = x * (1 + scale_self) + shift_self
-            x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options)
+            x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb)
            x = x * torch.sigmoid(1 - gate_self)
            x = x + residual

            if context is not None:
-                x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options)
+                x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)

            if self.conformer is not None:
                x = x + self.conformer(x)
@ -519,10 +517,10 @@ class TransformerBlock(nn.Module):
            x = x + residual

        else:
-            x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options)
+            x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb)

            if context is not None:
-                x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options)
+                x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)

            if self.conformer is not None:
                x = x + self.conformer(x)
@ -608,8 +606,7 @@ class ContinuousTransformer(nn.Module):
        return_info = False,
        **kwargs
    ):
-        transformer_options = kwargs.get("transformer_options", {})
-        patches_replace = transformer_options.get("patches_replace", {})
+        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
        batch, seq, device = *x.shape[:2], x.device
        context = kwargs["context"]

@ -648,13 +645,13 @@ class ContinuousTransformer(nn.Module):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"], transformer_options=args["transformer_options"])
+                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
                    return out

-                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
                x = out["img"]
            else:
-                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
            # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

            if return_info:
--- a/comfy/ldm/aura/mmdit.py
+++ b/comfy/ldm/aura/mmdit.py
@ -85,7 +85,7 @@ class SingleAttention(nn.Module):
        )

    #@torch.compile()
-    def forward(self, c, transformer_options={}):
+    def forward(self, c):

        bsz, seqlen1, _ = c.shape

@ -95,7 +95,7 @@ class SingleAttention(nn.Module):
        v = v.view(bsz, seqlen1, self.n_heads, self.head_dim)
        q, k = self.q_norm1(q), self.k_norm1(k)

-        output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options)
+        output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True)
        c = self.w1o(output)
        return c

@ -144,7 +144,7 @@ class DoubleAttention(nn.Module):


    #@torch.compile()
-    def forward(self, c, x, transformer_options={}):
+    def forward(self, c, x):

        bsz, seqlen1, _ = c.shape
        bsz, seqlen2, _ = x.shape
@ -168,7 +168,7 @@ class DoubleAttention(nn.Module):
            torch.cat([cv, xv], dim=1),
        )

-        output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options)
+        output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True)

        c, x = output.split([seqlen1, seqlen2], dim=1)
        c = self.w1o(c)
@ -207,7 +207,7 @@ class MMDiTBlock(nn.Module):
        self.is_last = is_last

    #@torch.compile()
-    def forward(self, c, x, global_cond, transformer_options={}, **kwargs):
+    def forward(self, c, x, global_cond, **kwargs):

        cres, xres = c, x

@ -225,7 +225,7 @@ class MMDiTBlock(nn.Module):
        x = modulate(self.normX1(x), xshift_msa, xscale_msa)

        # attention
-        c, x = self.attn(c, x, transformer_options=transformer_options)
+        c, x = self.attn(c, x)


        c = self.normC2(cres + cgate_msa.unsqueeze(1) * c)
@ -255,13 +255,13 @@ class DiTBlock(nn.Module):
        self.mlp = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations)

    #@torch.compile()
-    def forward(self, cx, global_cond, transformer_options={}, **kwargs):
+    def forward(self, cx, global_cond, **kwargs):
        cxres = cx
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.modCX(
            global_cond
        ).chunk(6, dim=1)
        cx = modulate(self.norm1(cx), shift_msa, scale_msa)
-        cx = self.attn(cx, transformer_options=transformer_options)
+        cx = self.attn(cx)
        cx = self.norm2(cxres + gate_msa.unsqueeze(1) * cx)
        mlpout = self.mlp(modulate(cx, shift_mlp, scale_mlp))
        cx = gate_mlp.unsqueeze(1) * mlpout
@ -473,14 +473,13 @@ class MMDiT(nn.Module):
                        out = {}
                        out["txt"], out["img"] = layer(args["txt"],
                                                       args["img"],
-                                                       args["vec"],
-                                                       transformer_options=args["transformer_options"])
+                                                       args["vec"])
                        return out
-                    out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                    out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond}, {"original_block": block_wrap})
                    c = out["txt"]
                    x = out["img"]
                else:
-                    c, x = layer(c, x, global_cond, transformer_options=transformer_options, **kwargs)
+                    c, x = layer(c, x, global_cond, **kwargs)

        if len(self.single_layers) > 0:
            c_len = c.size(1)
@ -489,13 +488,13 @@ class MMDiT(nn.Module):
                if ("single_block", i) in blocks_replace:
                    def block_wrap(args):
                        out = {}
-                        out["img"] = layer(args["img"], args["vec"], transformer_options=args["transformer_options"])
+                        out["img"] = layer(args["img"], args["vec"])
                        return out

-                    out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                    out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond}, {"original_block": block_wrap})
                    cx = out["img"]
                else:
-                    cx = layer(cx, global_cond, transformer_options=transformer_options, **kwargs)
+                    cx = layer(cx, global_cond, **kwargs)

            x = cx[:, c_len:]

--- a/comfy/ldm/cascade/common.py
+++ b/comfy/ldm/cascade/common.py
@ -32,12 +32,12 @@ class OptimizedAttention(nn.Module):

        self.out_proj = operations.Linear(c, c, bias=True, dtype=dtype, device=device)

-    def forward(self, q, k, v, transformer_options={}):
+    def forward(self, q, k, v):
        q = self.to_q(q)
        k = self.to_k(k)
        v = self.to_v(v)

-        out = optimized_attention(q, k, v, self.heads, transformer_options=transformer_options)
+        out = optimized_attention(q, k, v, self.heads)

        return self.out_proj(out)

@ -47,13 +47,13 @@ class Attention2D(nn.Module):
        self.attn = OptimizedAttention(c, nhead, dtype=dtype, device=device, operations=operations)
        # self.attn = nn.MultiheadAttention(c, nhead, dropout=dropout, bias=True, batch_first=True, dtype=dtype, device=device)

-    def forward(self, x, kv, self_attn=False, transformer_options={}):
+    def forward(self, x, kv, self_attn=False):
        orig_shape = x.shape
        x = x.view(x.size(0), x.size(1), -1).permute(0, 2, 1)  # Bx4xHxW -> Bx(HxW)x4
        if self_attn:
            kv = torch.cat([x, kv], dim=1)
        # x = self.attn(x, kv, kv, need_weights=False)[0]
-        x = self.attn(x, kv, kv, transformer_options=transformer_options)
+        x = self.attn(x, kv, kv)
        x = x.permute(0, 2, 1).view(*orig_shape)
        return x

@ -114,9 +114,9 @@ class AttnBlock(nn.Module):
            operations.Linear(c_cond, c, dtype=dtype, device=device)
        )

-    def forward(self, x, kv, transformer_options={}):
+    def forward(self, x, kv):
        kv = self.kv_mapper(kv)
-        x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn, transformer_options=transformer_options)
+        x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
        return x


--- a/comfy/ldm/cascade/stage_b.py
+++ b/comfy/ldm/cascade/stage_b.py
@ -173,7 +173,7 @@ class StageB(nn.Module):
        clip = self.clip_norm(clip)
        return clip

-    def _down_encode(self, x, r_embed, clip, transformer_options={}):
+    def _down_encode(self, x, r_embed, clip):
        level_outputs = []
        block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
        for down_block, downscaler, repmap in block_group:
@ -187,7 +187,7 @@ class StageB(nn.Module):
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
-                        x = block(x, clip, transformer_options=transformer_options)
+                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
@ -199,7 +199,7 @@ class StageB(nn.Module):
            level_outputs.insert(0, x)
        return level_outputs

-    def _up_decode(self, level_outputs, r_embed, clip, transformer_options={}):
+    def _up_decode(self, level_outputs, r_embed, clip):
        x = level_outputs[0]
        block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
        for i, (up_block, upscaler, repmap) in enumerate(block_group):
@ -216,7 +216,7 @@ class StageB(nn.Module):
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
-                        x = block(x, clip, transformer_options=transformer_options)
+                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
@ -228,7 +228,7 @@ class StageB(nn.Module):
            x = upscaler(x)
        return x

-    def forward(self, x, r, effnet, clip, pixels=None, transformer_options={}, **kwargs):
+    def forward(self, x, r, effnet, clip, pixels=None, **kwargs):
        if pixels is None:
            pixels = x.new_zeros(x.size(0), 3, 8, 8)

@ -245,8 +245,8 @@ class StageB(nn.Module):
            nn.functional.interpolate(effnet, size=x.shape[-2:], mode='bilinear', align_corners=True))
        x = x + nn.functional.interpolate(self.pixels_mapper(pixels), size=x.shape[-2:], mode='bilinear',
                                          align_corners=True)
-        level_outputs = self._down_encode(x, r_embed, clip, transformer_options=transformer_options)
-        x = self._up_decode(level_outputs, r_embed, clip, transformer_options=transformer_options)
+        level_outputs = self._down_encode(x, r_embed, clip)
+        x = self._up_decode(level_outputs, r_embed, clip)
        return self.clf(x)

    def update_weights_ema(self, src_model, beta=0.999):
--- a/comfy/ldm/cascade/stage_c.py
+++ b/comfy/ldm/cascade/stage_c.py
@ -182,7 +182,7 @@ class StageC(nn.Module):
        clip = self.clip_norm(clip)
        return clip

-    def _down_encode(self, x, r_embed, clip, cnet=None, transformer_options={}):
+    def _down_encode(self, x, r_embed, clip, cnet=None):
        level_outputs = []
        block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
        for down_block, downscaler, repmap in block_group:
@ -201,7 +201,7 @@ class StageC(nn.Module):
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
-                        x = block(x, clip, transformer_options=transformer_options)
+                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
@ -213,7 +213,7 @@ class StageC(nn.Module):
            level_outputs.insert(0, x)
        return level_outputs

-    def _up_decode(self, level_outputs, r_embed, clip, cnet=None, transformer_options={}):
+    def _up_decode(self, level_outputs, r_embed, clip, cnet=None):
        x = level_outputs[0]
        block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
        for i, (up_block, upscaler, repmap) in enumerate(block_group):
@ -235,7 +235,7 @@ class StageC(nn.Module):
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
-                        x = block(x, clip, transformer_options=transformer_options)
+                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
@ -247,7 +247,7 @@ class StageC(nn.Module):
            x = upscaler(x)
        return x

-    def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, **kwargs):
        # Process the conditioning embeddings
        r_embed = self.gen_r_embedding(r).to(dtype=x.dtype)
        for c in self.t_conds:
@ -262,8 +262,8 @@ class StageC(nn.Module):

        # Model Blocks
        x = self.embedding(x)
-        level_outputs = self._down_encode(x, r_embed, clip, cnet, transformer_options=transformer_options)
-        x = self._up_decode(level_outputs, r_embed, clip, cnet, transformer_options=transformer_options)
+        level_outputs = self._down_encode(x, r_embed, clip, cnet)
+        x = self._up_decode(level_outputs, r_embed, clip, cnet)
        return self.clf(x)

    def update_weights_ema(self, src_model, beta=0.999):
--- a/comfy/ldm/chroma/layers.py
+++ b/comfy/ldm/chroma/layers.py
@ -1,15 +1,15 @@
 import torch
 from torch import Tensor, nn

+from comfy.ldm.flux.math import attention
 from comfy.ldm.flux.layers import (
    MLPEmbedder,
    RMSNorm,
+    QKNorm,
+    SelfAttention,
    ModulationOut,
 )

-# TODO: remove this in a few months
-SingleStreamBlock = None
-DoubleStreamBlock = None


 class ChromaModulationOut(ModulationOut):
@ -48,6 +48,124 @@ class Approximator(nn.Module):
        return x


+class DoubleStreamBlock(nn.Module):  # two-stream (img + txt) block: per-stream norm/qkv/MLP weights, one joint attention call over both
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
+        super().__init__()  # NOTE: `operations` must be provided despite its None default; it supplies LayerNorm/Linear below
+
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)  # hidden width shared by the img and txt MLPs
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)  # no learned affine; shift/scale come from `vec` in forward()
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)  # forward() uses only its .qkv, .norm and .proj members
+
+        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.img_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)  # txt stream mirrors the img stream with its own weights
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+
+        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.txt_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+        self.flipped_img_txt = flipped_img_txt  # stored only: forward() below never reads it and always places txt before img
+
+    def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None):  # returns (img, txt); NOTE: img and txt are updated in place
+        (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec  # NOTE(review): annotated Tensor, but used as nested pairs of objects exposing .shift/.scale/.gate
+
+        # image stream: modulate the normed input as shift + (1 + scale) * norm(img), then project to per-head q/k/v
+        img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
+        img_qkv = self.img_attn.qkv(img_modulated)
+        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)  # (B, L, 3, H, D) -> 3 x (B, H, L, D)
+        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)  # v is passed in, but only normalized q and k come back
+
+        # text stream: same modulation and q/k/v projection, using the txt-specific weights
+        txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
+        txt_qkv = self.txt_attn.qkv(txt_modulated)
+        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
+
+        # joint attention over both streams, concatenated along dim 2 with the txt tokens first
+        attn = attention(torch.cat((txt_q, img_q), dim=2),
+                         torch.cat((txt_k, img_k), dim=2),
+                         torch.cat((txt_v, img_v), dim=2),
+                         pe=pe, mask=attn_mask)
+
+        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]  # split the result back along dim 1 at the txt length
+
+        # img stream residuals (in place): gated attention projection, then gated MLP on the re-normed, modulated result
+        img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
+        img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
+
+        # txt stream residuals (in place): the same two gated updates as for img
+        txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
+        txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
+
+        if txt.dtype == torch.float16:  # only txt is sanitized here; img is returned as computed
+            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)  # 65504 is the largest finite float16 value
+
+        return img, txt
+
+
+class SingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in https://arxiv.org/abs/2302.05442.
+    Modulation is not computed here: forward() receives it as `vec` (an object with shift/scale/gate).
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qk_scale: float = None,
+        dtype=None,
+        device=None,
+        operations=None  # NOTE: must be provided despite the None default; it supplies Linear/LayerNorm below
+    ):
+        super().__init__()
+        self.hidden_dim = hidden_size
+        self.num_heads = num_heads
+        head_dim = hidden_size // num_heads
+        self.scale = qk_scale or head_dim**-0.5  # falls back to 1/sqrt(head_dim); not read by forward() in this class
+
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        # fused input projection: qkv (3 * hidden_size) and mlp_in (mlp_hidden_dim) from a single Linear
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
+        # fused output projection: attention output and activated MLP features, concatenated, back to hidden_size
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
+
+        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
+
+        self.hidden_size = hidden_size  # same value as self.hidden_dim above; forward() reads this name
+        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)  # no learned affine; shift/scale come from `vec`
+
+        self.mlp_act = nn.GELU(approximate="tanh")
+
+    def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:  # NOTE: x is updated in place before being returned
+        mod = vec  # NOTE(review): annotated Tensor, but used as an object exposing .shift/.scale/.gate
+        x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))  # shift + (1 + scale) * norm(x)
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)  # undo the fusion done by linear1
+
+        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)  # (B, L, 3, H, D) -> 3 x (B, H, L, D)
+        q, k = self.norm(q, k, v)  # v is passed in, but only normalized q and k come back
+
+        # compute attention
+        attn = attention(q, k, v, pe=pe, mask=attn_mask)
+        # compute activation in mlp stream, concatenate with the attention output along dim 2 and run the second linear layer
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        x.addcmul_(mod.gate, output)  # gated residual, in place: x += gate * output
+        if x.dtype == torch.float16:
+            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)  # 65504 is the largest finite float16 value
+        return x
+
+
 class LastLayer(nn.Module):
    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
        super().__init__()
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@ -11,12 +11,12 @@ import comfy.ldm.common_dit
 from comfy.ldm.flux.layers import (
    EmbedND,
    timestep_embedding,
-    DoubleStreamBlock,
-    SingleStreamBlock,
 )

 from .layers import (
+    DoubleStreamBlock,
    LastLayer,
+    SingleStreamBlock,
    Approximator,
    ChromaModulationOut,
 )
@ -40,8 +40,7 @@ class ChromaParams:
    out_dim: int
    hidden_dim: int
    n_layers: int
-    txt_ids_dims: list
-    vec_in_dim: int
+



@ -91,7 +90,6 @@ class Chroma(nn.Module):
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
-                    modulation=False,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(params.depth)
@ -100,7 +98,7 @@ class Chroma(nn.Module):

        self.single_blocks = nn.ModuleList(
            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
                for _ in range(params.depth_single_blocks)
            ]
        )
@ -153,6 +151,8 @@ class Chroma(nn.Module):
        attn_mask: Tensor = None,
    ) -> Tensor:
        patches_replace = transformer_options.get("patches_replace", {})
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        # running on sequences img
        img = self.img_in(img)
@ -180,10 +180,7 @@ class Chroma(nn.Module):
        pe = self.pe_embedder(ids)

        blocks_replace = patches_replace.get("dit", {})
-        transformer_options["total_blocks"] = len(self.double_blocks)
-        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.double_blocks):
-            transformer_options["block_index"] = i
            if i not in self.skip_mmdit:
                double_mod = (
                    self.get_modulations(mod_vectors, "double_img", idx=i),
@ -196,16 +193,14 @@ class Chroma(nn.Module):
                                                       txt=args["txt"],
                                                       vec=args["vec"],
                                                       pe=args["pe"],
-                                                       attn_mask=args.get("attn_mask"),
-                                                       transformer_options=args.get("transformer_options"))
+                                                       attn_mask=args.get("attn_mask"))
                        return out

                    out = blocks_replace[("double_block", i)]({"img": img,
                                                               "txt": txt,
                                                               "vec": double_mod,
                                                               "pe": pe,
-                                                               "attn_mask": attn_mask,
-                                                               "transformer_options": transformer_options},
+                                                               "attn_mask": attn_mask},
                                                              {"original_block": block_wrap})
                    txt = out["txt"]
                    img = out["img"]
@ -214,8 +209,7 @@ class Chroma(nn.Module):
                                     txt=txt,
                                     vec=double_mod,
                                     pe=pe,
-                                     attn_mask=attn_mask,
-                                     transformer_options=transformer_options)
+                                     attn_mask=attn_mask)

                if control is not None: # Controlnet
                    control_i = control.get("input")
@ -226,10 +220,7 @@ class Chroma(nn.Module):

        img = torch.cat((txt, img), 1)

-        transformer_options["total_blocks"] = len(self.single_blocks)
-        transformer_options["block_type"] = "single"
        for i, block in enumerate(self.single_blocks):
-            transformer_options["block_index"] = i
            if i not in self.skip_dit:
                single_mod = self.get_modulations(mod_vectors, "single", idx=i)
                if ("single_block", i) in blocks_replace:
@ -238,19 +229,17 @@ class Chroma(nn.Module):
                        out["img"] = block(args["img"],
                                           vec=args["vec"],
                                           pe=args["pe"],
-                                           attn_mask=args.get("attn_mask"),
-                                           transformer_options=args.get("transformer_options"))
+                                           attn_mask=args.get("attn_mask"))
                        return out

                    out = blocks_replace[("single_block", i)]({"img": img,
                                                               "vec": single_mod,
                                                               "pe": pe,
-                                                               "attn_mask": attn_mask,
-                                                               "transformer_options": transformer_options},
+                                                               "attn_mask": attn_mask},
                                                              {"original_block": block_wrap})
                    img = out["img"]
                else:
-                    img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
+                    img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask)

                if control is not None: # Controlnet
                    control_o = control.get("output")
@ -260,9 +249,8 @@ class Chroma(nn.Module):
                            img[:, txt.shape[1] :, ...] += add

        img = img[:, txt.shape[1] :, ...]
-        if hasattr(self, "final_layer"):
-            final_mod = self.get_modulations(mod_vectors, "final")
-            img = self.final_layer(img, vec=final_mod)  # (N, T, patch_size ** 2 * out_channels)
+        final_mod = self.get_modulations(mod_vectors, "final")
+        img = self.final_layer(img, vec=final_mod)  # (N, T, patch_size ** 2 * out_channels)
        return img

    def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
@ -278,9 +266,6 @@ class Chroma(nn.Module):

        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size)

-        if img.ndim != 3 or context.ndim != 3:
-            raise ValueError("Input img and txt tensors must have 3 dimensions.")
-
        h_len = ((h + (self.patch_size // 2)) // self.patch_size)
        w_len = ((w + (self.patch_size // 2)) // self.patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
--- a/comfy/ldm/chroma_radiance/layers.py
+++ b/comfy/ldm/chroma_radiance/layers.py
@ -1,206 +0,0 @@
-# Adapted from https://github.com/lodestone-rock/flow
-from functools import lru_cache
-
-import torch
-from torch import nn
-
-from comfy.ldm.flux.layers import RMSNorm
-
-
-class NerfEmbedder(nn.Module):
-    """
-    An embedder module that combines input features with a 2D positional
-    encoding that mimics the Discrete Cosine Transform (DCT).
-
-    This module takes an input tensor of shape (B, P^2, C), where P is the
-    patch size, and enriches it with positional information before projecting
-    it to a new hidden size.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        hidden_size_input: int,
-        max_freqs: int,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
-        """
-        Initializes the NerfEmbedder.
-
-        Args:
-            in_channels (int): The number of channels in the input tensor.
-            hidden_size_input (int): The desired dimension of the output embedding.
-            max_freqs (int): The number of frequency components to use for both
-                             the x and y dimensions of the positional encoding.
-                             The total number of positional features will be max_freqs^2.
-        """
-        super().__init__()
-        self.dtype = dtype
-        self.max_freqs = max_freqs
-        self.hidden_size_input = hidden_size_input
-
-        # A linear layer to project the concatenated input features and
-        # positional encodings to the final output dimension.
-        self.embedder = nn.Sequential(
-            operations.Linear(in_channels + max_freqs**2, hidden_size_input, dtype=dtype, device=device)
-        )
-
-    @lru_cache(maxsize=4)
-    def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
-        """
-        Generates and caches 2D DCT-like positional embeddings for a given patch size.
-
-        The LRU cache is a performance optimization that avoids recomputing the
-        same positional grid on every forward pass.
-
-        Args:
-            patch_size (int): The side length of the square input patch.
-            device: The torch device to create the tensors on.
-            dtype: The torch dtype for the tensors.
-
-        Returns:
-            A tensor of shape (1, patch_size^2, max_freqs^2) containing the
-            positional embeddings.
-        """
-        # Create normalized 1D coordinate grids from 0 to 1.
-        pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
-        pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
-
-        # Create a 2D meshgrid of coordinates.
-        pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij")
-
-        # Reshape positions to be broadcastable with frequencies.
-        # Shape becomes (patch_size^2, 1, 1).
-        pos_x = pos_x.reshape(-1, 1, 1)
-        pos_y = pos_y.reshape(-1, 1, 1)
-
-        # Create a 1D tensor of frequency values from 0 to max_freqs-1.
-        freqs = torch.linspace(0, self.max_freqs - 1, self.max_freqs, dtype=dtype, device=device)
-
-        # Reshape frequencies to be broadcastable for creating 2D basis functions.
-        # freqs_x shape: (1, max_freqs, 1)
-        # freqs_y shape: (1, 1, max_freqs)
-        freqs_x = freqs[None, :, None]
-        freqs_y = freqs[None, None, :]
-
-        # A custom weighting coefficient, not part of standard DCT.
-        # This seems to down-weight the contribution of higher-frequency interactions.
-        coeffs = (1 + freqs_x * freqs_y) ** -1
-
-        # Calculate the 1D cosine basis functions for x and y coordinates.
-        # This is the core of the DCT formulation.
-        dct_x = torch.cos(pos_x * freqs_x * torch.pi)
-        dct_y = torch.cos(pos_y * freqs_y * torch.pi)
-
-        # Combine the 1D basis functions to create 2D basis functions by element-wise
-        # multiplication, and apply the custom coefficients. Broadcasting handles the
-        # combination of all (pos_x, freqs_x) with all (pos_y, freqs_y).
-        # The result is flattened into a feature vector for each position.
-        dct = (dct_x * dct_y * coeffs).view(1, -1, self.max_freqs ** 2)
-
-        return dct
-
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the embedder.
-
-        Args:
-            inputs (Tensor): The input tensor of shape (B, P^2, C).
-
-        Returns:
-            Tensor: The output tensor of shape (B, P^2, hidden_size_input).
-        """
-        # Get the batch size, number of pixels, and number of channels.
-        B, P2, C = inputs.shape
-
-        # Infer the patch side length from the number of pixels (P^2).
-        patch_size = int(P2 ** 0.5)
-
-        input_dtype = inputs.dtype
-        inputs = inputs.to(dtype=self.dtype)
-
-        # Fetch the pre-computed or cached positional embeddings.
-        dct = self.fetch_pos(patch_size, inputs.device, self.dtype)
-
-        # Repeat the positional embeddings for each item in the batch.
-        dct = dct.repeat(B, 1, 1)
-
-        # Concatenate the original input features with the positional embeddings
-        # along the feature dimension.
-        inputs = torch.cat((inputs, dct), dim=-1)
-
-        # Project the combined tensor to the target hidden size.
-        return self.embedder(inputs).to(dtype=input_dtype)
-
-
-class NerfGLUBlock(nn.Module):
-    """
-    A NerfBlock using a Gated Linear Unit (GLU) like MLP.
-    """
-    def __init__(self, hidden_size_s: int, hidden_size_x: int, mlp_ratio, dtype=None, device=None, operations=None):
-        super().__init__()
-        # The total number of parameters for the MLP is increased to accommodate
-        # the gate, value, and output projection matrices.
-        # We now need to generate parameters for 3 matrices.
-        total_params = 3 * hidden_size_x**2 * mlp_ratio
-        self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device)
-        self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations)
-        self.mlp_ratio = mlp_ratio
-
-
-    def forward(self, x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
-        batch_size, num_x, hidden_size_x = x.shape
-        mlp_params = self.param_generator(s)
-
-        # Split the generated parameters into three parts for the gate, value, and output projection.
-        fc1_gate_params, fc1_value_params, fc2_params = mlp_params.chunk(3, dim=-1)
-
-        # Reshape the parameters into matrices for batch matrix multiplication.
-        fc1_gate = fc1_gate_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio)
-        fc1_value = fc1_value_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio)
-        fc2 = fc2_params.view(batch_size, hidden_size_x * self.mlp_ratio, hidden_size_x)
-
-        # Normalize the generated weight matrices as in the original implementation.
-        fc1_gate = torch.nn.functional.normalize(fc1_gate, dim=-2)
-        fc1_value = torch.nn.functional.normalize(fc1_value, dim=-2)
-        fc2 = torch.nn.functional.normalize(fc2, dim=-2)
-
-        res_x = x
-        x = self.norm(x)
-
-        # Apply the final output projection.
-        x = torch.bmm(torch.nn.functional.silu(torch.bmm(x, fc1_gate)) * torch.bmm(x, fc1_value), fc2)
-
-        return x + res_x
-
-
-class NerfFinalLayer(nn.Module):
-    def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
-        self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1.
-        # So we temporarily move the channel dimension to the end for the norm operation.
-        return self.linear(self.norm(x.movedim(1, -1))).movedim(-1, 1)
-
-
-class NerfFinalLayerConv(nn.Module):
-    def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
-        self.conv = operations.Conv2d(
-            in_channels=hidden_size,
-            out_channels=out_channels,
-            kernel_size=3,
-            padding=1,
-            dtype=dtype,
-            device=device,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1.
-        # So we temporarily move the channel dimension to the end for the norm operation.
-        return self.conv(self.norm(x.movedim(1, -1)).movedim(-1, 1))
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@ -1,335 +0,0 @@
-# Credits:
-# Original Flux code can be found on: https://github.com/black-forest-labs/flux
-# Chroma Radiance adaption referenced from https://github.com/lodestone-rock/flow
-
-from dataclasses import dataclass
-from typing import Optional
-
-import torch
-from torch import Tensor, nn
-from einops import repeat
-import comfy.ldm.common_dit
-
-from comfy.ldm.flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock
-
-from comfy.ldm.chroma.model import Chroma, ChromaParams
-from comfy.ldm.chroma.layers import (
-    Approximator,
-)
-from .layers import (
-    NerfEmbedder,
-    NerfGLUBlock,
-    NerfFinalLayer,
-    NerfFinalLayerConv,
-)
-
-
-@dataclass
-class ChromaRadianceParams(ChromaParams):
-    patch_size: int
-    nerf_hidden_size: int
-    nerf_mlp_ratio: int
-    nerf_depth: int
-    nerf_max_freqs: int
-    # Setting nerf_tile_size to 0 disables tiling.
-    nerf_tile_size: int
-    # Currently one of linear (legacy) or conv.
-    nerf_final_head_type: str
-    # None means use the same dtype as the model.
-    nerf_embedder_dtype: Optional[torch.dtype]
-    use_x0: bool
-
-class ChromaRadiance(Chroma):
-    """
-    Transformer model for flow matching on sequences.
-    """
-
-    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
-        if operations is None:
-            raise RuntimeError("Attempt to create ChromaRadiance object without setting operations")
-        nn.Module.__init__(self)
-        self.dtype = dtype
-        params = ChromaRadianceParams(**kwargs)
-        self.params = params
-        self.patch_size = params.patch_size
-        self.in_channels = params.in_channels
-        self.out_channels = params.out_channels
-        if params.hidden_size % params.num_heads != 0:
-            raise ValueError(
-                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-            )
-        pe_dim = params.hidden_size // params.num_heads
-        if sum(params.axes_dim) != pe_dim:
-            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
-        self.hidden_size = params.hidden_size
-        self.num_heads = params.num_heads
-        self.in_dim = params.in_dim
-        self.out_dim = params.out_dim
-        self.hidden_dim = params.hidden_dim
-        self.n_layers = params.n_layers
-        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in_patch = operations.Conv2d(
-            params.in_channels,
-            params.hidden_size,
-            kernel_size=params.patch_size,
-            stride=params.patch_size,
-            bias=True,
-            dtype=dtype,
-            device=device,
-        )
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
-        # set as nn identity for now, will overwrite it later.
-        self.distilled_guidance_layer = Approximator(
-                    in_dim=self.in_dim,
-                    hidden_dim=self.hidden_dim,
-                    out_dim=self.out_dim,
-                    n_layers=self.n_layers,
-                    dtype=dtype, device=device, operations=operations
-                )
-
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=params.mlp_ratio,
-                    qkv_bias=params.qkv_bias,
-                    modulation=False,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(params.depth)
-            ]
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=params.mlp_ratio,
-                    modulation=False,
-                    dtype=dtype, device=device, operations=operations,
-                )
-                for _ in range(params.depth_single_blocks)
-            ]
-        )
-
-        # pixel channel concat with DCT
-        self.nerf_image_embedder = NerfEmbedder(
-            in_channels=params.in_channels,
-            hidden_size_input=params.nerf_hidden_size,
-            max_freqs=params.nerf_max_freqs,
-            dtype=params.nerf_embedder_dtype or dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.nerf_blocks = nn.ModuleList([
-            NerfGLUBlock(
-                hidden_size_s=params.hidden_size,
-                hidden_size_x=params.nerf_hidden_size,
-                mlp_ratio=params.nerf_mlp_ratio,
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            ) for _ in range(params.nerf_depth)
-        ])
-
-        if params.nerf_final_head_type == "linear":
-            self.nerf_final_layer = NerfFinalLayer(
-                params.nerf_hidden_size,
-                out_channels=params.in_channels,
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            )
-        elif params.nerf_final_head_type == "conv":
-            self.nerf_final_layer_conv = NerfFinalLayerConv(
-                params.nerf_hidden_size,
-                out_channels=params.in_channels,
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            )
-        else:
-            errstr = f"Unsupported nerf_final_head_type {params.nerf_final_head_type}"
-            raise ValueError(errstr)
-
-        self.skip_mmdit = []
-        self.skip_dit = []
-        self.lite = False
-
-        if params.use_x0:
-            self.register_buffer("__x0__", torch.tensor([]))
-
-    @property
-    def _nerf_final_layer(self) -> nn.Module:
-        if self.params.nerf_final_head_type == "linear":
-            return self.nerf_final_layer
-        if self.params.nerf_final_head_type == "conv":
-            return self.nerf_final_layer_conv
-        # Impossible to get here as we raise an error on unexpected types on initialization.
-        raise NotImplementedError
-
-    def img_in(self, img: Tensor) -> Tensor:
-        img = self.img_in_patch(img) # -> [B, Hidden, H/P, W/P]
-        # flatten into a sequence for the transformer.
-        return img.flatten(2).transpose(1, 2) # -> [B, NumPatches, Hidden]
-
-    def forward_nerf(
-        self,
-        img_orig: Tensor,
-        img_out: Tensor,
-        params: ChromaRadianceParams,
-    ) -> Tensor:
-        B, C, H, W = img_orig.shape
-        num_patches = img_out.shape[1]
-        patch_size = params.patch_size
-
-        # Store the raw pixel values of each patch for the NeRF head later.
-        # unfold creates patches: [B, C * P * P, NumPatches]
-        nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
-        nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P]
-
-        # Reshape for per-patch processing
-        nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
-        nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
-
-        if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
-            # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
-            # the tile size.
-            img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
-        else:
-            # Get DCT-encoded pixel embeddings [pixel-dct]
-            img_dct = self.nerf_image_embedder(nerf_pixels)
-
-            # Pass through the dynamic MLP blocks (the NeRF)
-            for block in self.nerf_blocks:
-                img_dct = block(img_dct, nerf_hidden)
-
-        # Reassemble the patches into the final image.
-        img_dct = img_dct.transpose(1, 2) # -> [B*NumPatches, C, P*P]
-        # Reshape to combine with batch dimension for fold
-        img_dct = img_dct.reshape(B, num_patches, -1) # -> [B, NumPatches, C*P*P]
-        img_dct = img_dct.transpose(1, 2) # -> [B, C*P*P, NumPatches]
-        img_dct = nn.functional.fold(
-            img_dct,
-            output_size=(H, W),
-            kernel_size=patch_size,
-            stride=patch_size,
-        )
-        return self._nerf_final_layer(img_dct)
-
-    def forward_tiled_nerf(
-        self,
-        nerf_hidden: Tensor,
-        nerf_pixels: Tensor,
-        batch: int,
-        channels: int,
-        num_patches: int,
-        patch_size: int,
-        params: ChromaRadianceParams,
-    ) -> Tensor:
-        """
-        Processes the NeRF head in tiles to save memory.
-        nerf_hidden has shape [B, L, D]
-        nerf_pixels has shape [B, L, C * P * P]
-        """
-        tile_size = params.nerf_tile_size
-        output_tiles = []
-        # Iterate over the patches in tiles. The dimension L (num_patches) is at index 1.
-        for i in range(0, num_patches, tile_size):
-            end = min(i + tile_size, num_patches)
-
-            # Slice the current tile from the input tensors
-            nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
-            nerf_pixels_tile = nerf_pixels[i * batch:end * batch]
-
-            # get DCT-encoded pixel embeddings [pixel-dct]
-            img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
-
-            # pass through the dynamic MLP blocks (the NeRF)
-            for block in self.nerf_blocks:
-                img_dct_tile = block(img_dct_tile, nerf_hidden_tile)
-
-            output_tiles.append(img_dct_tile)
-
-        # Concatenate the processed tiles along the patch dimension
-        return torch.cat(output_tiles, dim=0)
-
-    def radiance_get_override_params(self, overrides: dict) -> ChromaRadianceParams:
-        params = self.params
-        if not overrides:
-            return params
-        params_dict = {k: getattr(params, k) for k in params.__dataclass_fields__}
-        nullable_keys = frozenset(("nerf_embedder_dtype",))
-        bad_keys = tuple(k for k in overrides if k not in params_dict)
-        if bad_keys:
-            e = f"Unknown key(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}"
-            raise ValueError(e)
-        bad_keys = tuple(
-            k
-            for k, v in overrides.items()
-            if not isinstance(v, type(getattr(params, k))) and (v is not None or k not in nullable_keys)
-        )
-        if bad_keys:
-            e = f"Invalid value(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}"
-            raise ValueError(e)
-        # At this point it's all valid keys and values so we can merge with the existing params.
-        params_dict |= overrides
-        return params.__class__(**params_dict)
-
-    def _apply_x0_residual(self, predicted, noisy, timesteps):
-
-        # non zero during training to prevent 0 div
-        eps = 0.0
-        return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps)
-
-    def _forward(
-        self,
-        x: Tensor,
-        timestep: Tensor,
-        context: Tensor,
-        guidance: Optional[Tensor],
-        control: Optional[dict]=None,
-        transformer_options: dict={},
-        **kwargs: dict,
-    ) -> Tensor:
-        bs, c, h, w = x.shape
-        img = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-
-        if img.ndim != 4:
-            raise ValueError("Input img tensor must be in [B, C, H, W] format.")
-        if context.ndim != 3:
-            raise ValueError("Input txt tensors must have 3 dimensions.")
-
-        params = self.radiance_get_override_params(transformer_options.get("chroma_radiance_options", {}))
-
-        h_len = (img.shape[-2] // self.patch_size)
-        w_len = (img.shape[-1] // self.patch_size)
-
-        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
-        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-
-        img_out = self.forward_orig(
-            img,
-            img_ids,
-            context,
-            txt_ids,
-            timestep,
-            guidance,
-            control,
-            transformer_options,
-            attn_mask=kwargs.get("attention_mask", None),
-        )
-
-        out = self.forward_nerf(img, img_out, params)[:, :, :h, :w]
-
-        # If x0 variant → v-pred, just return this instead
-        if hasattr(self, "__x0__"):
-            out = self._apply_x0_residual(out, img, timestep)
-        return out
-
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@ -176,7 +176,6 @@ class Attention(nn.Module):
        context=None,
        mask=None,
        rope_emb=None,
-        transformer_options={},
        **kwargs,
    ):
        """
@ -185,7 +184,7 @@ class Attention(nn.Module):
            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
        """
        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
-        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True, transformer_options=transformer_options)
+        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
        del q, k, v
        out = rearrange(out, " b n s c -> s b (n c)")
        return self.to_out(out)
@ -547,7 +546,6 @@ class VideoAttn(nn.Module):
        context: Optional[torch.Tensor] = None,
        crossattn_mask: Optional[torch.Tensor] = None,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
-        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
        """
        Forward pass for video attention.
@ -573,7 +571,6 @@ class VideoAttn(nn.Module):
            context_M_B_D,
            crossattn_mask,
            rope_emb=rope_emb_L_1_1_D,
-            transformer_options=transformer_options,
        )
        x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W)
        return x_T_H_W_B_D
@ -668,7 +665,6 @@ class DITBuildingBlock(nn.Module):
        crossattn_mask: Optional[torch.Tensor] = None,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
        """
        Forward pass for dynamically configured blocks with adaptive normalization.
@ -706,7 +702,6 @@ class DITBuildingBlock(nn.Module):
                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
                context=None,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-                transformer_options=transformer_options,
            )
        elif self.block_type in ["cross_attn", "ca"]:
            x = x + gate_1_1_1_B_D * self.block(
@ -714,7 +709,6 @@ class DITBuildingBlock(nn.Module):
                context=crossattn_emb,
                crossattn_mask=crossattn_mask,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-                transformer_options=transformer_options,
            )
        else:
            raise ValueError(f"Unknown block type: {self.block_type}")
@ -790,7 +784,6 @@ class GeneralDITTransformerBlock(nn.Module):
        crossattn_mask: Optional[torch.Tensor] = None,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
        for block in self.blocks:
            x = block(
@ -800,6 +793,5 @@ class GeneralDITTransformerBlock(nn.Module):
                crossattn_mask,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
                adaln_lora_B_3D=adaln_lora_B_3D,
-                transformer_options=transformer_options,
            )
        return x
--- a/comfy/ldm/cosmos/model.py
+++ b/comfy/ldm/cosmos/model.py
@ -520,7 +520,6 @@ class GeneralDIT(nn.Module):
                x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
            ), f"{x.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape} {original_shape}"

-        transformer_options = kwargs.get("transformer_options", {})
        for _, block in self.blocks.items():
            assert (
                self.blocks["block0"].x_format == block.x_format
@ -535,7 +534,6 @@ class GeneralDIT(nn.Module):
                crossattn_mask,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
                adaln_lora_B_3D=adaln_lora_B_3D,
-                transformer_options=transformer_options,
            )

        x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@ -44,7 +44,7 @@ class GPT2FeedForward(nn.Module):
        return x


-def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
+def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor) -> torch.Tensor:
    """Computes multi-head attention using PyTorch's native implementation.

    This function provides a PyTorch backend alternative to Transformer Engine's attention operation.
@ -71,7 +71,7 @@ def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H
    q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(in_q_shape[0], in_q_shape[-2], -1, in_q_shape[-1])
    k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
    v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
-    return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True, transformer_options=transformer_options)
+    return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True)


 class Attention(nn.Module):
@ -180,8 +180,8 @@ class Attention(nn.Module):

        return q, k, v

-    def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
-        result = self.attn_op(q, k, v, transformer_options=transformer_options)  # [B, S, H, D]
+    def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+        result = self.attn_op(q, k, v)  # [B, S, H, D]
        return self.output_dropout(self.output_proj(result))

    def forward(
@ -189,7 +189,6 @@ class Attention(nn.Module):
        x: torch.Tensor,
        context: Optional[torch.Tensor] = None,
        rope_emb: Optional[torch.Tensor] = None,
-        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
        """
        Args:
@ -197,7 +196,7 @@ class Attention(nn.Module):
            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
        """
        q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb)
-        return self.compute_attention(q, k, v, transformer_options=transformer_options)
+        return self.compute_attention(q, k, v)


 class Timesteps(nn.Module):
@ -460,7 +459,6 @@ class Block(nn.Module):
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
-        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
        if extra_per_block_pos_emb is not None:
            x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
@ -514,7 +512,6 @@ class Block(nn.Module):
                rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
                None,
                rope_emb=rope_emb_L_1_1_D,
-                transformer_options=transformer_options,
            ),
            "b (t h w) d -> b t h w d",
            t=T,
@ -528,7 +525,6 @@ class Block(nn.Module):
            layer_norm_cross_attn: Callable,
            _scale_cross_attn_B_T_1_1_D: torch.Tensor,
            _shift_cross_attn_B_T_1_1_D: torch.Tensor,
-            transformer_options: Optional[dict] = {},
        ) -> torch.Tensor:
            _normalized_x_B_T_H_W_D = _fn(
                _x_B_T_H_W_D, layer_norm_cross_attn, _scale_cross_attn_B_T_1_1_D, _shift_cross_attn_B_T_1_1_D
@ -538,7 +534,6 @@ class Block(nn.Module):
                    rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
                    crossattn_emb,
                    rope_emb=rope_emb_L_1_1_D,
-                    transformer_options=transformer_options,
                ),
                "b (t h w) d -> b t h w d",
                t=T,
@ -552,7 +547,6 @@ class Block(nn.Module):
            self.layer_norm_cross_attn,
            scale_cross_attn_B_T_1_1_D,
            shift_cross_attn_B_T_1_1_D,
-            transformer_options=transformer_options,
        )
        x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D

@ -871,7 +865,6 @@ class MiniTrainDIT(nn.Module):
            "rope_emb_L_1_1_D": rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0),
            "adaln_lora_B_T_3D": adaln_lora_B_T_3D,
            "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
-            "transformer_options": kwargs.get("transformer_options", {}),
        }
        for block in self.blocks:
            x_B_T_H_W_D = block(
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@ -48,44 +48,15 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
    return embedding

 class MLPEmbedder(nn.Module):
-    def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
+    def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
        super().__init__()
-        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
+        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
        self.silu = nn.SiLU()
-        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
+        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)

    def forward(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))

-class YakMLP(nn.Module):
-    def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
-        self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
-        self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=True, dtype=dtype, device=device)
-        self.act_fn = nn.SiLU()
-
-    def forward(self, x: Tensor) -> Tensor:
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None):
-    if yak_mlp:
-        return YakMLP(hidden_size, mlp_hidden_dim, dtype=dtype, device=device, operations=operations)
-    if mlp_silu_act:
-        return nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
-            SiLUActivation(),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
-        )
-    else:
-        return nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )

 class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, dtype=None, device=None, operations=None):
@ -109,14 +80,14 @@ class QKNorm(torch.nn.Module):


 class SelfAttention(nn.Module):
-    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-        self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)


@dataclass
@ -127,11 +98,11 @@ class ModulationOut:


 class Modulation(nn.Module):
-    def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
        super().__init__()
        self.is_double = double
        self.multiplier = 6 if double else 3
-        self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
+        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)

    def forward(self, vec: Tensor) -> tuple:
        if vec.ndim == 2:
@ -158,107 +129,77 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
        return tensor


-class SiLUActivation(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.gate_fn = nn.SiLU()
-
-    def forward(self, x: Tensor) -> Tensor:
-        x1, x2 = x.chunk(2, dim=-1)
-        return self.gate_fn(x1) * x2
-
-
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
-        self.modulation = modulation
-
-        if self.modulation:
-            self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
-
+        self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)

        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.img_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )

-        self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
-
-        if self.modulation:
-            self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
-
+        self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)

        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-
-        self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
-
+        self.txt_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
        self.flipped_img_txt = flipped_img_txt

-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
-        if self.modulation:
-            img_mod1, img_mod2 = self.img_mod(vec)
-            txt_mod1, txt_mod2 = self.txt_mod(vec)
-        else:
-            (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+        img_mod1, img_mod2 = self.img_mod(vec)
+        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
        img_qkv = self.img_attn.qkv(img_modulated)
-        del img_modulated
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        del img_qkv
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
        txt_qkv = self.txt_attn.qkv(txt_modulated)
-        del txt_modulated
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        del txt_qkv
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        if self.flipped_img_txt:
-            q = torch.cat((img_q, txt_q), dim=2)
-            del img_q, txt_q
-            k = torch.cat((img_k, txt_k), dim=2)
-            del img_k, txt_k
-            v = torch.cat((img_v, txt_v), dim=2)
-            del img_v, txt_v
            # run actual attention
-            attn = attention(q, k, v,
-                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
-            del q, k, v
+            attn = attention(torch.cat((img_q, txt_q), dim=2),
+                             torch.cat((img_k, txt_k), dim=2),
+                             torch.cat((img_v, txt_v), dim=2),
+                             pe=pe, mask=attn_mask)

            img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
        else:
-            q = torch.cat((txt_q, img_q), dim=2)
-            del txt_q, img_q
-            k = torch.cat((txt_k, img_k), dim=2)
-            del txt_k, img_k
-            v = torch.cat((txt_v, img_v), dim=2)
-            del txt_v, img_v
            # run actual attention
-            attn = attention(q, k, v,
-                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
-            del q, k, v
+            attn = attention(torch.cat((txt_q, img_q), dim=2),
+                             torch.cat((txt_k, img_k), dim=2),
+                             torch.cat((txt_v, img_v), dim=2),
+                             pe=pe, mask=attn_mask)

            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]

        # calculate the img bloks
-        img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
-        del img_attn
-        img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
+        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)

        # calculate the txt bloks
        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
-        del txt_attn
        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)

        if txt.dtype == torch.float16:
@ -279,10 +220,6 @@ class SingleStreamBlock(nn.Module):
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float = None,
-        modulation=True,
-        mlp_silu_act=False,
-        bias=True,
-        yak_mlp=False,
        dtype=None,
        device=None,
        operations=None
@ -294,55 +231,30 @@ class SingleStreamBlock(nn.Module):
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-
-        self.mlp_hidden_dim_first = self.mlp_hidden_dim
-        self.yak_mlp = yak_mlp
-        if mlp_silu_act:
-            self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
-            self.mlp_act = SiLUActivation()
-        else:
-            self.mlp_act = nn.GELU(approximate="tanh")
-
-        if self.yak_mlp:
-            self.mlp_hidden_dim_first *= 2
-            self.mlp_act = nn.SiLU()
-
        # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
        # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)

        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)

        self.hidden_size = hidden_size
        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)

-        if modulation:
-            self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
-        else:
-            self.modulation = None
+        self.mlp_act = nn.GELU(approximate="tanh")
+        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)

-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor:
-        if self.modulation:
-            mod, _ = self.modulation(vec)
-        else:
-            mod = vec
-
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+        mod, _ = self.modulation(vec)
+        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        del qkv
        q, k = self.norm(q, k, v)

        # compute attention
-        attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
-        del q, k, v
+        attn = attention(q, k, v, pe=pe, mask=attn_mask)
        # compute activation in mlp stream, cat again and run second linear layer
-        if self.yak_mlp:
-            mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
-        else:
-            mlp = self.mlp_act(mlp)
-        output = self.linear2(torch.cat((attn, mlp), 2))
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        x += apply_mod(output, mod.gate, None, modulation_dims)
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
@ -350,11 +262,11 @@ class SingleStreamBlock(nn.Module):


 class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
        super().__init__()
        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
-        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))

    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
        if vec.ndim == 2:
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@ -4,16 +4,23 @@ from torch import Tensor

 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
-import logging


-def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
+    q_shape = q.shape
+    k_shape = k.shape
+
    if pe is not None:
-        q, k = apply_rope(q, k, pe)
+        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
+        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
+        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
+        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
+
    heads = q.shape[1]
-    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
+    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
    return x

+
 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
@ -29,19 +36,10 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    return out.to(dtype=torch.float32, device=pos.device)


-try:
-    import comfy.quant_ops
-    apply_rope = comfy.quant_ops.ck.apply_rope
-    apply_rope1 = comfy.quant_ops.ck.apply_rope1
-except:
-    logging.warning("No comfy kitchen, using old apply_rope functions.")
-    def apply_rope1(x: Tensor, freqs_cis: Tensor):
-        x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
+    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)

-        x_out = freqs_cis[..., 0] * x_[..., 0]
-        x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
-
-        return x_out.reshape(*x.shape).type_as(x)
-
-    def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-        return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@ -15,8 +15,6 @@ from .layers import (
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
-    Modulation,
-    RMSNorm
 )

@dataclass
@ -35,14 +33,6 @@ class FluxParams:
    patch_size: int
    qkv_bias: bool
    guidance_embed: bool
-    txt_ids_dims: list
-    global_modulation: bool = False
-    mlp_silu_act: bool = False
-    ops_bias: bool = True
-    default_ref_method: str = "offset"
-    ref_index_scale: float = 1.0
-    yak_mlp: bool = False
-    txt_norm: bool = False


 class Flux(nn.Module):
@ -68,22 +58,13 @@ class Flux(nn.Module):
        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads
        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
-        if params.vec_in_dim is not None:
-            self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
-        else:
-            self.vector_in = None
-
+        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
        )
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
-
-        if params.txt_norm:
-            self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
-        else:
-            self.txt_norm = None
+        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)

        self.double_blocks = nn.ModuleList(
            [
@ -92,10 +73,6 @@ class Flux(nn.Module):
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
-                    modulation=params.global_modulation is False,
-                    mlp_silu_act=params.mlp_silu_act,
-                    proj_bias=params.ops_bias,
-                    yak_mlp=params.yak_mlp,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(params.depth)
@ -104,30 +81,13 @@ class Flux(nn.Module):

        self.single_blocks = nn.ModuleList(
            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
                for _ in range(params.depth_single_blocks)
            ]
        )

        if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
-
-        if params.global_modulation:
-            self.double_stream_modulation_img = Modulation(
-                self.hidden_size,
-                double=True,
-                bias=False,
-                dtype=dtype, device=device, operations=operations
-            )
-            self.double_stream_modulation_txt = Modulation(
-                self.hidden_size,
-                double=True,
-                bias=False,
-                dtype=dtype, device=device, operations=operations
-            )
-            self.single_stream_modulation = Modulation(
-                self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
-            )
+            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)

    def forward_orig(
        self,
@ -143,6 +103,9 @@ class Flux(nn.Module):
        attn_mask: Tensor = None,
    ) -> Tensor:

+        if y is None:
+            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
+
        patches = transformer_options.get("patches", {})
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
@ -155,19 +118,9 @@ class Flux(nn.Module):
            if guidance is not None:
                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

-        if self.vector_in is not None:
-            if y is None:
-                y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
-            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
-
-        if self.txt_norm is not None:
-            txt = self.txt_norm(txt)
+        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
        txt = self.txt_in(txt)

-        vec_orig = vec
-        if self.params.global_modulation:
-            vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
-
        if "post_input" in patches:
            for p in patches["post_input"]:
                out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
@ -183,10 +136,7 @@ class Flux(nn.Module):
            pe = None

        blocks_replace = patches_replace.get("dit", {})
-        transformer_options["total_blocks"] = len(self.double_blocks)
-        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.double_blocks):
-            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
@ -194,16 +144,14 @@ class Flux(nn.Module):
                                                   txt=args["txt"],
                                                   vec=args["vec"],
                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"),
-                                                   transformer_options=args.get("transformer_options"))
+                                                   attn_mask=args.get("attn_mask"))
                    return out

                out = blocks_replace[("double_block", i)]({"img": img,
                                                           "txt": txt,
                                                           "vec": vec,
                                                           "pe": pe,
-                                                           "attn_mask": attn_mask,
-                                                           "transformer_options": transformer_options},
+                                                           "attn_mask": attn_mask},
                                                          {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
@ -212,8 +160,7 @@ class Flux(nn.Module):
                                 txt=txt,
                                 vec=vec,
                                 pe=pe,
-                                 attn_mask=attn_mask,
-                                 transformer_options=transformer_options)
+                                 attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_i = control.get("input")
@ -227,32 +174,24 @@ class Flux(nn.Module):

        img = torch.cat((txt, img), 1)

-        if self.params.global_modulation:
-            vec, _ = self.single_stream_modulation(vec_orig)
-
-        transformer_options["total_blocks"] = len(self.single_blocks)
-        transformer_options["block_type"] = "single"
        for i, block in enumerate(self.single_blocks):
-            transformer_options["block_index"] = i
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
                    out["img"] = block(args["img"],
                                       vec=args["vec"],
                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"),
-                                       transformer_options=args.get("transformer_options"))
+                                       attn_mask=args.get("attn_mask"))
                    return out

                out = blocks_replace[("single_block", i)]({"img": img,
                                                           "vec": vec,
                                                           "pe": pe,
-                                                           "attn_mask": attn_mask,
-                                                           "transformer_options": transformer_options},
+                                                           "attn_mask": attn_mask},
                                                          {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_o = control.get("output")
@ -263,10 +202,10 @@ class Flux(nn.Module):

        img = img[:, txt.shape[1] :, ...]

-        img = self.final_layer(img, vec_orig)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

-    def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}):
+    def process_img(self, x, index=0, h_offset=0, w_offset=0):
        bs, c, h, w = x.shape
        patch_size = self.patch_size
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
@ -278,22 +217,10 @@ class Flux(nn.Module):
        h_offset = ((h_offset + (patch_size // 2)) // patch_size)
        w_offset = ((w_offset + (patch_size // 2)) // patch_size)

-        steps_h = h_len
-        steps_w = w_len
-
-        rope_options = transformer_options.get("rope_options", None)
-        if rope_options is not None:
-            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
-            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
-
-            index += rope_options.get("shift_t", 0.0)
-            h_offset += rope_options.get("shift_y", 0.0)
-            w_offset += rope_options.get("shift_x", 0.0)
-
-        img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
+        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)

    def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
@ -309,16 +236,16 @@ class Flux(nn.Module):

        h_len = ((h_orig + (patch_size // 2)) // patch_size)
        w_len = ((w_orig + (patch_size // 2)) // patch_size)
-        img, img_ids = self.process_img(x, transformer_options=transformer_options)
+        img, img_ids = self.process_img(x)
        img_tokens = img.shape[1]
        if ref_latents is not None:
            h = 0
            w = 0
            index = 0
-            ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
+            ref_latents_method = kwargs.get("ref_latents_method", "offset")
            for ref in ref_latents:
                if ref_latents_method == "index":
-                    index += self.params.ref_index_scale
+                    index += 1
                    h_offset = 0
                    w_offset = 0
                elif ref_latents_method == "uxo":
@ -342,12 +269,7 @@ class Flux(nn.Module):
                img = torch.cat([img, kontext], dim=1)
                img_ids = torch.cat([img_ids, kontext_ids], dim=1)

-        txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
-
-        if len(self.params.txt_ids_dims) > 0:
-            for i in self.params.txt_ids_dims:
-                txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
-
+        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
        out = out[:, :img_tokens]
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@ -109,7 +109,6 @@ class AsymmetricAttention(nn.Module):
        scale_x: torch.Tensor,  # (B, dim_x), modulation for pre-RMSNorm.
        scale_y: torch.Tensor,  # (B, dim_y), modulation for pre-RMSNorm.
        crop_y,
-        transformer_options={},
        **rope_rotation,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        rope_cos = rope_rotation.get("rope_cos")
@ -144,7 +143,7 @@ class AsymmetricAttention(nn.Module):

        xy = optimized_attention(q,
                                 k,
-                                 v, self.num_heads, skip_reshape=True, transformer_options=transformer_options)
+                                 v, self.num_heads, skip_reshape=True)

        x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1)
        x = self.proj_x(x)
@ -225,7 +224,6 @@ class AsymmetricJointBlock(nn.Module):
        x: torch.Tensor,
        c: torch.Tensor,
        y: torch.Tensor,
-        transformer_options={},
        **attn_kwargs,
    ):
        """Forward pass of a block.
@ -258,7 +256,6 @@ class AsymmetricJointBlock(nn.Module):
            y,
            scale_x=scale_msa_x,
            scale_y=scale_msa_y,
-            transformer_options=transformer_options,
            **attn_kwargs,
        )

@ -527,11 +524,10 @@ class AsymmDiTJoint(nn.Module):
                                                    args["txt"],
                                                    rope_cos=args["rope_cos"],
                                                    rope_sin=args["rope_sin"],
-                                                    crop_y=args["num_tokens"],
-                                                    transformer_options=args["transformer_options"]
+                                                    crop_y=args["num_tokens"]
                                                    )
                    return out
-                out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens}, {"original_block": block_wrap})
                y_feat = out["txt"]
                x = out["img"]
            else:
@ -542,7 +538,6 @@ class AsymmDiTJoint(nn.Module):
                    rope_cos=rope_cos,
                    rope_sin=rope_sin,
                    crop_y=num_tokens,
-                    transformer_options=transformer_options,
                )  # (B, M, D), (B, L, D)
        del y_feat  # Final layers don't use dense text features.

--- a/comfy/ldm/hidream/model.py
+++ b/comfy/ldm/hidream/model.py
@ -72,8 +72,8 @@ class TimestepEmbed(nn.Module):
        return t_emb


-def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, transformer_options={}):
-    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2], transformer_options=transformer_options)
+def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
+    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])


 class HiDreamAttnProcessor_flashattn:
@ -86,7 +86,6 @@ class HiDreamAttnProcessor_flashattn:
        image_tokens_masks: Optional[torch.FloatTensor] = None,
        text_tokens: Optional[torch.FloatTensor] = None,
        rope: torch.FloatTensor = None,
-        transformer_options={},
        *args,
        **kwargs,
    ) -> torch.FloatTensor:
@ -134,7 +133,7 @@ class HiDreamAttnProcessor_flashattn:
            query = torch.cat([query_1, query_2], dim=-1)
            key = torch.cat([key_1, key_2], dim=-1)

-        hidden_states = attention(query, key, value, transformer_options=transformer_options)
+        hidden_states = attention(query, key, value)

        if not attn.single:
            hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
@ -200,7 +199,6 @@ class HiDreamAttention(nn.Module):
        image_tokens_masks: torch.FloatTensor = None,
        norm_text_tokens: torch.FloatTensor = None,
        rope: torch.FloatTensor = None,
-        transformer_options={},
    ) -> torch.Tensor:
        return self.processor(
            self,
@ -208,7 +206,6 @@ class HiDreamAttention(nn.Module):
            image_tokens_masks = image_tokens_masks,
            text_tokens = norm_text_tokens,
            rope = rope,
-            transformer_options=transformer_options,
        )


@ -409,7 +406,7 @@ class HiDreamImageSingleTransformerBlock(nn.Module):
        text_tokens: Optional[torch.FloatTensor] = None,
        adaln_input: Optional[torch.FloatTensor] = None,
        rope: torch.FloatTensor = None,
-        transformer_options={},
+
    ) -> torch.FloatTensor:
        wtype = image_tokens.dtype
        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
@ -422,7 +419,6 @@ class HiDreamImageSingleTransformerBlock(nn.Module):
            norm_image_tokens,
            image_tokens_masks,
            rope = rope,
-            transformer_options=transformer_options,
        )
        image_tokens = gate_msa_i * attn_output_i + image_tokens

@ -487,7 +483,6 @@ class HiDreamImageTransformerBlock(nn.Module):
        text_tokens: Optional[torch.FloatTensor] = None,
        adaln_input: Optional[torch.FloatTensor] = None,
        rope: torch.FloatTensor = None,
-        transformer_options={},
    ) -> torch.FloatTensor:
        wtype = image_tokens.dtype
        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
@ -505,7 +500,6 @@ class HiDreamImageTransformerBlock(nn.Module):
            image_tokens_masks,
            norm_text_tokens,
            rope = rope,
-            transformer_options=transformer_options,
        )

        image_tokens = gate_msa_i * attn_output_i + image_tokens
@ -556,7 +550,6 @@ class HiDreamImageBlock(nn.Module):
        text_tokens: Optional[torch.FloatTensor] = None,
        adaln_input: torch.FloatTensor = None,
        rope: torch.FloatTensor = None,
-        transformer_options={},
    ) -> torch.FloatTensor:
        return self.block(
            image_tokens,
@ -564,7 +557,6 @@ class HiDreamImageBlock(nn.Module):
            text_tokens,
            adaln_input,
            rope,
-            transformer_options=transformer_options,
        )


@ -794,7 +786,6 @@ class HiDreamImageTransformer2DModel(nn.Module):
                text_tokens = cur_encoder_hidden_states,
                adaln_input = adaln_input,
                rope = rope,
-                transformer_options=transformer_options,
            )
            initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
            block_id += 1
@ -818,7 +809,6 @@ class HiDreamImageTransformer2DModel(nn.Module):
                text_tokens=None,
                adaln_input=adaln_input,
                rope=rope,
-                transformer_options=transformer_options,
            )
            hidden_states = hidden_states[:, :hidden_states_seq_len]
            block_id += 1
--- a/comfy/ldm/hunyuan3d/model.py
+++ b/comfy/ldm/hunyuan3d/model.py
@ -99,16 +99,14 @@ class Hunyuan3Dv2(nn.Module):
                                                   txt=args["txt"],
                                                   vec=args["vec"],
                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"),
-                                                   transformer_options=args["transformer_options"])
+                                                   attn_mask=args.get("attn_mask"))
                    return out

                out = blocks_replace[("double_block", i)]({"img": img,
                                                           "txt": txt,
                                                           "vec": vec,
                                                           "pe": pe,
-                                                           "attn_mask": attn_mask,
-                                                           "transformer_options": transformer_options},
+                                                           "attn_mask": attn_mask},
                                                          {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
@ -117,8 +115,7 @@ class Hunyuan3Dv2(nn.Module):
                                 txt=txt,
                                 vec=vec,
                                 pe=pe,
-                                 attn_mask=attn_mask,
-                                 transformer_options=transformer_options)
+                                 attn_mask=attn_mask)

        img = torch.cat((txt, img), 1)

@ -129,19 +126,17 @@ class Hunyuan3Dv2(nn.Module):
                    out["img"] = block(args["img"],
                                       vec=args["vec"],
                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"),
-                                       transformer_options=args["transformer_options"])
+                                       attn_mask=args.get("attn_mask"))
                    return out

                out = blocks_replace[("single_block", i)]({"img": img,
                                                           "vec": vec,
                                                           "pe": pe,
-                                                           "attn_mask": attn_mask,
-                                                           "transformer_options": transformer_options},
+                                                           "attn_mask": attn_mask},
                                                          {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)

        img = img[:, txt.shape[1]:, ...]
        img = self.final_layer(img, vec)
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@ -6,6 +6,7 @@ import comfy.ldm.flux.layers
 import comfy.ldm.modules.diffusionmodules.mmdit
 from comfy.ldm.modules.attention import optimized_attention

+
 from dataclasses import dataclass
 from einops import repeat

@ -41,9 +42,6 @@ class HunyuanVideoParams:
    guidance_embed: bool
    byt5: bool
    meanflow: bool
-    use_cond_type_embedding: bool
-    vision_in_dim: int
-    meanflow_sum: bool


 class SelfAttentionRef(nn.Module):
@ -82,13 +80,13 @@ class TokenRefinerBlock(nn.Module):
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )

-    def forward(self, x, c, mask, transformer_options={}):
+    def forward(self, x, c, mask):
        mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1)

        norm_x = self.norm1(x)
        qkv = self.self_attn.qkv(norm_x)
        q, k, v = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, self.heads, -1).permute(2, 0, 3, 1, 4)
-        attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True, transformer_options=transformer_options)
+        attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True)

        x = x + self.self_attn.proj(attn) * mod1.unsqueeze(1)
        x = x + self.mlp(self.norm2(x)) * mod2.unsqueeze(1)
@ -119,14 +117,14 @@ class IndividualTokenRefiner(nn.Module):
            ]
        )

-    def forward(self, x, c, mask, transformer_options={}):
+    def forward(self, x, c, mask):
        m = None
        if mask is not None:
            m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1)
            m = m + m.transpose(2, 3)

        for block in self.blocks:
-            x = block(x, c, m, transformer_options=transformer_options)
+            x = block(x, c, m)
        return x


@ -154,19 +152,15 @@ class TokenRefiner(nn.Module):
        x,
        timesteps,
        mask,
-        transformer_options={},
    ):
        t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
        # m = mask.float().unsqueeze(-1)
        # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
-        if x.dtype == torch.float16:
-            c = x.float().sum(dim=1) / x.shape[1]
-        else:
-            c = x.sum(dim=1) / x.shape[1]
+        c = x.sum(dim=1) / x.shape[1]

        c = t + self.c_embedder(c.to(x.dtype))
        x = self.input_embedder(x)
-        x = self.individual_token_refiner(x, c, mask, transformer_options=transformer_options)
+        x = self.individual_token_refiner(x, c, mask)
        return x


@ -201,15 +195,11 @@ class HunyuanVideo(nn.Module):
    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-
        params = HunyuanVideoParams(**kwargs)
        self.params = params
        self.patch_size = params.patch_size
        self.in_channels = params.in_channels
        self.out_channels = params.out_channels
-        self.use_cond_type_embedding = params.use_cond_type_embedding
-        self.vision_in_dim = params.vision_in_dim
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@ -275,18 +265,6 @@ class HunyuanVideo(nn.Module):
        if final_layer:
            self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)

-        # HunyuanVideo 1.5 specific modules
-        if self.vision_in_dim is not None:
-            from comfy.ldm.wan.model import MLPProj
-            self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
-        else:
-            self.vision_in = None
-        if self.use_cond_type_embedding:
-            # 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
-            self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
-        else:
-            self.cond_type_embedding = None
-
    def forward_orig(
        self,
        img: Tensor,
@ -297,11 +275,9 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor = None,
        txt_byt5=None,
-        clip_fea=None,
        guidance: Tensor = None,
        guiding_frame_index=None,
        ref_latent=None,
-        disable_time_r=False,
        control=None,
        transformer_options={},
    ) -> Tensor:
@ -312,13 +288,13 @@ class HunyuanVideo(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))

-        if (self.time_r_in is not None) and (not disable_time_r):
+        if self.time_r_in is not None:
            w = torch.where(transformer_options['sigmas'][0] == transformer_options['sample_sigmas'])[0]  # This most likely could be improved
            if len(w) > 0:
                timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
                timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype)
                vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype))
-                vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2
+                vec = (vec + vec_r) / 2

        if ref_latent is not None:
            ref_latent_ids = self.img_ids(ref_latent)
@ -351,33 +327,14 @@ class HunyuanVideo(nn.Module):
        if txt_mask is not None and not torch.is_floating_point(txt_mask):
            txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max

-        txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
-
-        if self.cond_type_embedding is not None:
-            self.cond_type_embedding.to(txt.device)
-            cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
-            txt = txt + cond_emb.to(txt.dtype)
+        txt = self.txt_in(txt, timesteps, txt_mask)

        if self.byt5_in is not None and txt_byt5 is not None:
            txt_byt5 = self.byt5_in(txt_byt5)
-            if self.cond_type_embedding is not None:
-                cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
-                txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
-                txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
-            else:
-                txt = torch.cat((txt, txt_byt5), dim=1)
            txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
+            txt = torch.cat((txt, txt_byt5), dim=1)
            txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)

-        if clip_fea is not None:
-            txt_vision_states = self.vision_in(clip_fea)
-            if self.cond_type_embedding is not None:
-                cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
-                txt_vision_states = txt_vision_states + cond_emb
-            txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
-            extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
-            txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
-
        ids = torch.cat((img_ids, txt_ids), dim=1)
        pe = self.pe_embedder(ids)

@ -390,21 +347,18 @@ class HunyuanVideo(nn.Module):
            attn_mask = None

        blocks_replace = patches_replace.get("dit", {})
-        transformer_options["total_blocks"] = len(self.double_blocks)
-        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.double_blocks):
-            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"], transformer_options=args["transformer_options"])
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
                    return out

-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt, 'transformer_options': transformer_options}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt, transformer_options=transformer_options)
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)

            if control is not None: # Controlnet
                control_i = control.get("input")
@ -415,20 +369,17 @@ class HunyuanVideo(nn.Module):

        img = torch.cat((img, txt), 1)

-        transformer_options["total_blocks"] = len(self.single_blocks)
-        transformer_options["block_type"] = "single"
        for i, block in enumerate(self.single_blocks):
-            transformer_options["block_index"] = i
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"], transformer_options=args["transformer_options"])
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
                    return out

-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims, 'transformer_options': transformer_options}, {"original_block": block_wrap})
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims, transformer_options=transformer_options)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)

            if control is not None: # Controlnet
                control_o = control.get("output")
@ -477,14 +428,14 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        return repeat(img_ids, "h w c -> b (h w) c", b=bs)

-    def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
+        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs)

-    def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
        bs = x.shape[0]
        if len(self.patch_size) == 3:
            img_ids = self.img_ids(x)
@ -492,5 +443,5 @@ class HunyuanVideo(nn.Module):
        else:
            img_ids = self.img_ids_2d(x)
            txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options)
        return out
--- a/comfy/ldm/hunyuan_video/upsampler.py
+++ b/comfy/ldm/hunyuan_video/upsampler.py
@ -1,122 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d
-from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm
-import comfy.model_management
-import comfy.model_patcher
-
-class SRResidualCausalBlock3D(nn.Module):
-    def __init__(self, channels: int):
-        super().__init__()
-        self.block = nn.Sequential(
-            VideoConv3d(channels, channels, kernel_size=3),
-            nn.SiLU(inplace=True),
-            VideoConv3d(channels, channels, kernel_size=3),
-            nn.SiLU(inplace=True),
-            VideoConv3d(channels, channels, kernel_size=3),
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return x + self.block(x)
-
-class SRModel3DV2(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        hidden_channels: int = 64,
-        num_blocks: int = 6,
-        global_residual: bool = False,
-    ):
-        super().__init__()
-        self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
-        self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)])
-        self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
-        self.global_residual = bool(global_residual)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        residual = x
-        y = self.in_conv(x)
-        for blk in self.blocks:
-            y = blk(y)
-        y = self.out_conv(y)
-        if self.global_residual and (y.shape == residual.shape):
-            y = y + residual
-        return y
-
-
-class Upsampler(nn.Module):
-    def __init__(
-        self,
-        z_channels: int,
-        out_channels: int,
-        block_out_channels: tuple[int, ...],
-        num_res_blocks: int = 2,
-    ):
-        super().__init__()
-        self.num_res_blocks = num_res_blocks
-        self.block_out_channels = block_out_channels
-        self.z_channels = z_channels
-
-        ch = block_out_channels[0]
-        self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
-
-        self.up = nn.ModuleList()
-
-        for i, tgt in enumerate(block_out_channels):
-            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                    out_channels=tgt,
-                                                    temb_channels=0,
-                                                    conv_shortcut=False,
-                                                    conv_op=VideoConv3d, norm_op=RMS_norm)
-                                        for j in range(num_res_blocks + 1)])
-            ch = tgt
-            self.up.append(stage)
-
-        self.norm_out = RMS_norm(ch)
-        self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
-
-    def forward(self, z):
-        """
-        Args:
-            z: (B, C, T, H, W)
-            target_shape: (H, W)
-        """
-        # z to block_in
-        repeats = self.block_out_channels[0] // (self.z_channels)
-        x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
-
-        # upsampling
-        for stage in self.up:
-            for blk in stage.block:
-                x = blk(x)
-
-        out = self.conv_out(F.silu(self.norm_out(x)))
-        return out
-
-UPSAMPLERS = {
-    "720p": SRModel3DV2,
-    "1080p": Upsampler,
-}
-
-class HunyuanVideo15SRModel():
-    def __init__(self, model_type, config):
-        self.load_device = comfy.model_management.vae_device()
-        offload_device = comfy.model_management.vae_offload_device()
-        self.dtype = comfy.model_management.vae_dtype(self.load_device)
-        self.model_class = UPSAMPLERS.get(model_type)
-        self.model = self.model_class(**config).eval()
-
-        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
-
-    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=True)
-
-    def get_sd(self):
-        return self.model.state_dict()
-
-    def resample_latent(self, latent):
-        comfy.model_management.load_model_gpu(self.patcher)
-        return self.model(latent.to(self.load_device))
--- a/comfy/ldm/hunyuan_video/vae_refiner.py
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@ -1,313 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed
-import comfy.ops
-import comfy.ldm.models.autoencoder
-import comfy.model_management
-ops = comfy.ops.disable_weight_init
-
-
-class RMS_norm(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        shape = (dim, 1, 1, 1)
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.empty(shape))
-
-    def forward(self, x):
-        return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)
-
-class DnSmpl(nn.Module):
-    def __init__(self, ic, oc, tds, refiner_vae, op):
-        super().__init__()
-        fct = 2 * 2 * 2 if tds else 1 * 2 * 2
-        assert oc % fct == 0
-        self.conv = op(ic, oc // fct, kernel_size=3, stride=1, padding=1)
-        self.refiner_vae = refiner_vae
-
-        self.tds = tds
-        self.gs = fct * ic // oc
-
-    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
-        r1 = 2 if self.tds else 1
-        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
-
-        if self.tds and self.refiner_vae and conv_carry_in is None:
-
-            hf = h[:, :, :1, :, :]
-            b, c, f, ht, wd = hf.shape
-            hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
-            hf = hf.permute(0, 4, 6, 1, 2, 3, 5)
-            hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
-            hf = torch.cat([hf, hf], dim=1)
-
-            h = h[:, :, 1:, :, :]
-
-            xf = x[:, :, :1, :, :]
-            b, ci, f, ht, wd = xf.shape
-            xf = xf.reshape(b, ci, f, ht // 2, 2, wd // 2, 2)
-            xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
-            xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
-            B, C, T, H, W = xf.shape
-            xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)
-
-            x = x[:, :, 1:, :, :]
-
-        if h.shape[2] == 0:
-            return hf + xf
-
-        b, c, frms, ht, wd = h.shape
-        nf = frms // r1
-        h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-        h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
-        h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
-
-        b, ci, frms, ht, wd = x.shape
-        nf = frms // r1
-        x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
-        x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
-        x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
-        B, C, T, H, W = x.shape
-        x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
-
-        if self.tds and self.refiner_vae and conv_carry_in is None:
-            h = torch.cat([hf, h], dim=2)
-            x = torch.cat([xf, x], dim=2)
-
-        return h + x
-
-
-class UpSmpl(nn.Module):
-    def __init__(self, ic, oc, tus, refiner_vae, op):
-        super().__init__()
-        fct = 2 * 2 * 2 if tus else 1 * 2 * 2
-        self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
-        self.refiner_vae = refiner_vae
-
-        self.tus = tus
-        self.rp = fct * oc // ic
-
-    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
-        r1 = 2 if self.tus else 1
-        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
-
-        if self.tus and self.refiner_vae and conv_carry_in is None:
-            hf = h[:, :, :1, :, :]
-            b, c, f, ht, wd = hf.shape
-            nc = c // (2 * 2)
-            hf = hf.reshape(b, 2, 2, nc, f, ht, wd)
-            hf = hf.permute(0, 3, 4, 5, 1, 6, 2)
-            hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
-            hf = hf[:, : hf.shape[1] // 2]
-
-            h = h[:, :, 1:, :, :]
-
-            xf = x[:, :, :1, :, :]
-            b, ci, f, ht, wd = xf.shape
-            xf = xf.repeat_interleave(repeats=self.rp // 2, dim=1)
-            b, c, f, ht, wd = xf.shape
-            nc = c // (2 * 2)
-            xf = xf.reshape(b, 2, 2, nc, f, ht, wd)
-            xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
-            xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
-
-            x = x[:, :, 1:, :, :]
-
-        b, c, frms, ht, wd = h.shape
-        nc = c // (r1 * 2 * 2)
-        h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-        h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
-        h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-
-        x = x.repeat_interleave(repeats=self.rp, dim=1)
-        b, c, frms, ht, wd = x.shape
-        nc = c // (r1 * 2 * 2)
-        x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-        x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
-        x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-
-        if self.tus and self.refiner_vae and conv_carry_in is None:
-            h = torch.cat([hf, h], dim=2)
-            x = torch.cat([xf, x], dim=2)
-
-        return h + x
-
-class Encoder(nn.Module):
-    def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
-                 ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_):
-        super().__init__()
-        self.z_channels = z_channels
-        self.block_out_channels = block_out_channels
-        self.num_res_blocks = num_res_blocks
-        self.ffactor_temporal = ffactor_temporal
-
-        self.refiner_vae = refiner_vae
-        if self.refiner_vae:
-            conv_op = CarriedConv3d
-            norm_op = RMS_norm
-        else:
-            conv_op = ops.Conv3d
-            norm_op = Normalize
-
-        self.conv_in = conv_op(in_channels, block_out_channels[0], 3, 1, 1)
-
-        self.down = nn.ModuleList()
-        ch = block_out_channels[0]
-        depth = (ffactor_spatial >> 1).bit_length()
-        depth_temporal = ((ffactor_spatial // self.ffactor_temporal) >> 1).bit_length()
-
-        for i, tgt in enumerate(block_out_channels):
-            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
-                                                     temb_channels=0,
-                                                     conv_op=conv_op, norm_op=norm_op)
-                                        for j in range(num_res_blocks)])
-            ch = tgt
-            if i < depth:
-                nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch
-                stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
-                ch = nxt
-            self.down.append(stage)
-
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
-        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
-
-        self.norm_out = norm_op(ch)
-        self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
-
-        self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer()
-
-    def forward(self, x):
-        if not self.refiner_vae and x.shape[2] == 1:
-            x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
-
-        if self.refiner_vae:
-            xl = [x[:, :, :1, :, :]]
-            if x.shape[2] > self.ffactor_temporal:
-                xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
-            x = xl
-        else:
-            x = [x]
-        out = []
-
-        conv_carry_in = None
-
-        for i, x1 in enumerate(x):
-            conv_carry_out = []
-            if i == len(x) - 1:
-                conv_carry_out = None
-
-            x1 = [ x1 ]
-            x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
-
-            for stage in self.down:
-                for blk in stage.block:
-                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
-                if hasattr(stage, 'downsample'):
-                    x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
-
-            out.append(x1)
-            conv_carry_in = conv_carry_out
-
-        out = torch_cat_if_needed(out, dim=2)
-
-        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
-        del out
-
-        b, c, t, h, w = x.shape
-        grp = c // (self.z_channels << 1)
-        skip = x.view(b, c // grp, grp, t, h, w).mean(2)
-
-        out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip
-
-        if self.refiner_vae:
-            out = self.regul(out)[0]
-
-        return out
-
-class Decoder(nn.Module):
-    def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
-                 ffactor_spatial, ffactor_temporal, upsample_match_channel=True, refiner_vae=True, **_):
-        super().__init__()
-        block_out_channels = block_out_channels[::-1]
-        self.z_channels = z_channels
-        self.block_out_channels = block_out_channels
-        self.num_res_blocks = num_res_blocks
-
-        self.refiner_vae = refiner_vae
-        if self.refiner_vae:
-            conv_op = CarriedConv3d
-            norm_op = RMS_norm
-        else:
-            conv_op = ops.Conv3d
-            norm_op = Normalize
-
-        ch = block_out_channels[0]
-        self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
-
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
-        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)
-
-        self.up = nn.ModuleList()
-        depth = (ffactor_spatial >> 1).bit_length()
-        depth_temporal = (ffactor_temporal >> 1).bit_length()
-
-        for i, tgt in enumerate(block_out_channels):
-            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
-                                                     temb_channels=0,
-                                                     conv_op=conv_op, norm_op=norm_op)
-                                        for j in range(num_res_blocks + 1)])
-            ch = tgt
-            if i < depth:
-                nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch
-                stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
-                ch = nxt
-            self.up.append(stage)
-
-        self.norm_out = norm_op(ch)
-        self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
-
-    def forward(self, z):
-        x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
-        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
-
-        if self.refiner_vae:
-            x = torch.split(x, 2, dim=2)
-        else:
-            x = [ x ]
-        out = []
-
-        conv_carry_in = None
-
-        for i, x1 in enumerate(x):
-            conv_carry_out = []
-            if i == len(x) - 1:
-                conv_carry_out = None
-            for stage in self.up:
-                for blk in stage.block:
-                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
-                if hasattr(stage, 'upsample'):
-                    x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
-
-            x1 = [ F.silu(self.norm_out(x1)) ]
-            x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
-            out.append(x1)
-            conv_carry_in = conv_carry_out
-        del x
-
-        out = torch_cat_if_needed(out, dim=2)
-
-        if not self.refiner_vae:
-            if z.shape[-3] == 1:
-                out = out[:, :, -1:]
-
-        return out
-
--- a/comfy/ldm/kandinsky5/model.py
+++ b/comfy/ldm/kandinsky5/model.py
@ -1,413 +0,0 @@
-import torch
-from torch import nn
-import math
-
-import comfy.ldm.common_dit
-from comfy.ldm.modules.attention import optimized_attention
-from comfy.ldm.flux.math import apply_rope1
-from comfy.ldm.flux.layers import EmbedND
-
-def attention(q, k, v, heads, transformer_options={}):
-    return optimized_attention(
-        q.transpose(1, 2),
-        k.transpose(1, 2),
-        v.transpose(1, 2),
-        heads=heads,
-        skip_reshape=True,
-        transformer_options=transformer_options
-    )
-
-def apply_scale_shift_norm(norm, x, scale, shift):
-    return torch.addcmul(shift, norm(x), scale + 1.0)
-
-def apply_gate_sum(x, out, gate):
-    return torch.addcmul(x, gate, out)
-
-def get_shift_scale_gate(params):
-    shift, scale, gate = torch.chunk(params, 3, dim=-1)
-    return tuple(x.unsqueeze(1) for x in (shift, scale, gate))
-
-def get_freqs(dim, max_period=10000.0):
-    return torch.exp(-math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim)
-
-
-class TimeEmbeddings(nn.Module):
-    def __init__(self, model_dim, time_dim, max_period=10000.0, operation_settings=None):
-        super().__init__()
-        assert model_dim % 2 == 0
-        self.model_dim = model_dim
-        self.max_period = max_period
-        self.register_buffer("freqs", get_freqs(model_dim // 2, max_period), persistent=False)
-        operations = operation_settings.get("operations")
-        self.in_layer = operations.Linear(model_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.activation = nn.SiLU()
-        self.out_layer = operations.Linear(time_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, timestep, dtype):
-        args = torch.outer(timestep, self.freqs.to(device=timestep.device))
-        time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype)
-        time_embed = self.out_layer(self.activation(self.in_layer(time_embed)))
-        return time_embed
-
-
-class TextEmbeddings(nn.Module):
-    def __init__(self, text_dim, model_dim, operation_settings=None):
-        super().__init__()
-        operations = operation_settings.get("operations")
-        self.in_layer = operations.Linear(text_dim, model_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.norm = operations.LayerNorm(model_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, text_embed):
-        text_embed = self.in_layer(text_embed)
-        return self.norm(text_embed).type_as(text_embed)
-
-
-class VisualEmbeddings(nn.Module):
-    def __init__(self, visual_dim, model_dim, patch_size, operation_settings=None):
-        super().__init__()
-        self.patch_size = patch_size
-        operations = operation_settings.get("operations")
-        self.in_layer = operations.Linear(visual_dim, model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, x):
-        x = x.movedim(1, -1)  # B C T H W -> B T H W C
-        B, T, H, W, dim = x.shape
-        pt, ph, pw = self.patch_size
-
-        x = x.view(
-            B,
-            T // pt, pt,
-            H // ph, ph,
-            W // pw, pw,
-            dim,
-        ).permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7)
-
-        return self.in_layer(x)
-
-
-class Modulation(nn.Module):
-    def __init__(self, time_dim, model_dim, num_params, operation_settings=None):
-        super().__init__()
-        self.activation = nn.SiLU()
-        self.out_layer = operation_settings.get("operations").Linear(time_dim, num_params * model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, x):
-        return self.out_layer(self.activation(x))
-
-
-class SelfAttention(nn.Module):
-    def __init__(self, num_channels, head_dim, operation_settings=None):
-        super().__init__()
-        assert num_channels % head_dim == 0
-        self.num_heads = num_channels // head_dim
-        self.head_dim = head_dim
-
-        operations = operation_settings.get("operations")
-        self.to_query = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.to_key = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.to_value = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.query_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.key_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-        self.out_layer = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.num_chunks = 2
-
-    def _compute_qk(self, x, freqs, proj_fn, norm_fn):
-        result = proj_fn(x).view(*x.shape[:-1], self.num_heads, -1)
-        return apply_rope1(norm_fn(result), freqs)
-
-    def _forward(self, x, freqs, transformer_options={}):
-        q = self._compute_qk(x, freqs, self.to_query, self.query_norm)
-        k = self._compute_qk(x, freqs, self.to_key, self.key_norm)
-        v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
-        out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
-        return self.out_layer(out)
-
-    def _forward_chunked(self, x, freqs, transformer_options={}):
-        def process_chunks(proj_fn, norm_fn):
-            x_chunks = torch.chunk(x, self.num_chunks, dim=1)
-            freqs_chunks = torch.chunk(freqs, self.num_chunks, dim=1)
-            chunks = []
-            for x_chunk, freqs_chunk in zip(x_chunks, freqs_chunks):
-                chunks.append(self._compute_qk(x_chunk, freqs_chunk, proj_fn, norm_fn))
-            return torch.cat(chunks, dim=1)
-
-        q = process_chunks(self.to_query, self.query_norm)
-        k = process_chunks(self.to_key, self.key_norm)
-        v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
-        out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
-        return self.out_layer(out)
-
-    def forward(self, x, freqs, transformer_options={}):
-        if x.shape[1] > 8192:
-            return self._forward_chunked(x, freqs, transformer_options=transformer_options)
-        else:
-            return self._forward(x, freqs, transformer_options=transformer_options)
-
-
-class CrossAttention(SelfAttention):
-    def get_qkv(self, x, context):
-        q = self.to_query(x).view(*x.shape[:-1], self.num_heads, -1)
-        k = self.to_key(context).view(*context.shape[:-1], self.num_heads, -1)
-        v = self.to_value(context).view(*context.shape[:-1], self.num_heads, -1)
-        return q, k, v
-
-    def forward(self, x, context, transformer_options={}):
-        q, k, v = self.get_qkv(x, context)
-        out = attention(self.query_norm(q), self.key_norm(k), v, self.num_heads, transformer_options=transformer_options)
-        return self.out_layer(out)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, ff_dim, operation_settings=None):
-        super().__init__()
-        operations = operation_settings.get("operations")
-        self.in_layer = operations.Linear(dim, ff_dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.activation = nn.GELU()
-        self.out_layer = operations.Linear(ff_dim, dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.num_chunks = 4
-
-    def _forward(self, x):
-        return self.out_layer(self.activation(self.in_layer(x)))
-
-    def _forward_chunked(self, x):
-        chunks = torch.chunk(x, self.num_chunks, dim=1)
-        output_chunks = []
-        for chunk in chunks:
-            output_chunks.append(self._forward(chunk))
-        return torch.cat(output_chunks, dim=1)
-
-    def forward(self, x):
-        if x.shape[1] > 8192:
-            return self._forward_chunked(x)
-        else:
-            return self._forward(x)
-
-
-class OutLayer(nn.Module):
-    def __init__(self, model_dim, time_dim, visual_dim, patch_size, operation_settings=None):
-        super().__init__()
-        self.patch_size = patch_size
-        self.modulation = Modulation(time_dim, model_dim, 2, operation_settings=operation_settings)
-        operations = operation_settings.get("operations")
-        self.norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.out_layer = operations.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, visual_embed, time_embed):
-        B, T, H, W, _ = visual_embed.shape
-        shift, scale = torch.chunk(self.modulation(time_embed), 2, dim=-1)
-        scale = scale[:, None, None, None, :]
-        shift = shift[:, None, None, None, :]
-        visual_embed = apply_scale_shift_norm(self.norm, visual_embed, scale, shift)
-        x = self.out_layer(visual_embed)
-
-        out_dim = x.shape[-1] // (self.patch_size[0] * self.patch_size[1] * self.patch_size[2])
-        x = x.view(
-            B, T, H, W,
-            out_dim,
-            self.patch_size[0], self.patch_size[1], self.patch_size[2]
-        )
-        return x.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(2, 3).flatten(3, 4).flatten(4, 5)
-
-
-class TransformerEncoderBlock(nn.Module):
-    def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
-        super().__init__()
-        self.text_modulation = Modulation(time_dim, model_dim, 6, operation_settings=operation_settings)
-        operations = operation_settings.get("operations")
-
-        self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
-
-        self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
-
-    def forward(self, x, time_embed, freqs, transformer_options={}):
-        self_attn_params, ff_params = torch.chunk(self.text_modulation(time_embed), 2, dim=-1)
-        shift, scale, gate = get_shift_scale_gate(self_attn_params)
-        out = apply_scale_shift_norm(self.self_attention_norm, x, scale, shift)
-        out = self.self_attention(out, freqs, transformer_options=transformer_options)
-        x = apply_gate_sum(x, out, gate)
-
-        shift, scale, gate = get_shift_scale_gate(ff_params)
-        out = apply_scale_shift_norm(self.feed_forward_norm, x, scale, shift)
-        out = self.feed_forward(out)
-        x = apply_gate_sum(x, out, gate)
-        return x
-
-
-class TransformerDecoderBlock(nn.Module):
-    def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
-        super().__init__()
-        self.visual_modulation = Modulation(time_dim, model_dim, 9, operation_settings=operation_settings)
-
-        operations = operation_settings.get("operations")
-        self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
-
-        self.cross_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.cross_attention = CrossAttention(model_dim, head_dim, operation_settings=operation_settings)
-
-        self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
-
-    def forward(self, visual_embed, text_embed, time_embed, freqs, transformer_options={}):
-        self_attn_params, cross_attn_params, ff_params = torch.chunk(self.visual_modulation(time_embed), 3, dim=-1)
-        # self attention
-        shift, scale, gate = get_shift_scale_gate(self_attn_params)
-        visual_out = apply_scale_shift_norm(self.self_attention_norm, visual_embed, scale, shift)
-        visual_out = self.self_attention(visual_out, freqs, transformer_options=transformer_options)
-        visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
-        # cross attention
-        shift, scale, gate = get_shift_scale_gate(cross_attn_params)
-        visual_out = apply_scale_shift_norm(self.cross_attention_norm, visual_embed, scale, shift)
-        visual_out = self.cross_attention(visual_out, text_embed, transformer_options=transformer_options)
-        visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
-        # feed forward
-        shift, scale, gate = get_shift_scale_gate(ff_params)
-        visual_out = apply_scale_shift_norm(self.feed_forward_norm, visual_embed, scale, shift)
-        visual_out = self.feed_forward(visual_out)
-        visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
-        return visual_embed
-
-
-class Kandinsky5(nn.Module):
-    def __init__(
-        self,
-        in_visual_dim=16, out_visual_dim=16, in_text_dim=3584, in_text_dim2=768, time_dim=512,
-        model_dim=1792, ff_dim=7168, visual_embed_dim=132, patch_size=(1, 2, 2), num_text_blocks=2, num_visual_blocks=32,
-        axes_dims=(16, 24, 24), rope_scale_factor=(1.0, 2.0, 2.0),
-        dtype=None, device=None, operations=None, **kwargs
-    ):
-        super().__init__()
-        head_dim = sum(axes_dims)
-        self.rope_scale_factor = rope_scale_factor
-        self.in_visual_dim = in_visual_dim
-        self.model_dim = model_dim
-        self.patch_size = patch_size
-        self.visual_embed_dim = visual_embed_dim
-        self.dtype = dtype
-        self.device = device
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-
-        self.time_embeddings = TimeEmbeddings(model_dim, time_dim, operation_settings=operation_settings)
-        self.text_embeddings = TextEmbeddings(in_text_dim, model_dim, operation_settings=operation_settings)
-        self.pooled_text_embeddings = TextEmbeddings(in_text_dim2, time_dim, operation_settings=operation_settings)
-        self.visual_embeddings = VisualEmbeddings(visual_embed_dim, model_dim, patch_size, operation_settings=operation_settings)
-
-        self.text_transformer_blocks = nn.ModuleList(
-            [TransformerEncoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_text_blocks)]
-        )
-
-        self.visual_transformer_blocks = nn.ModuleList(
-            [TransformerDecoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_visual_blocks)]
-        )
-
-        self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size, operation_settings=operation_settings)
-
-        self.rope_embedder_3d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=axes_dims)
-        self.rope_embedder_1d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=[head_dim])
-
-    def rope_encode_1d(self, seq_len, seq_start=0, steps=None, device=None, dtype=None, transformer_options={}):
-        steps = seq_len if steps is None else steps
-        seq_ids = torch.linspace(seq_start, seq_start + (seq_len - 1), steps=steps, device=device, dtype=dtype)
-        seq_ids = seq_ids.reshape(-1, 1).unsqueeze(0)  # Shape: (1, steps, 1)
-        freqs = self.rope_embedder_1d(seq_ids).movedim(1, 2)
-        return freqs
-
-    def rope_encode_3d(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}):
-
-        patch_size = self.patch_size
-        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
-        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
-        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
-
-        if steps_t is None:
-            steps_t = t_len
-        if steps_h is None:
-            steps_h = h_len
-        if steps_w is None:
-            steps_w = w_len
-
-        h_start = 0
-        w_start = 0
-        rope_options = transformer_options.get("rope_options", None)
-        if rope_options is not None:
-            t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
-            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
-            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
-
-            t_start += rope_options.get("shift_t", 0.0)
-            h_start += rope_options.get("shift_y", 0.0)
-            w_start += rope_options.get("shift_x", 0.0)
-        else:
-            rope_scale_factor = self.rope_scale_factor
-            if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions
-                if h * w >= 14080:
-                    rope_scale_factor = (1.0, 3.16, 3.16)
-
-            t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0
-            h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0
-            w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0
-
-        img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
-        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
-        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
-        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
-        img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
-
-        freqs = self.rope_embedder_3d(img_ids).movedim(1, 2)
-        return freqs
-
-    def forward_orig(self, x, timestep, context, y, freqs, freqs_text, transformer_options={}, **kwargs):
-        patches_replace = transformer_options.get("patches_replace", {})
-        context = self.text_embeddings(context)
-        time_embed = self.time_embeddings(timestep, x.dtype) + self.pooled_text_embeddings(y)
-
-        for block in self.text_transformer_blocks:
-            context = block(context, time_embed, freqs_text, transformer_options=transformer_options)
-
-        visual_embed = self.visual_embeddings(x)
-        visual_shape = visual_embed.shape[:-1]
-        visual_embed = visual_embed.flatten(1, -2)
-
-        blocks_replace = patches_replace.get("dit", {})
-        transformer_options["total_blocks"] = len(self.visual_transformer_blocks)
-        transformer_options["block_type"] = "double"
-        for i, block in enumerate(self.visual_transformer_blocks):
-            transformer_options["block_index"] = i
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    return block(x=args["x"], context=args["context"], time_embed=args["time_embed"], freqs=args["freqs"], transformer_options=args.get("transformer_options"))
-                visual_embed = blocks_replace[("double_block", i)]({"x": visual_embed, "context": context, "time_embed": time_embed, "freqs": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})["x"]
-            else:
-                visual_embed = block(visual_embed, context, time_embed, freqs=freqs, transformer_options=transformer_options)
-
-        visual_embed = visual_embed.reshape(*visual_shape, -1)
-        return self.out_layer(visual_embed, time_embed)
-
-    def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
-        original_dims = x.ndim
-        if original_dims == 4:
-            x = x.unsqueeze(2)
-        bs, c, t_len, h, w = x.shape
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
-
-        if time_dim_replace is not None:
-            time_dim_replace = comfy.ldm.common_dit.pad_to_patch_size(time_dim_replace, self.patch_size)
-            x[:, :time_dim_replace.shape[1], :time_dim_replace.shape[2]] = time_dim_replace
-
-        freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
-        freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options)
-
-        out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
-        if original_dims == 4:
-            out = out.squeeze(2)
-        return out
-
-    def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, time_dim_replace=time_dim_replace, transformer_options=transformer_options, **kwargs)
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -1,837 +0,0 @@
-from typing import Tuple
-import torch
-import torch.nn as nn
-from comfy.ldm.lightricks.model import (
-    CrossAttention,
-    FeedForward,
-    AdaLayerNormSingle,
-    PixArtAlphaTextProjection,
-    LTXVModel,
-)
-from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
-import comfy.ldm.common_dit
-
-class BasicAVTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        v_dim,
-        a_dim,
-        v_heads,
-        a_heads,
-        vd_head,
-        ad_head,
-        v_context_dim=None,
-        a_context_dim=None,
-        attn_precision=None,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
-        super().__init__()
-
-        self.attn_precision = attn_precision
-
-        self.attn1 = CrossAttention(
-            query_dim=v_dim,
-            heads=v_heads,
-            dim_head=vd_head,
-            context_dim=None,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.audio_attn1 = CrossAttention(
-            query_dim=a_dim,
-            heads=a_heads,
-            dim_head=ad_head,
-            context_dim=None,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.attn2 = CrossAttention(
-            query_dim=v_dim,
-            context_dim=v_context_dim,
-            heads=v_heads,
-            dim_head=vd_head,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.audio_attn2 = CrossAttention(
-            query_dim=a_dim,
-            context_dim=a_context_dim,
-            heads=a_heads,
-            dim_head=ad_head,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        # Q: Video, K,V: Audio
-        self.audio_to_video_attn = CrossAttention(
-            query_dim=v_dim,
-            context_dim=a_dim,
-            heads=a_heads,
-            dim_head=ad_head,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        # Q: Audio, K,V: Video
-        self.video_to_audio_attn = CrossAttention(
-            query_dim=a_dim,
-            context_dim=v_dim,
-            heads=a_heads,
-            dim_head=ad_head,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.ff = FeedForward(
-            v_dim, dim_out=v_dim, glu=True, dtype=dtype, device=device, operations=operations
-        )
-        self.audio_ff = FeedForward(
-            a_dim, dim_out=a_dim, glu=True, dtype=dtype, device=device, operations=operations
-        )
-
-        self.scale_shift_table = nn.Parameter(torch.empty(6, v_dim, device=device, dtype=dtype))
-        self.audio_scale_shift_table = nn.Parameter(
-            torch.empty(6, a_dim, device=device, dtype=dtype)
-        )
-
-        self.scale_shift_table_a2v_ca_audio = nn.Parameter(
-            torch.empty(5, a_dim, device=device, dtype=dtype)
-        )
-        self.scale_shift_table_a2v_ca_video = nn.Parameter(
-            torch.empty(5, v_dim, device=device, dtype=dtype)
-        )
-
-    def get_ada_values(
-        self, scale_shift_table: torch.Tensor, batch_size: int, timestep: torch.Tensor, indices: slice = slice(None, None)
-    ):
-        num_ada_params = scale_shift_table.shape[0]
-
-        ada_values = (
-            scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=timestep.device, dtype=timestep.dtype)
-            + timestep.reshape(batch_size, timestep.shape[1], num_ada_params, -1)[:, :, indices, :]
-        ).unbind(dim=2)
-        return ada_values
-
-    def get_av_ca_ada_values(
-        self,
-        scale_shift_table: torch.Tensor,
-        batch_size: int,
-        scale_shift_timestep: torch.Tensor,
-        gate_timestep: torch.Tensor,
-        num_scale_shift_values: int = 4,
-    ):
-        scale_shift_ada_values = self.get_ada_values(
-            scale_shift_table[:num_scale_shift_values, :],
-            batch_size,
-            scale_shift_timestep,
-        )
-        gate_ada_values = self.get_ada_values(
-            scale_shift_table[num_scale_shift_values:, :],
-            batch_size,
-            gate_timestep,
-        )
-
-        scale_shift_chunks = [t.squeeze(2) for t in scale_shift_ada_values]
-        gate_ada_values = [t.squeeze(2) for t in gate_ada_values]
-
-        return (*scale_shift_chunks, *gate_ada_values)
-
-    def forward(
-        self,
-        x: Tuple[torch.Tensor, torch.Tensor],
-        v_context=None,
-        a_context=None,
-        attention_mask=None,
-        v_timestep=None,
-        a_timestep=None,
-        v_pe=None,
-        a_pe=None,
-        v_cross_pe=None,
-        a_cross_pe=None,
-        v_cross_scale_shift_timestep=None,
-        a_cross_scale_shift_timestep=None,
-        v_cross_gate_timestep=None,
-        a_cross_gate_timestep=None,
-        transformer_options=None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        run_vx = transformer_options.get("run_vx", True)
-        run_ax = transformer_options.get("run_ax", True)
-
-        vx, ax = x
-        run_ax = run_ax and ax.numel() > 0
-        run_a2v = run_vx and transformer_options.get("a2v_cross_attn", True) and ax.numel() > 0
-        run_v2a = run_ax and transformer_options.get("v2a_cross_attn", True)
-
-        if run_vx:
-            vshift_msa, vscale_msa, vgate_msa = (
-                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 3))
-            )
-
-            norm_vx = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_msa) + vshift_msa
-            vx += self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options) * vgate_msa
-            vx += self.attn2(
-                comfy.ldm.common_dit.rms_norm(vx),
-                context=v_context,
-                mask=attention_mask,
-                transformer_options=transformer_options,
-            )
-
-            del vshift_msa, vscale_msa, vgate_msa
-
-        if run_ax:
-            ashift_msa, ascale_msa, agate_msa = (
-                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 3))
-            )
-
-            norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa
-            ax += (
-                self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
-                * agate_msa
-            )
-            ax += self.audio_attn2(
-                comfy.ldm.common_dit.rms_norm(ax),
-                context=a_context,
-                mask=attention_mask,
-                transformer_options=transformer_options,
-            )
-
-            del ashift_msa, ascale_msa, agate_msa
-
-        # Audio - Video cross attention.
-        if run_a2v or run_v2a:
-            # norm3
-            vx_norm3 = comfy.ldm.common_dit.rms_norm(vx)
-            ax_norm3 = comfy.ldm.common_dit.rms_norm(ax)
-
-            (
-                scale_ca_audio_hidden_states_a2v,
-                shift_ca_audio_hidden_states_a2v,
-                scale_ca_audio_hidden_states_v2a,
-                shift_ca_audio_hidden_states_v2a,
-                gate_out_v2a,
-            ) = self.get_av_ca_ada_values(
-                self.scale_shift_table_a2v_ca_audio,
-                ax.shape[0],
-                a_cross_scale_shift_timestep,
-                a_cross_gate_timestep,
-            )
-
-            (
-                scale_ca_video_hidden_states_a2v,
-                shift_ca_video_hidden_states_a2v,
-                scale_ca_video_hidden_states_v2a,
-                shift_ca_video_hidden_states_v2a,
-                gate_out_a2v,
-            ) = self.get_av_ca_ada_values(
-                self.scale_shift_table_a2v_ca_video,
-                vx.shape[0],
-                v_cross_scale_shift_timestep,
-                v_cross_gate_timestep,
-            )
-
-            if run_a2v:
-                vx_scaled = (
-                    vx_norm3 * (1 + scale_ca_video_hidden_states_a2v)
-                    + shift_ca_video_hidden_states_a2v
-                )
-                ax_scaled = (
-                    ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v)
-                    + shift_ca_audio_hidden_states_a2v
-                )
-                vx += (
-                    self.audio_to_video_attn(
-                        vx_scaled,
-                        context=ax_scaled,
-                        pe=v_cross_pe,
-                        k_pe=a_cross_pe,
-                        transformer_options=transformer_options,
-                    )
-                    * gate_out_a2v
-                )
-
-                del gate_out_a2v
-                del scale_ca_video_hidden_states_a2v,\
-                    shift_ca_video_hidden_states_a2v,\
-                    scale_ca_audio_hidden_states_a2v,\
-                    shift_ca_audio_hidden_states_a2v,\
-
-            if run_v2a:
-                ax_scaled = (
-                    ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a)
-                    + shift_ca_audio_hidden_states_v2a
-                )
-                vx_scaled = (
-                    vx_norm3 * (1 + scale_ca_video_hidden_states_v2a)
-                    + shift_ca_video_hidden_states_v2a
-                )
-                ax += (
-                    self.video_to_audio_attn(
-                        ax_scaled,
-                        context=vx_scaled,
-                        pe=a_cross_pe,
-                        k_pe=v_cross_pe,
-                        transformer_options=transformer_options,
-                    )
-                    * gate_out_v2a
-                )
-
-                del gate_out_v2a
-                del scale_ca_video_hidden_states_v2a,\
-                    shift_ca_video_hidden_states_v2a,\
-                    scale_ca_audio_hidden_states_v2a,\
-                    shift_ca_audio_hidden_states_v2a
-
-        if run_vx:
-            vshift_mlp, vscale_mlp, vgate_mlp = (
-                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, None))
-            )
-
-            vx_scaled = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_mlp) + vshift_mlp
-            vx += self.ff(vx_scaled) * vgate_mlp
-            del vshift_mlp, vscale_mlp, vgate_mlp
-
-        if run_ax:
-            ashift_mlp, ascale_mlp, agate_mlp = (
-                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, None))
-            )
-
-            ax_scaled = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_mlp) + ashift_mlp
-            ax += self.audio_ff(ax_scaled) * agate_mlp
-
-            del ashift_mlp, ascale_mlp, agate_mlp
-
-
-        return vx, ax
-
-
-class LTXAVModel(LTXVModel):
-    """LTXAV model for audio-video generation."""
-
-    def __init__(
-        self,
-        in_channels=128,
-        audio_in_channels=128,
-        cross_attention_dim=4096,
-        audio_cross_attention_dim=2048,
-        attention_head_dim=128,
-        audio_attention_head_dim=64,
-        num_attention_heads=32,
-        audio_num_attention_heads=32,
-        caption_channels=3840,
-        num_layers=48,
-        positional_embedding_theta=10000.0,
-        positional_embedding_max_pos=[20, 2048, 2048],
-        audio_positional_embedding_max_pos=[20],
-        causal_temporal_positioning=False,
-        vae_scale_factors=(8, 32, 32),
-        use_middle_indices_grid=False,
-        timestep_scale_multiplier=1000.0,
-        av_ca_timestep_scale_multiplier=1.0,
-        dtype=None,
-        device=None,
-        operations=None,
-        **kwargs,
-    ):
-        # Store audio-specific parameters
-        self.audio_in_channels = audio_in_channels
-        self.audio_cross_attention_dim = audio_cross_attention_dim
-        self.audio_attention_head_dim = audio_attention_head_dim
-        self.audio_num_attention_heads = audio_num_attention_heads
-        self.audio_positional_embedding_max_pos = audio_positional_embedding_max_pos
-
-        # Calculate audio dimensions
-        self.audio_inner_dim = audio_num_attention_heads * audio_attention_head_dim
-        self.audio_out_channels = audio_in_channels
-
-        # Audio-specific constants
-        self.num_audio_channels = 8
-        self.audio_frequency_bins = 16
-
-        self.av_ca_timestep_scale_multiplier = av_ca_timestep_scale_multiplier
-
-        super().__init__(
-            in_channels=in_channels,
-            cross_attention_dim=cross_attention_dim,
-            attention_head_dim=attention_head_dim,
-            num_attention_heads=num_attention_heads,
-            caption_channels=caption_channels,
-            num_layers=num_layers,
-            positional_embedding_theta=positional_embedding_theta,
-            positional_embedding_max_pos=positional_embedding_max_pos,
-            causal_temporal_positioning=causal_temporal_positioning,
-            vae_scale_factors=vae_scale_factors,
-            use_middle_indices_grid=use_middle_indices_grid,
-            timestep_scale_multiplier=timestep_scale_multiplier,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-            **kwargs,
-        )
-
-    def _init_model_components(self, device, dtype, **kwargs):
-        """Initialize LTXAV-specific components."""
-        # Audio-specific projections
-        self.audio_patchify_proj = self.operations.Linear(
-            self.audio_in_channels, self.audio_inner_dim, bias=True, dtype=dtype, device=device
-        )
-
-        # Audio-specific AdaLN
-        self.audio_adaln_single = AdaLayerNormSingle(
-            self.audio_inner_dim,
-            use_additional_conditions=False,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-
-        num_scale_shift_values = 4
-        self.av_ca_video_scale_shift_adaln_single = AdaLayerNormSingle(
-            self.inner_dim,
-            use_additional_conditions=False,
-            embedding_coefficient=num_scale_shift_values,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-        self.av_ca_a2v_gate_adaln_single = AdaLayerNormSingle(
-            self.inner_dim,
-            use_additional_conditions=False,
-            embedding_coefficient=1,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-        self.av_ca_audio_scale_shift_adaln_single = AdaLayerNormSingle(
-            self.audio_inner_dim,
-            use_additional_conditions=False,
-            embedding_coefficient=num_scale_shift_values,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-        self.av_ca_v2a_gate_adaln_single = AdaLayerNormSingle(
-            self.audio_inner_dim,
-            use_additional_conditions=False,
-            embedding_coefficient=1,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-
-        # Audio caption projection
-        self.audio_caption_projection = PixArtAlphaTextProjection(
-            in_features=self.caption_channels,
-            hidden_size=self.audio_inner_dim,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
-        )
-
-    def _init_transformer_blocks(self, device, dtype, **kwargs):
-        """Initialize transformer blocks for LTXAV."""
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicAVTransformerBlock(
-                    v_dim=self.inner_dim,
-                    a_dim=self.audio_inner_dim,
-                    v_heads=self.num_attention_heads,
-                    a_heads=self.audio_num_attention_heads,
-                    vd_head=self.attention_head_dim,
-                    ad_head=self.audio_attention_head_dim,
-                    v_context_dim=self.cross_attention_dim,
-                    a_context_dim=self.audio_cross_attention_dim,
-                    dtype=dtype,
-                    device=device,
-                    operations=self.operations,
-                )
-                for _ in range(self.num_layers)
-            ]
-        )
-
-    def _init_output_components(self, device, dtype):
-        """Initialize output components for LTXAV."""
-        # Video output components
-        super()._init_output_components(device, dtype)
-        # Audio output components
-        self.audio_scale_shift_table = nn.Parameter(
-            torch.empty(2, self.audio_inner_dim, dtype=dtype, device=device)
-        )
-        self.audio_norm_out = self.operations.LayerNorm(
-            self.audio_inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device
-        )
-        self.audio_proj_out = self.operations.Linear(
-            self.audio_inner_dim, self.audio_out_channels, dtype=dtype, device=device
-        )
-        self.a_patchifier = AudioPatchifier(1, start_end=True)
-
-    def separate_audio_and_video_latents(self, x, audio_length):
-        """Separate audio and video latents from combined input."""
-        # vx = x[:, : self.in_channels]
-        # ax = x[:, self.in_channels :]
-        #
-        # ax = ax.reshape(ax.shape[0], -1)
-        # ax = ax[:, : audio_length * self.num_audio_channels * self.audio_frequency_bins]
-        #
-        # ax = ax.reshape(
-        #     ax.shape[0], self.num_audio_channels, audio_length, self.audio_frequency_bins
-        # )
-
-        vx = x[0]
-        ax = x[1] if len(x) > 1 else torch.zeros(
-            (vx.shape[0], self.num_audio_channels, 0, self.audio_frequency_bins),
-            device=vx.device, dtype=vx.dtype
-        )
-        return vx, ax
-
-    def recombine_audio_and_video_latents(self, vx, ax, target_shape=None):
-        if ax.numel() == 0:
-            return vx
-        else:
-            return [vx, ax]
-        """Recombine audio and video latents for output."""
-        # if ax.device != vx.device or ax.dtype != vx.dtype:
-        #     logging.warning("Audio and video latents are on different devices or dtypes.")
-        #     ax = ax.to(device=vx.device, dtype=vx.dtype)
-        #     logging.warning(f"Audio audio latent moved to device: {ax.device}, dtype: {ax.dtype}")
-        #
-        # ax = ax.reshape(ax.shape[0], -1)
-        # # pad to f x h x w of the video latents
-        # divisor = vx.shape[-1] * vx.shape[-2] * vx.shape[-3]
-        # if target_shape is None:
-        #     repetitions = math.ceil(ax.shape[-1] / divisor)
-        # else:
-        #     repetitions = target_shape[1] - vx.shape[1]
-        # padded_len = repetitions * divisor
-        # ax = F.pad(ax, (0, padded_len - ax.shape[-1]))
-        # ax = ax.reshape(ax.shape[0], -1, vx.shape[-3], vx.shape[-2], vx.shape[-1])
-        # return torch.cat([vx, ax], dim=1)
-
-    def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
-        """Process input for LTXAV - separate audio and video, then patchify."""
-        audio_length = kwargs.get("audio_length", 0)
-        # Separate audio and video latents
-        vx, ax = self.separate_audio_and_video_latents(x, audio_length)
-        [vx, v_pixel_coords, additional_args] = super()._process_input(
-            vx, keyframe_idxs, denoise_mask, **kwargs
-        )
-
-        ax, a_latent_coords = self.a_patchifier.patchify(ax)
-        ax = self.audio_patchify_proj(ax)
-
-        # additional_args.update({"av_orig_shape": list(x.shape)})
-        return [vx, ax], [v_pixel_coords, a_latent_coords], additional_args
-
-    def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
-        """Prepare timestep embeddings."""
-        # TODO: some code reuse is needed here.
-        grid_mask = kwargs.get("grid_mask", None)
-        if grid_mask is not None:
-            timestep = timestep[:, grid_mask]
-
-        timestep = timestep * self.timestep_scale_multiplier
-        v_timestep, v_embedded_timestep = self.adaln_single(
-            timestep.flatten(),
-            {"resolution": None, "aspect_ratio": None},
-            batch_size=batch_size,
-            hidden_dtype=hidden_dtype,
-        )
-
-        # Second dimension is 1 or number of tokens (if timestep_per_token)
-        v_timestep = v_timestep.view(batch_size, -1, v_timestep.shape[-1])
-        v_embedded_timestep = v_embedded_timestep.view(
-            batch_size, -1, v_embedded_timestep.shape[-1]
-        )
-
-        # Prepare audio timestep
-        a_timestep = kwargs.get("a_timestep")
-        if a_timestep is not None:
-            a_timestep = a_timestep * self.timestep_scale_multiplier
-            av_ca_factor = self.av_ca_timestep_scale_multiplier / self.timestep_scale_multiplier
-
-            av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
-                a_timestep.flatten(),
-                {"resolution": None, "aspect_ratio": None},
-                batch_size=batch_size,
-                hidden_dtype=hidden_dtype,
-            )
-            av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
-                timestep.flatten(),
-                {"resolution": None, "aspect_ratio": None},
-                batch_size=batch_size,
-                hidden_dtype=hidden_dtype,
-            )
-            av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
-                timestep.flatten() * av_ca_factor,
-                {"resolution": None, "aspect_ratio": None},
-                batch_size=batch_size,
-                hidden_dtype=hidden_dtype,
-            )
-            av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
-                a_timestep.flatten() * av_ca_factor,
-                {"resolution": None, "aspect_ratio": None},
-                batch_size=batch_size,
-                hidden_dtype=hidden_dtype,
-            )
-
-            a_timestep, a_embedded_timestep = self.audio_adaln_single(
-                a_timestep.flatten(),
-                {"resolution": None, "aspect_ratio": None},
-                batch_size=batch_size,
-                hidden_dtype=hidden_dtype,
-            )
-            a_timestep = a_timestep.view(batch_size, -1, a_timestep.shape[-1])
-            a_embedded_timestep = a_embedded_timestep.view(
-                batch_size, -1, a_embedded_timestep.shape[-1]
-            )
-            cross_av_timestep_ss = [
-                av_ca_audio_scale_shift_timestep,
-                av_ca_video_scale_shift_timestep,
-                av_ca_a2v_gate_noise_timestep,
-                av_ca_v2a_gate_noise_timestep,
-            ]
-            cross_av_timestep_ss = list(
-                [t.view(batch_size, -1, t.shape[-1]) for t in cross_av_timestep_ss]
-            )
-        else:
-            a_timestep = timestep
-            a_embedded_timestep = kwargs.get("embedded_timestep")
-            cross_av_timestep_ss = []
-
-        return [v_timestep, a_timestep, cross_av_timestep_ss], [
-            v_embedded_timestep,
-            a_embedded_timestep,
-        ]
-
-    def _prepare_context(self, context, batch_size, x, attention_mask=None):
-        vx = x[0]
-        ax = x[1]
-        v_context, a_context = torch.split(
-            context, int(context.shape[-1] / 2), len(context.shape) - 1
-        )
-
-        v_context, attention_mask = super()._prepare_context(
-            v_context, batch_size, vx, attention_mask
-        )
-        if self.audio_caption_projection is not None:
-            a_context = self.audio_caption_projection(a_context)
-            a_context = a_context.view(batch_size, -1, ax.shape[-1])
-
-        return [v_context, a_context], attention_mask
-
-    def _prepare_positional_embeddings(self, pixel_coords, frame_rate, x_dtype):
-        v_pixel_coords = pixel_coords[0]
-        v_pe = super()._prepare_positional_embeddings(v_pixel_coords, frame_rate, x_dtype)
-
-        a_latent_coords = pixel_coords[1]
-        a_pe = self._precompute_freqs_cis(
-            a_latent_coords,
-            dim=self.audio_inner_dim,
-            out_dtype=x_dtype,
-            max_pos=self.audio_positional_embedding_max_pos,
-            use_middle_indices_grid=self.use_middle_indices_grid,
-            num_attention_heads=self.audio_num_attention_heads,
-        )
-
-        # calculate positional embeddings for the middle of the token duration, to use in av cross attention layers.
-        max_pos = max(
-            self.positional_embedding_max_pos[0], self.audio_positional_embedding_max_pos[0]
-        )
-        v_pixel_coords = v_pixel_coords.to(torch.float32)
-        v_pixel_coords[:, 0] = v_pixel_coords[:, 0] * (1.0 / frame_rate)
-        av_cross_video_freq_cis = self._precompute_freqs_cis(
-            v_pixel_coords[:, 0:1, :],
-            dim=self.audio_cross_attention_dim,
-            out_dtype=x_dtype,
-            max_pos=[max_pos],
-            use_middle_indices_grid=True,
-            num_attention_heads=self.audio_num_attention_heads,
-        )
-        av_cross_audio_freq_cis = self._precompute_freqs_cis(
-            a_latent_coords[:, 0:1, :],
-            dim=self.audio_cross_attention_dim,
-            out_dtype=x_dtype,
-            max_pos=[max_pos],
-            use_middle_indices_grid=True,
-            num_attention_heads=self.audio_num_attention_heads,
-        )
-
-        return [(v_pe, av_cross_video_freq_cis), (a_pe, av_cross_audio_freq_cis)]
-
-    def _process_transformer_blocks(
-        self, x, context, attention_mask, timestep, pe, transformer_options={}, **kwargs
-    ):
-        vx = x[0]
-        ax = x[1]
-        v_context = context[0]
-        a_context = context[1]
-        v_timestep = timestep[0]
-        a_timestep = timestep[1]
-        v_pe, av_cross_video_freq_cis = pe[0]
-        a_pe, av_cross_audio_freq_cis = pe[1]
-
-        (
-            av_ca_audio_scale_shift_timestep,
-            av_ca_video_scale_shift_timestep,
-            av_ca_a2v_gate_noise_timestep,
-            av_ca_v2a_gate_noise_timestep,
-        ) = timestep[2]
-
-        """Process transformer blocks for LTXAV."""
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-
-        # Process transformer blocks
-        for i, block in enumerate(self.transformer_blocks):
-            if ("double_block", i) in blocks_replace:
-
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(
-                        args["img"],
-                        v_context=args["v_context"],
-                        a_context=args["a_context"],
-                        attention_mask=args["attention_mask"],
-                        v_timestep=args["v_timestep"],
-                        a_timestep=args["a_timestep"],
-                        v_pe=args["v_pe"],
-                        a_pe=args["a_pe"],
-                        v_cross_pe=args["v_cross_pe"],
-                        a_cross_pe=args["a_cross_pe"],
-                        v_cross_scale_shift_timestep=args["v_cross_scale_shift_timestep"],
-                        a_cross_scale_shift_timestep=args["a_cross_scale_shift_timestep"],
-                        v_cross_gate_timestep=args["v_cross_gate_timestep"],
-                        a_cross_gate_timestep=args["a_cross_gate_timestep"],
-                        transformer_options=args["transformer_options"],
-                    )
-                    return out
-
-                out = blocks_replace[("double_block", i)](
-                    {
-                        "img": (vx, ax),
-                        "v_context": v_context,
-                        "a_context": a_context,
-                        "attention_mask": attention_mask,
-                        "v_timestep": v_timestep,
-                        "a_timestep": a_timestep,
-                        "v_pe": v_pe,
-                        "a_pe": a_pe,
-                        "v_cross_pe": av_cross_video_freq_cis,
-                        "a_cross_pe": av_cross_audio_freq_cis,
-                        "v_cross_scale_shift_timestep": av_ca_video_scale_shift_timestep,
-                        "a_cross_scale_shift_timestep": av_ca_audio_scale_shift_timestep,
-                        "v_cross_gate_timestep": av_ca_a2v_gate_noise_timestep,
-                        "a_cross_gate_timestep": av_ca_v2a_gate_noise_timestep,
-                        "transformer_options": transformer_options,
-                    },
-                    {"original_block": block_wrap},
-                )
-                vx, ax = out["img"]
-            else:
-                vx, ax = block(
-                    (vx, ax),
-                    v_context=v_context,
-                    a_context=a_context,
-                    attention_mask=attention_mask,
-                    v_timestep=v_timestep,
-                    a_timestep=a_timestep,
-                    v_pe=v_pe,
-                    a_pe=a_pe,
-                    v_cross_pe=av_cross_video_freq_cis,
-                    a_cross_pe=av_cross_audio_freq_cis,
-                    v_cross_scale_shift_timestep=av_ca_video_scale_shift_timestep,
-                    a_cross_scale_shift_timestep=av_ca_audio_scale_shift_timestep,
-                    v_cross_gate_timestep=av_ca_a2v_gate_noise_timestep,
-                    a_cross_gate_timestep=av_ca_v2a_gate_noise_timestep,
-                    transformer_options=transformer_options,
-                )
-
-        return [vx, ax]
-
-    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
-        vx = x[0]
-        ax = x[1]
-        v_embedded_timestep = embedded_timestep[0]
-        a_embedded_timestep = embedded_timestep[1]
-        vx = super()._process_output(vx, v_embedded_timestep, keyframe_idxs, **kwargs)
-
-        # Process audio output
-        a_scale_shift_values = (
-            self.audio_scale_shift_table[None, None].to(device=a_embedded_timestep.device, dtype=a_embedded_timestep.dtype)
-            + a_embedded_timestep[:, :, None]
-        )
-        a_shift, a_scale = a_scale_shift_values[:, :, 0], a_scale_shift_values[:, :, 1]
-
-        ax = self.audio_norm_out(ax)
-        ax = ax * (1 + a_scale) + a_shift
-        ax = self.audio_proj_out(ax)
-
-        # Unpatchify audio
-        ax = self.a_patchifier.unpatchify(
-            ax, channels=self.num_audio_channels, freq=self.audio_frequency_bins
-        )
-
-        # Recombine audio and video
-        original_shape = kwargs.get("av_orig_shape")
-        return self.recombine_audio_and_video_latents(vx, ax, original_shape)
-
-    def forward(
-        self,
-        x,
-        timestep,
-        context,
-        attention_mask=None,
-        frame_rate=25,
-        transformer_options={},
-        keyframe_idxs=None,
-        **kwargs,
-    ):
-        """
-        Forward pass for LTXAV model.
-
-        Args:
-            x: Combined audio-video input tensor
-            timestep: Tuple of (video_timestep, audio_timestep) or single timestep
-            context: Context tensor (e.g., text embeddings)
-            attention_mask: Attention mask tensor
-            frame_rate: Frame rate for temporal processing
-            transformer_options: Additional options for transformer blocks
-            keyframe_idxs: Keyframe indices for temporal processing
-            **kwargs: Additional keyword arguments including audio_length
-
-        Returns:
-            Combined audio-video output tensor
-        """
-        # Handle timestep format
-        if isinstance(timestep, (tuple, list)) and len(timestep) == 2:
-            v_timestep, a_timestep = timestep
-            kwargs["a_timestep"] = a_timestep
-            timestep = v_timestep
-        else:
-            kwargs["a_timestep"] = timestep
-
-        # Call parent forward method
-        return super().forward(
-            x,
-            timestep,
-            context,
-            attention_mask,
-            frame_rate,
-            transformer_options,
-            keyframe_idxs,
-            **kwargs,
-        )
--- a/comfy/ldm/lightricks/embeddings_connector.py
+++ b/comfy/ldm/lightricks/embeddings_connector.py
@ -1,305 +0,0 @@
-import math
-from typing import Optional
-
-import comfy.ldm.common_dit
-import torch
-from comfy.ldm.lightricks.model import (
-    CrossAttention,
-    FeedForward,
-    generate_freq_grid_np,
-    interleaved_freqs_cis,
-    split_freqs_cis,
-)
-from torch import nn
-
-
-class BasicTransformerBlock1D(nn.Module):
-    r"""
-    A basic Transformer block.
-
-    Parameters:
-
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
-        upcast_attention (`bool`, *optional*):
-            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
-        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
-            Whether to use learnable elementwise affine parameters for normalization.
-        standardization_norm (`str`, *optional*, defaults to `"layer_norm"`): The type of pre-normalization to use. Can be `"layer_norm"` or `"rms_norm"`.
-        norm_eps (`float`, *optional*, defaults to 1e-5): Epsilon value for normalization layers.
-        qk_norm (`str`, *optional*, defaults to None):
-            Set to 'layer_norm' or `rms_norm` to perform query and key normalization.
-        final_dropout (`bool` *optional*, defaults to False):
-            Whether to apply a final dropout after the last feed-forward layer.
-        ff_inner_dim (`int`, *optional*): Dimension of the inner feed-forward layer. If not provided, defaults to `dim * 4`.
-        ff_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the feed-forward layer.
-        attention_out_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the attention output layer.
-        use_rope (`bool`, *optional*, defaults to `False`): Whether to use Rotary Position Embeddings (RoPE).
-        ffn_dim_mult (`int`, *optional*, defaults to 4): Multiplier for the inner dimension of the feed-forward layer.
-    """
-
-    def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        context_dim=None,
-        attn_precision=None,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
-        super().__init__()
-
-        # Define 3 blocks. Each block has its own normalization layer.
-        # 1. Self-Attn
-        self.attn1 = CrossAttention(
-            query_dim=dim,
-            heads=n_heads,
-            dim_head=d_head,
-            context_dim=None,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        # 3. Feed-forward
-        self.ff = FeedForward(
-            dim,
-            dim_out=dim,
-            glu=True,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-    def forward(self, hidden_states, attention_mask=None, pe=None) -> torch.FloatTensor:
-
-        # Notice that normalization is always applied before the real computation in the following blocks.
-
-        # 1. Normalization Before Self-Attention
-        norm_hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
-
-        norm_hidden_states = norm_hidden_states.squeeze(1)
-
-        # 2. Self-Attention
-        attn_output = self.attn1(norm_hidden_states, mask=attention_mask, pe=pe)
-
-        hidden_states = attn_output + hidden_states
-        if hidden_states.ndim == 4:
-            hidden_states = hidden_states.squeeze(1)
-
-        # 3. Normalization before Feed-Forward
-        norm_hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
-
-        # 4. Feed-forward
-        ff_output = self.ff(norm_hidden_states)
-
-        hidden_states = ff_output + hidden_states
-        if hidden_states.ndim == 4:
-            hidden_states = hidden_states.squeeze(1)
-
-        return hidden_states
-
-
-class Embeddings1DConnector(nn.Module):
-    _supports_gradient_checkpointing = True
-
-    def __init__(
-        self,
-        in_channels=128,
-        cross_attention_dim=2048,
-        attention_head_dim=128,
-        num_attention_heads=30,
-        num_layers=2,
-        positional_embedding_theta=10000.0,
-        positional_embedding_max_pos=[4096],
-        causal_temporal_positioning=False,
-        num_learnable_registers: Optional[int] = 128,
-        dtype=None,
-        device=None,
-        operations=None,
-        split_rope=False,
-        double_precision_rope=False,
-        **kwargs,
-    ):
-        super().__init__()
-        self.dtype = dtype
-        self.out_channels = in_channels
-        self.num_attention_heads = num_attention_heads
-        self.inner_dim = num_attention_heads * attention_head_dim
-        self.causal_temporal_positioning = causal_temporal_positioning
-        self.positional_embedding_theta = positional_embedding_theta
-        self.positional_embedding_max_pos = positional_embedding_max_pos
-        self.split_rope = split_rope
-        self.double_precision_rope = double_precision_rope
-        self.transformer_1d_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock1D(
-                    self.inner_dim,
-                    num_attention_heads,
-                    attention_head_dim,
-                    context_dim=cross_attention_dim,
-                    dtype=dtype,
-                    device=device,
-                    operations=operations,
-                )
-                for _ in range(num_layers)
-            ]
-        )
-
-        inner_dim = num_attention_heads * attention_head_dim
-        self.num_learnable_registers = num_learnable_registers
-        if self.num_learnable_registers:
-            self.learnable_registers = nn.Parameter(
-                torch.rand(
-                    self.num_learnable_registers, inner_dim, dtype=dtype, device=device
-                )
-                * 2.0
-                - 1.0
-            )
-
-    def get_fractional_positions(self, indices_grid):
-        fractional_positions = torch.stack(
-            [
-                indices_grid[:, i] / self.positional_embedding_max_pos[i]
-                for i in range(1)
-            ],
-            dim=-1,
-        )
-        return fractional_positions
-
-    def precompute_freqs(self, indices_grid, spacing):
-        source_dtype = indices_grid.dtype
-        dtype = (
-            torch.float32
-            if source_dtype in (torch.bfloat16, torch.float16)
-            else source_dtype
-        )
-
-        fractional_positions = self.get_fractional_positions(indices_grid)
-        indices = (
-            generate_freq_grid_np(
-                self.positional_embedding_theta,
-                indices_grid.shape[1],
-                self.inner_dim,
-            )
-            if self.double_precision_rope
-            else self.generate_freq_grid(spacing, dtype, fractional_positions.device)
-        ).to(device=fractional_positions.device)
-
-        if spacing == "exp_2":
-            freqs = (
-                (indices * fractional_positions.unsqueeze(-1))
-                .transpose(-1, -2)
-                .flatten(2)
-            )
-        else:
-            freqs = (
-                (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
-                .transpose(-1, -2)
-                .flatten(2)
-            )
-        return freqs
-
-    def generate_freq_grid(self, spacing, dtype, device):
-        dim = self.inner_dim
-        theta = self.positional_embedding_theta
-        n_pos_dims = 1
-        n_elem = 2 * n_pos_dims  # 2 for cos and sin e.g. x 3 = 6
-        start = 1
-        end = theta
-
-        if spacing == "exp":
-            indices = theta ** (torch.arange(0, dim, n_elem, device="cpu", dtype=torch.float32) / (dim - n_elem))
-            indices = indices.to(dtype=dtype, device=device)
-        elif spacing == "exp_2":
-            indices = 1.0 / theta ** (torch.arange(0, dim, n_elem, device=device) / dim)
-            indices = indices.to(dtype=dtype)
-        elif spacing == "linear":
-            indices = torch.linspace(
-                start, end, dim // n_elem, device=device, dtype=dtype
-            )
-        elif spacing == "sqrt":
-            indices = torch.linspace(
-                start**2, end**2, dim // n_elem, device=device, dtype=dtype
-            ).sqrt()
-
-        indices = indices * math.pi / 2
-
-        return indices
-
-    def precompute_freqs_cis(self, indices_grid, spacing="exp"):
-        dim = self.inner_dim
-        n_elem = 2  # 2 because of cos and sin
-        freqs = self.precompute_freqs(indices_grid, spacing)
-        if self.split_rope:
-            expected_freqs = dim // 2
-            current_freqs = freqs.shape[-1]
-            pad_size = expected_freqs - current_freqs
-            cos_freq, sin_freq = split_freqs_cis(
-                freqs, pad_size, self.num_attention_heads
-            )
-        else:
-            cos_freq, sin_freq = interleaved_freqs_cis(freqs, dim % n_elem)
-        return cos_freq.to(self.dtype), sin_freq.to(self.dtype), self.split_rope
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ):
-        """
-        The [`Transformer2DModel`] forward method.
-
-        Args:
-            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
-                Input `hidden_states`.
-            indices_grid (`torch.LongTensor` of shape `(batch size, 3, num latent pixels)`):
-            attention_mask ( `torch.Tensor`, *optional*):
-                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
-                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
-                negative values to the attention scores corresponding to "discard" tokens.
-        Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
-        """
-        # 1. Input
-
-        if self.num_learnable_registers:
-            num_registers_duplications = math.ceil(
-                max(1024, hidden_states.shape[1]) / self.num_learnable_registers
-            )
-            learnable_registers = torch.tile(
-                self.learnable_registers.to(hidden_states), (num_registers_duplications, 1)
-            )
-
-            hidden_states = torch.cat((hidden_states, learnable_registers[hidden_states.shape[1]:].unsqueeze(0).repeat(hidden_states.shape[0], 1, 1)), dim=1)
-
-            if attention_mask is not None:
-                attention_mask = torch.zeros([1, 1, 1, hidden_states.shape[1]], dtype=attention_mask.dtype, device=attention_mask.device)
-
-        indices_grid = torch.arange(
-            hidden_states.shape[1], dtype=torch.float32, device=hidden_states.device
-        )
-        indices_grid = indices_grid[None, None, :]
-        freqs_cis = self.precompute_freqs_cis(indices_grid)
-
-        # 2. Blocks
-        for block_idx, block in enumerate(self.transformer_1d_blocks):
-            hidden_states = block(
-                hidden_states, attention_mask=attention_mask, pe=freqs_cis
-            )
-
-        # 3. Output
-        # if self.output_scale is not None:
-        #     hidden_states = hidden_states / self.output_scale
-
-        hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
-
-        return hidden_states, attention_mask
--- a/comfy/ldm/lightricks/latent_upsampler.py
+++ b/comfy/ldm/lightricks/latent_upsampler.py
@ -1,292 +0,0 @@
-from typing import Optional, Tuple
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-
-
-def _rational_for_scale(scale: float) -> Tuple[int, int]:
-    mapping = {0.75: (3, 4), 1.5: (3, 2), 2.0: (2, 1), 4.0: (4, 1)}
-    if float(scale) not in mapping:
-        raise ValueError(
-            f"Unsupported spatial_scale {scale}. Choose from {list(mapping.keys())}"
-        )
-    return mapping[float(scale)]
-
-
-class PixelShuffleND(nn.Module):
-    def __init__(self, dims, upscale_factors=(2, 2, 2)):
-        super().__init__()
-        assert dims in [1, 2, 3], "dims must be 1, 2, or 3"
-        self.dims = dims
-        self.upscale_factors = upscale_factors
-
-    def forward(self, x):
-        if self.dims == 3:
-            return rearrange(
-                x,
-                "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
-                p1=self.upscale_factors[0],
-                p2=self.upscale_factors[1],
-                p3=self.upscale_factors[2],
-            )
-        elif self.dims == 2:
-            return rearrange(
-                x,
-                "b (c p1 p2) h w -> b c (h p1) (w p2)",
-                p1=self.upscale_factors[0],
-                p2=self.upscale_factors[1],
-            )
-        elif self.dims == 1:
-            return rearrange(
-                x,
-                "b (c p1) f h w -> b c (f p1) h w",
-                p1=self.upscale_factors[0],
-            )
-
-
-class BlurDownsample(nn.Module):
-    """
-    Anti-aliased spatial downsampling by integer stride using a fixed separable binomial kernel.
-    Applies only on H,W. Works for dims=2 or dims=3 (per-frame).
-    """
-
-    def __init__(self, dims: int, stride: int):
-        super().__init__()
-        assert dims in (2, 3)
-        assert stride >= 1 and isinstance(stride, int)
-        self.dims = dims
-        self.stride = stride
-
-        # 5x5 separable binomial kernel [1,4,6,4,1] (outer product), normalized
-        k = torch.tensor([1.0, 4.0, 6.0, 4.0, 1.0])
-        k2d = k[:, None] @ k[None, :]
-        k2d = (k2d / k2d.sum()).float()  # shape (5,5)
-        self.register_buffer("kernel", k2d[None, None, :, :])  # (1,1,5,5)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.stride == 1:
-            return x
-
-        def _apply_2d(x2d: torch.Tensor) -> torch.Tensor:
-            # x2d: (B, C, H, W)
-            B, C, H, W = x2d.shape
-            weight = self.kernel.expand(C, 1, 5, 5)  # depthwise
-            x2d = F.conv2d(
-                x2d, weight=weight, bias=None, stride=self.stride, padding=2, groups=C
-            )
-            return x2d
-
-        if self.dims == 2:
-            return _apply_2d(x)
-        else:
-            # dims == 3: apply per-frame on H,W
-            b, c, f, h, w = x.shape
-            x = rearrange(x, "b c f h w -> (b f) c h w")
-            x = _apply_2d(x)
-            h2, w2 = x.shape[-2:]
-            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f, h=h2, w=w2)
-            return x
-
-
-class SpatialRationalResampler(nn.Module):
-    """
-    Fully-learned rational spatial scaling: up by 'num' via PixelShuffle, then anti-aliased
-    downsample by 'den' using fixed blur + stride. Operates on H,W only.
-
-    For dims==3, work per-frame for spatial scaling (temporal axis untouched).
-    """
-
-    def __init__(self, mid_channels: int, scale: float):
-        super().__init__()
-        self.scale = float(scale)
-        self.num, self.den = _rational_for_scale(self.scale)
-        self.conv = nn.Conv2d(
-            mid_channels, (self.num**2) * mid_channels, kernel_size=3, padding=1
-        )
-        self.pixel_shuffle = PixelShuffleND(2, upscale_factors=(self.num, self.num))
-        self.blur_down = BlurDownsample(dims=2, stride=self.den)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        b, c, f, h, w = x.shape
-        x = rearrange(x, "b c f h w -> (b f) c h w")
-        x = self.conv(x)
-        x = self.pixel_shuffle(x)
-        x = self.blur_down(x)
-        x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
-        return x
-
-
-class ResBlock(nn.Module):
-    def __init__(
-        self, channels: int, mid_channels: Optional[int] = None, dims: int = 3
-    ):
-        super().__init__()
-        if mid_channels is None:
-            mid_channels = channels
-
-        Conv = nn.Conv2d if dims == 2 else nn.Conv3d
-
-        self.conv1 = Conv(channels, mid_channels, kernel_size=3, padding=1)
-        self.norm1 = nn.GroupNorm(32, mid_channels)
-        self.conv2 = Conv(mid_channels, channels, kernel_size=3, padding=1)
-        self.norm2 = nn.GroupNorm(32, channels)
-        self.activation = nn.SiLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        residual = x
-        x = self.conv1(x)
-        x = self.norm1(x)
-        x = self.activation(x)
-        x = self.conv2(x)
-        x = self.norm2(x)
-        x = self.activation(x + residual)
-        return x
-
-
-class LatentUpsampler(nn.Module):
-    """
-    Model to spatially upsample VAE latents.
-
-    Args:
-        in_channels (`int`): Number of channels in the input latent
-        mid_channels (`int`): Number of channels in the middle layers
-        num_blocks_per_stage (`int`): Number of ResBlocks to use in each stage (pre/post upsampling)
-        dims (`int`): Number of dimensions for convolutions (2 or 3)
-        spatial_upsample (`bool`): Whether to spatially upsample the latent
-        temporal_upsample (`bool`): Whether to temporally upsample the latent
-    """
-
-    def __init__(
-        self,
-        in_channels: int = 128,
-        mid_channels: int = 512,
-        num_blocks_per_stage: int = 4,
-        dims: int = 3,
-        spatial_upsample: bool = True,
-        temporal_upsample: bool = False,
-        spatial_scale: float = 2.0,
-        rational_resampler: bool = False,
-    ):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.mid_channels = mid_channels
-        self.num_blocks_per_stage = num_blocks_per_stage
-        self.dims = dims
-        self.spatial_upsample = spatial_upsample
-        self.temporal_upsample = temporal_upsample
-        self.spatial_scale = float(spatial_scale)
-        self.rational_resampler = rational_resampler
-
-        Conv = nn.Conv2d if dims == 2 else nn.Conv3d
-
-        self.initial_conv = Conv(in_channels, mid_channels, kernel_size=3, padding=1)
-        self.initial_norm = nn.GroupNorm(32, mid_channels)
-        self.initial_activation = nn.SiLU()
-
-        self.res_blocks = nn.ModuleList(
-            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
-        )
-
-        if spatial_upsample and temporal_upsample:
-            self.upsampler = nn.Sequential(
-                nn.Conv3d(mid_channels, 8 * mid_channels, kernel_size=3, padding=1),
-                PixelShuffleND(3),
-            )
-        elif spatial_upsample:
-            if rational_resampler:
-                self.upsampler = SpatialRationalResampler(
-                    mid_channels=mid_channels, scale=self.spatial_scale
-                )
-            else:
-                self.upsampler = nn.Sequential(
-                    nn.Conv2d(mid_channels, 4 * mid_channels, kernel_size=3, padding=1),
-                    PixelShuffleND(2),
-                )
-        elif temporal_upsample:
-            self.upsampler = nn.Sequential(
-                nn.Conv3d(mid_channels, 2 * mid_channels, kernel_size=3, padding=1),
-                PixelShuffleND(1),
-            )
-        else:
-            raise ValueError(
-                "Either spatial_upsample or temporal_upsample must be True"
-            )
-
-        self.post_upsample_res_blocks = nn.ModuleList(
-            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
-        )
-
-        self.final_conv = Conv(mid_channels, in_channels, kernel_size=3, padding=1)
-
-    def forward(self, latent: torch.Tensor) -> torch.Tensor:
-        b, c, f, h, w = latent.shape
-
-        if self.dims == 2:
-            x = rearrange(latent, "b c f h w -> (b f) c h w")
-            x = self.initial_conv(x)
-            x = self.initial_norm(x)
-            x = self.initial_activation(x)
-
-            for block in self.res_blocks:
-                x = block(x)
-
-            x = self.upsampler(x)
-
-            for block in self.post_upsample_res_blocks:
-                x = block(x)
-
-            x = self.final_conv(x)
-            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
-        else:
-            x = self.initial_conv(latent)
-            x = self.initial_norm(x)
-            x = self.initial_activation(x)
-
-            for block in self.res_blocks:
-                x = block(x)
-
-            if self.temporal_upsample:
-                x = self.upsampler(x)
-                x = x[:, :, 1:, :, :]
-            else:
-                if isinstance(self.upsampler, SpatialRationalResampler):
-                    x = self.upsampler(x)
-                else:
-                    x = rearrange(x, "b c f h w -> (b f) c h w")
-                    x = self.upsampler(x)
-                    x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
-
-            for block in self.post_upsample_res_blocks:
-                x = block(x)
-
-            x = self.final_conv(x)
-
-        return x
-
-    @classmethod
-    def from_config(cls, config):
-        return cls(
-            in_channels=config.get("in_channels", 4),
-            mid_channels=config.get("mid_channels", 128),
-            num_blocks_per_stage=config.get("num_blocks_per_stage", 4),
-            dims=config.get("dims", 2),
-            spatial_upsample=config.get("spatial_upsample", True),
-            temporal_upsample=config.get("temporal_upsample", False),
-            spatial_scale=config.get("spatial_scale", 2.0),
-            rational_resampler=config.get("rational_resampler", False),
-        )
-
-    def config(self):
-        return {
-            "_class_name": "LatentUpsampler",
-            "in_channels": self.in_channels,
-            "mid_channels": self.mid_channels,
-            "num_blocks_per_stage": self.num_blocks_per_stage,
-            "dims": self.dims,
-            "spatial_upsample": self.spatial_upsample,
-            "temporal_upsample": self.temporal_upsample,
-            "spatial_scale": self.spatial_scale,
-            "rational_resampler": self.rational_resampler,
-        }
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@ -1,47 +1,14 @@
-from abc import ABC, abstractmethod
-from enum import Enum
-import functools
-import math
-from typing import Dict, Optional, Tuple
-
-from einops import rearrange
-import numpy as np
 import torch
 from torch import nn
 import comfy.patcher_extension
 import comfy.ldm.modules.attention
 import comfy.ldm.common_dit
+from einops import rearrange
+import math
+from typing import Dict, Optional, Tuple

 from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords

-def _log_base(x, base):
-    return np.log(x) / np.log(base)
-
-class LTXRopeType(str, Enum):
-    INTERLEAVED = "interleaved"
-    SPLIT = "split"
-
-    KEY = "rope_type"
-
-    @classmethod
-    def from_dict(cls, kwargs, default=None):
-        if default is None:
-            default = cls.INTERLEAVED
-        return cls(kwargs.get(cls.KEY, default))
-
-
-class LTXFrequenciesPrecision(str, Enum):
-    FLOAT32 = "float32"
-    FLOAT64 = "float64"
-
-    KEY = "frequencies_precision"
-
-    @classmethod
-    def from_dict(cls, kwargs, default=None):
-        if default is None:
-            default = cls.FLOAT32
-        return cls(kwargs.get(cls.KEY, default))
-

 def get_timestep_embedding(
    timesteps: torch.Tensor,
@ -73,7 +40,9 @@ def get_timestep_embedding(
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
-    exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
    exponent = exponent / (half_dim - downscale_freq_shift)

    emb = torch.exp(exponent)
@ -105,9 +74,7 @@ class TimestepEmbedding(nn.Module):
        post_act_fn: Optional[str] = None,
        cond_proj_dim=None,
        sample_proj_bias=True,
-        dtype=None,
-        device=None,
-        operations=None,
+        dtype=None, device=None, operations=None,
    ):
        super().__init__()

@ -124,9 +91,7 @@ class TimestepEmbedding(nn.Module):
            time_embed_dim_out = out_dim
        else:
            time_embed_dim_out = time_embed_dim
-        self.linear_2 = operations.Linear(
-            time_embed_dim, time_embed_dim_out, sample_proj_bias, dtype=dtype, device=device
-        )
+        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias, dtype=dtype, device=device)

        if post_act_fn is None:
            self.post_act = None
@ -175,22 +140,12 @@ class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    """

-    def __init__(
-        self,
-        embedding_dim,
-        size_emb_dim,
-        use_additional_conditions: bool = False,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
+    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
        super().__init__()

        self.outdim = size_emb_dim
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(
-            in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, device=device, operations=operations
-        )
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, device=device, operations=operations)

    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
        timesteps_proj = self.time_proj(timestep)
@ -209,22 +164,15 @@ class AdaLayerNormSingle(nn.Module):
        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
    """

-    def __init__(
-        self, embedding_dim: int, embedding_coefficient: int = 6, use_additional_conditions: bool = False, dtype=None, device=None, operations=None
-    ):
+    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
        super().__init__()

        self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
-            embedding_dim,
-            size_emb_dim=embedding_dim // 3,
-            use_additional_conditions=use_additional_conditions,
-            dtype=dtype,
-            device=device,
-            operations=operations,
+            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions, dtype=dtype, device=device, operations=operations
        )

        self.silu = nn.SiLU()
-        self.linear = operations.Linear(embedding_dim, embedding_coefficient * embedding_dim, bias=True, dtype=dtype, device=device)
+        self.linear = operations.Linear(embedding_dim, 6 * embedding_dim, bias=True, dtype=dtype, device=device)

    def forward(
        self,
@ -238,7 +186,6 @@ class AdaLayerNormSingle(nn.Module):
        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
        return self.linear(self.silu(embedded_timestep)), embedded_timestep

-
 class PixArtAlphaTextProjection(nn.Module):
    """
    Projects caption embeddings. Also handles dropout for classifier-free guidance.
@ -246,24 +193,18 @@ class PixArtAlphaTextProjection(nn.Module):
    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    """

-    def __init__(
-        self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh", dtype=None, device=None, operations=None
-    ):
+    def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh", dtype=None, device=None, operations=None):
        super().__init__()
        if out_features is None:
            out_features = hidden_size
-        self.linear_1 = operations.Linear(
-            in_features=in_features, out_features=hidden_size, bias=True, dtype=dtype, device=device
-        )
+        self.linear_1 = operations.Linear(in_features=in_features, out_features=hidden_size, bias=True, dtype=dtype, device=device)
        if act_fn == "gelu_tanh":
            self.act_1 = nn.GELU(approximate="tanh")
        elif act_fn == "silu":
            self.act_1 = nn.SiLU()
        else:
            raise ValueError(f"Unknown activation function: {act_fn}")
-        self.linear_2 = operations.Linear(
-            in_features=hidden_size, out_features=out_features, bias=True, dtype=dtype, device=device
-        )
+        self.linear_2 = operations.Linear(in_features=hidden_size, out_features=out_features, bias=True, dtype=dtype, device=device)

    def forward(self, caption):
        hidden_states = self.linear_1(caption)
@ -282,28 +223,25 @@ class GELU_approx(nn.Module):


 class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out, mult=4, glu=False, dropout=0.0, dtype=None, device=None, operations=None):
+    def __init__(self, dim, dim_out, mult=4, glu=False, dropout=0., dtype=None, device=None, operations=None):
        super().__init__()
        inner_dim = int(dim * mult)
        project_in = GELU_approx(dim, inner_dim, dtype=dtype, device=device, operations=operations)

        self.net = nn.Sequential(
-            project_in, nn.Dropout(dropout), operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
+            project_in,
+            nn.Dropout(dropout),
+            operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
        )

    def forward(self, x):
        return self.net(x)

-def apply_rotary_emb(input_tensor, freqs_cis):
-    cos_freqs, sin_freqs = freqs_cis[0], freqs_cis[1]
-    split_pe = freqs_cis[2] if len(freqs_cis) > 2 else False
-    return (
-        apply_split_rotary_emb(input_tensor, cos_freqs, sin_freqs)
-        if split_pe else
-        apply_interleaved_rotary_emb(input_tensor, cos_freqs, sin_freqs)
-    )

-def apply_interleaved_rotary_emb(input_tensor, cos_freqs, sin_freqs):  # TODO: remove duplicate funcs and pick the best/fastest one
+def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and pick the best/fastest one
+    cos_freqs = freqs_cis[0]
+    sin_freqs = freqs_cis[1]
+
    t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2)
    t1, t2 = t_dup.unbind(dim=-1)
    t_dup = torch.stack((-t2, t1), dim=-1)
@ -313,37 +251,9 @@ def apply_interleaved_rotary_emb(input_tensor, cos_freqs, sin_freqs):  # TODO: r

    return out

-def apply_split_rotary_emb(input_tensor, cos, sin):
-    needs_reshape = False
-    if input_tensor.ndim != 4 and cos.ndim == 4:
-        B, H, T, _ = cos.shape
-        input_tensor = input_tensor.reshape(B, T, H, -1).swapaxes(1, 2)
-        needs_reshape = True
-    split_input = rearrange(input_tensor, "... (d r) -> ... d r", d=2)
-    first_half_input = split_input[..., :1, :]
-    second_half_input = split_input[..., 1:, :]
-    output = split_input * cos.unsqueeze(-2)
-    first_half_output = output[..., :1, :]
-    second_half_output = output[..., 1:, :]
-    first_half_output.addcmul_(-sin.unsqueeze(-2), second_half_input)
-    second_half_output.addcmul_(sin.unsqueeze(-2), first_half_input)
-    output = rearrange(output, "... d r -> ... (d r)")
-    return output.swapaxes(1, 2).reshape(B, T, -1) if needs_reshape else output
-

 class CrossAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        attn_precision=None,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = query_dim if context_dim is None else context_dim
@ -359,11 +269,9 @@ class CrossAttention(nn.Module):
        self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
        self.to_v = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)

-        self.to_out = nn.Sequential(
-            operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout)
-        )
+        self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))

-    def forward(self, x, context=None, mask=None, pe=None, k_pe=None, transformer_options={}):
+    def forward(self, x, context=None, mask=None, pe=None):
        q = self.to_q(x)
        context = x if context is None else context
        k = self.to_k(context)
@ -374,505 +282,156 @@ class CrossAttention(nn.Module):

        if pe is not None:
            q = apply_rotary_emb(q, pe)
-            k = apply_rotary_emb(k, pe if k_pe is None else k_pe)
+            k = apply_rotary_emb(k, pe)

        if mask is None:
-            out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
+            out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision)
        else:
-            out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
+            out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision)
        return self.to_out(out)


 class BasicTransformerBlock(nn.Module):
-    def __init__(
-        self, dim, n_heads, d_head, context_dim=None, attn_precision=None, dtype=None, device=None, operations=None
-    ):
+    def __init__(self, dim, n_heads, d_head, context_dim=None, attn_precision=None, dtype=None, device=None, operations=None):
        super().__init__()

        self.attn_precision = attn_precision
-        self.attn1 = CrossAttention(
-            query_dim=dim,
-            heads=n_heads,
-            dim_head=d_head,
-            context_dim=None,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
+        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, context_dim=None, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)
        self.ff = FeedForward(dim, dim_out=dim, glu=True, dtype=dtype, device=device, operations=operations)

-        self.attn2 = CrossAttention(
-            query_dim=dim,
-            context_dim=context_dim,
-            heads=n_heads,
-            dim_head=d_head,
-            attn_precision=self.attn_precision,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
+        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)

        self.scale_shift_table = nn.Parameter(torch.empty(6, dim, device=device, dtype=dtype))

-    def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}):
+    def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None):
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2)

-        attn1_input = comfy.ldm.common_dit.rms_norm(x)
-        attn1_input = torch.addcmul(attn1_input, attn1_input, scale_msa).add_(shift_msa)
-        attn1_input = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options)
-        x.addcmul_(attn1_input, gate_msa)
-        del attn1_input
+        x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe) * gate_msa

-        x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options)
+        x += self.attn2(x, context=context, mask=attention_mask)

-        y = comfy.ldm.common_dit.rms_norm(x)
-        y = torch.addcmul(y, y, scale_mlp).add_(shift_mlp)
-        x.addcmul_(self.ff(y), gate_mlp)
+        y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp
+        x += self.ff(y) * gate_mlp

        return x

 def get_fractional_positions(indices_grid, max_pos):
-    n_pos_dims = indices_grid.shape[1]
-    assert n_pos_dims == len(max_pos), f'Number of position dimensions ({n_pos_dims}) must match max_pos length ({len(max_pos)})'
    fractional_positions = torch.stack(
-        [indices_grid[:, i] / max_pos[i] for i in range(n_pos_dims)],
-        axis=-1,
+        [
+            indices_grid[:, i] / max_pos[i]
+            for i in range(3)
+        ],
+        dim=-1,
    )
    return fractional_positions


-@functools.lru_cache(maxsize=5)
-def generate_freq_grid_np(positional_embedding_theta, positional_embedding_max_pos_count, inner_dim, _ = None):
-    theta = positional_embedding_theta
+def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[20, 2048, 2048]):
+    dtype = torch.float32 #self.dtype
+
+    fractional_positions = get_fractional_positions(indices_grid, max_pos)
+
    start = 1
    end = theta
-
-    n_elem = 2 * positional_embedding_max_pos_count
-    pow_indices = np.power(
-        theta,
-        np.linspace(
-            _log_base(start, theta),
-            _log_base(end, theta),
-            inner_dim // n_elem,
-            dtype=np.float64,
-        ),
-    )
-    return torch.tensor(pow_indices * math.pi / 2, dtype=torch.float32)
-
-def generate_freq_grid_pytorch(positional_embedding_theta, positional_embedding_max_pos_count, inner_dim, device):
-    theta = positional_embedding_theta
-    start = 1
-    end = theta
-    n_elem = 2 * positional_embedding_max_pos_count
+    device = fractional_positions.device

    indices = theta ** (
        torch.linspace(
            math.log(start, theta),
            math.log(end, theta),
-            inner_dim // n_elem,
+            dim // 6,
            device=device,
-            dtype=torch.float32,
+            dtype=dtype,
        )
    )
-    indices = indices.to(dtype=torch.float32)
+    indices = indices.to(dtype=dtype)

    indices = indices * math.pi / 2

-    return indices
-
-def generate_freqs(indices, indices_grid, max_pos, use_middle_indices_grid):
-    if use_middle_indices_grid:
-        assert(len(indices_grid.shape) == 4 and indices_grid.shape[-1] ==2)
-        indices_grid_start, indices_grid_end = indices_grid[..., 0], indices_grid[..., 1]
-        indices_grid = (indices_grid_start + indices_grid_end) / 2.0
-    elif len(indices_grid.shape) == 4:
-        indices_grid = indices_grid[..., 0]
-
-    # Get fractional positions and compute frequency indices
-    fractional_positions = get_fractional_positions(indices_grid, max_pos)
-    indices = indices.to(device=fractional_positions.device)
-
    freqs = (
        (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
        .transpose(-1, -2)
        .flatten(2)
    )
-    return freqs

-def interleaved_freqs_cis(freqs, pad_size):
    cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
    sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
-    if pad_size != 0:
-        cos_padding = torch.ones_like(cos_freq[:, :, : pad_size])
-        sin_padding = torch.zeros_like(cos_freq[:, :, : pad_size])
+    if dim % 6 != 0:
+        cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
+        sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
        cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
        sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
-    return cos_freq, sin_freq
+    return cos_freq.to(out_dtype), sin_freq.to(out_dtype)

-def split_freqs_cis(freqs, pad_size, num_attention_heads):
-    cos_freq = freqs.cos()
-    sin_freq = freqs.sin()

-    if pad_size != 0:
-        cos_padding = torch.ones_like(cos_freq[:, :, :pad_size])
-        sin_padding = torch.zeros_like(sin_freq[:, :, :pad_size])
+class LTXVModel(torch.nn.Module):
+    def __init__(self,
+                 in_channels=128,
+                 cross_attention_dim=2048,
+                 attention_head_dim=64,
+                 num_attention_heads=32,

-        cos_freq = torch.concatenate([cos_padding, cos_freq], axis=-1)
-        sin_freq = torch.concatenate([sin_padding, sin_freq], axis=-1)
+                 caption_channels=4096,
+                 num_layers=28,

-    # Reshape freqs to be compatible with multi-head attention
-    B , T, half_HD = cos_freq.shape

-    cos_freq = cos_freq.reshape(B, T, num_attention_heads, half_HD // num_attention_heads)
-    sin_freq = sin_freq.reshape(B, T, num_attention_heads, half_HD // num_attention_heads)
-
-    cos_freq = torch.swapaxes(cos_freq, 1, 2)  # (B,H,T,D//2)
-    sin_freq = torch.swapaxes(sin_freq, 1, 2)  # (B,H,T,D//2)
-    return cos_freq, sin_freq
-
-class LTXBaseModel(torch.nn.Module, ABC):
-    """
-    Abstract base class for LTX models (Lightricks Transformer models).
-
-    This class defines the common interface and shared functionality for all LTX models,
-    including LTXV (video) and LTXAV (audio-video) variants.
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        cross_attention_dim: int,
-        attention_head_dim: int,
-        num_attention_heads: int,
-        caption_channels: int,
-        num_layers: int,
-        positional_embedding_theta: float = 10000.0,
-        positional_embedding_max_pos: list = [20, 2048, 2048],
-        causal_temporal_positioning: bool = False,
-        vae_scale_factors: tuple = (8, 32, 32),
-        use_middle_indices_grid=False,
-        timestep_scale_multiplier = 1000.0,
-        dtype=None,
-        device=None,
-        operations=None,
-        **kwargs,
-    ):
+                 positional_embedding_theta=10000.0,
+                 positional_embedding_max_pos=[20, 2048, 2048],
+                 causal_temporal_positioning=False,
+                 vae_scale_factors=(8, 32, 32),
+                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.generator = None
        self.vae_scale_factors = vae_scale_factors
-        self.use_middle_indices_grid = use_middle_indices_grid
        self.dtype = dtype
-        self.in_channels = in_channels
-        self.cross_attention_dim = cross_attention_dim
-        self.attention_head_dim = attention_head_dim
-        self.num_attention_heads = num_attention_heads
-        self.caption_channels = caption_channels
-        self.num_layers = num_layers
-        self.positional_embedding_theta = positional_embedding_theta
-        self.positional_embedding_max_pos = positional_embedding_max_pos
-        self.split_positional_embedding = LTXRopeType.from_dict(kwargs)
-        self.freq_grid_generator = (
-            generate_freq_grid_np if LTXFrequenciesPrecision.from_dict(kwargs) == LTXFrequenciesPrecision.FLOAT64
-            else generate_freq_grid_pytorch
-        )
-        self.causal_temporal_positioning = causal_temporal_positioning
-        self.operations = operations
-        self.timestep_scale_multiplier = timestep_scale_multiplier
-
-        # Common dimensions
-        self.inner_dim = num_attention_heads * attention_head_dim
        self.out_channels = in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+        self.causal_temporal_positioning = causal_temporal_positioning

-        # Initialize common components
-        self._init_common_components(device, dtype)
-
-        # Initialize model-specific components
-        self._init_model_components(device, dtype, **kwargs)
-
-        # Initialize transformer blocks
-        self._init_transformer_blocks(device, dtype, **kwargs)
-
-        # Initialize output components
-        self._init_output_components(device, dtype)
-
-    def _init_common_components(self, device, dtype):
-        """Initialize components common to all LTX models
-        - patchify_proj: Linear projection for patchifying input
-        - adaln_single: AdaLN layer for timestep embedding
-        - caption_projection: Linear projection for caption embedding
-        """
-        self.patchify_proj = self.operations.Linear(
-            self.in_channels, self.inner_dim, bias=True, dtype=dtype, device=device
-        )
+        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)

        self.adaln_single = AdaLayerNormSingle(
-            self.inner_dim, use_additional_conditions=False, dtype=dtype, device=device, operations=self.operations
+            self.inner_dim, use_additional_conditions=False, dtype=dtype, device=device, operations=operations
        )

+        # self.adaln_single.linear = operations.Linear(self.inner_dim, 4 * self.inner_dim, bias=True, dtype=dtype, device=device)
+
        self.caption_projection = PixArtAlphaTextProjection(
-            in_features=self.caption_channels,
-            hidden_size=self.inner_dim,
-            dtype=dtype,
-            device=device,
-            operations=self.operations,
+            in_features=caption_channels, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations
        )

-    @abstractmethod
-    def _init_model_components(self, device, dtype, **kwargs):
-        """Initialize model-specific components. Must be implemented by subclasses.
-
-        Called from ``__init__`` right after ``_init_common_components``, with
-        the constructor's ``device``, ``dtype`` and remaining ``**kwargs``.
-        """
-        pass
-
-    @abstractmethod
-    def _init_transformer_blocks(self, device, dtype, **kwargs):
-        """Initialize transformer blocks. Must be implemented by subclasses.
-
-        Called from ``__init__`` after ``_init_model_components``, with the same
-        ``device``, ``dtype`` and ``**kwargs``.
-        """
-        pass
-
-    @abstractmethod
-    def _init_output_components(self, device, dtype):
-        """Initialize output components. Must be implemented by subclasses.
-
-        Called last from ``__init__``; unlike the other init hooks it receives
-        only ``device`` and ``dtype`` (no ``**kwargs``).
-        """
-        pass
-
-    @abstractmethod
-    def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
-        """Process input data. Must be implemented by subclasses.
-
-        ``_forward`` unpacks the result as ``(x, pixel_coords, additional_args)``
-        and merges ``additional_args`` into the keyword arguments it hands to
-        the later stages via ``dict.update``.
-        """
-        pass
-
-    @abstractmethod
-    def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, **kwargs):
-        """Process transformer blocks. Must be implemented by subclasses.
-
-        ``_forward`` calls this with ``transformer_options`` as a keyword plus
-        the merged keyword arguments, and feeds the return value to
-        ``_process_output``.
-        """
-        pass
-
-    @abstractmethod
-    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
-        """Process output data. Must be implemented by subclasses.
-
-        The value returned here is what ``_forward`` returns to its caller.
-        """
-        pass
-
-    def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
-        """Prepare timestep embeddings.
-
-        Args:
-            timestep: Timestep tensor; scaled by ``self.timestep_scale_multiplier``
-                and flattened before being passed to ``self.adaln_single``.
-            batch_size: Leading dimension used to reshape both outputs.
-            hidden_dtype: Forwarded to ``self.adaln_single`` as ``hidden_dtype``.
-            **kwargs: Only ``grid_mask`` is read here; other keys are ignored.
-
-        Returns:
-            Tuple ``(timestep, embedded_timestep)``, each viewed as
-            ``(batch_size, -1, last_dim)``.
-        """
-        grid_mask = kwargs.get("grid_mask", None)
-        if grid_mask is not None:
-            # Keep only the entries selected by grid_mask along dim 1.
-            timestep = timestep[:, grid_mask]
-
-        timestep = timestep * self.timestep_scale_multiplier
-        timestep, embedded_timestep = self.adaln_single(
-            timestep.flatten(),
-            {"resolution": None, "aspect_ratio": None},
-            batch_size=batch_size,
-            hidden_dtype=hidden_dtype,
-        )
-
-        # Second dimension is 1 or number of tokens (if timestep_per_token)
-        timestep = timestep.view(batch_size, -1, timestep.shape[-1])
-        embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.shape[-1])
-
-        return timestep, embedded_timestep
-
-    def _prepare_context(self, context, batch_size, x, attention_mask=None):
-        """Prepare context for transformer blocks.
-
-        When ``self.caption_projection`` is set, ``context`` is projected and
-        viewed as ``(batch_size, -1, x.shape[-1])``; otherwise it is returned
-        as-is. ``attention_mask`` is passed through untouched.
-
-        Returns:
-            Tuple ``(context, attention_mask)``.
-        """
-        if self.caption_projection is not None:
-            context = self.caption_projection(context)
-            context = context.view(batch_size, -1, x.shape[-1])
-
-        return context, attention_mask
-
-    def _precompute_freqs_cis(
-        self,
-        indices_grid,
-        dim,
-        out_dtype,
-        theta=10000.0,
-        max_pos=[20, 2048, 2048],
-        use_middle_indices_grid=False,
-        num_attention_heads=32,
-    ):
-        """Build the cos/sin rotary tables for ``indices_grid``.
-
-        The frequency grid comes from ``self.freq_grid_generator`` and is turned
-        into per-position frequencies by ``generate_freqs``. The layout of the
-        result depends on ``self.split_positional_embedding``: SPLIT uses
-        ``split_freqs_cis``, anything else uses ``interleaved_freqs_cis``.
-
-        NOTE(review): ``max_pos`` has a mutable list default. It is only passed
-        on to ``generate_freqs`` here -- confirm that helper does not mutate it.
-
-        Returns:
-            Tuple ``(cos_freq, sin_freq, split_mode)`` with both tensors cast to
-            ``out_dtype`` and ``split_mode`` a bool.
-        """
-        split_mode = self.split_positional_embedding == LTXRopeType.SPLIT
-        indices = self.freq_grid_generator(theta, indices_grid.shape[1], dim, indices_grid.device)
-        freqs = generate_freqs(indices, indices_grid, max_pos, use_middle_indices_grid)
-
-        if split_mode:
-            # Pad up to dim // 2 frequencies before splitting across heads.
-            expected_freqs = dim // 2
-            current_freqs = freqs.shape[-1]
-            pad_size = expected_freqs - current_freqs
-            cos_freq, sin_freq = split_freqs_cis(freqs, pad_size, num_attention_heads)
-        else:
-            # n_elem = 2 (cos and sin) times the number of position axes:
-            # 3 for (t, x, y), 1 for temporal only.
-            n_elem = 2 * indices_grid.shape[1]
-            cos_freq, sin_freq = interleaved_freqs_cis(freqs, dim % n_elem)
-        return cos_freq.to(out_dtype), sin_freq.to(out_dtype), split_mode
-
-    def _prepare_positional_embeddings(self, pixel_coords, frame_rate, x_dtype):
-        """Prepare positional embeddings.
-
-        Index 0 along dim 1 of the coordinates is divided by ``frame_rate``; the
-        result is passed to ``_precompute_freqs_cis`` with ``dim=self.inner_dim``.
-
-        NOTE(review): ``theta`` is not passed below, so ``_precompute_freqs_cis``
-        uses its own default instead of ``self.positional_embedding_theta`` --
-        confirm this is intended.
-
-        Returns:
-            Whatever ``_precompute_freqs_cis`` returns, with ``out_dtype=x_dtype``.
-        """
-        # NOTE(review): if pixel_coords is already float32, .to() returns the
-        # same tensor and the in-place scaling below also modifies the caller's
-        # tensor -- confirm callers do not reuse it.
-        fractional_coords = pixel_coords.to(torch.float32)
-        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
-        pe = self._precompute_freqs_cis(
-            fractional_coords,
-            dim=self.inner_dim,
-            out_dtype=x_dtype,
-            max_pos=self.positional_embedding_max_pos,
-            use_middle_indices_grid=self.use_middle_indices_grid,
-            num_attention_heads=self.num_attention_heads,
-        )
-        return pe
-
-    def _prepare_attention_mask(self, attention_mask, x_dtype):
-        """Prepare attention mask.
-
-        A non-floating-point mask is turned into an additive one: ``mask - 1``
-        is cast to ``x_dtype``, reshaped to ``(batch, 1, -1, last_dim)`` and
-        multiplied by ``torch.finfo(x_dtype).max``. Entries equal to 1 become 0
-        and entries equal to 0 become ``-finfo.max``.
-
-        ``None`` and floating-point masks are returned unchanged.
-        """
-        if attention_mask is not None and not torch.is_floating_point(attention_mask):
-            attention_mask = (attention_mask - 1).to(x_dtype).reshape(
-                (attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
-            ) * torch.finfo(x_dtype).max
-        return attention_mask
-
-    def forward(
-        self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, denoise_mask=None, **kwargs
-    ):
-        """
-        Forward pass for LTX models.
-
-        Runs ``_forward`` through a ``WrapperExecutor`` built from the wrappers
-        registered in ``transformer_options`` for ``WrappersMP.DIFFUSION_MODEL``.
-
-        Args:
-            x: Input tensor
-            timestep: Timestep tensor
-            context: Context tensor (e.g., text embeddings)
-            attention_mask: Attention mask tensor
-            frame_rate: Frame rate for temporal processing
-            transformer_options: Additional options for transformer blocks
-                (mutable ``{}`` default; it is only read in this method)
-            keyframe_idxs: Keyframe indices for temporal processing
-            denoise_mask: Passed through to ``_forward`` as a keyword argument
-            **kwargs: Additional keyword arguments
-
-        Returns:
-            Processed output tensor
-        """
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(
-                comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options
-            ),
-        ).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, denoise_mask=denoise_mask, **kwargs)
-
-    def _forward(
-        self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, denoise_mask=None, **kwargs
-    ):
-        """
-        Internal forward pass for LTX models.
-
-        Chains the subclass hooks: ``_process_input`` -> timestep/context/mask/
-        positional-embedding preparation -> ``_process_transformer_blocks`` ->
-        ``_process_output``.
-
-        Args:
-            x: Input tensor, or a list of tensors (dtype and batch size are then
-                taken from the first element)
-            timestep: Timestep tensor
-            context: Context tensor (e.g., text embeddings)
-            attention_mask: Attention mask tensor
-            frame_rate: Frame rate for temporal processing
-            transformer_options: Additional options for transformer blocks
-            keyframe_idxs: Keyframe indices for temporal processing
-            denoise_mask: Forwarded to ``_process_input``
-            **kwargs: Additional keyword arguments
-
-        Returns:
-            Processed output tensor
-        """
-        if isinstance(x, list):
-            input_dtype = x[0].dtype
-            batch_size = x[0].shape[0]
-        else:
-            input_dtype = x.dtype
-            batch_size = x.shape[0]
-        # Process input
-        # kwargs win over transformer_options when both define the same key.
-        merged_args = {**transformer_options, **kwargs}
-        x, pixel_coords, additional_args = self._process_input(x, keyframe_idxs, denoise_mask, **merged_args)
-        # Values produced by _process_input are made visible to later stages.
-        merged_args.update(additional_args)
-
-        # Prepare timestep and context
-        timestep, embedded_timestep = self._prepare_timestep(timestep, batch_size, input_dtype, **merged_args)
-        context, attention_mask = self._prepare_context(context, batch_size, x, attention_mask)
-
-        # Prepare attention mask and positional embeddings
-        attention_mask = self._prepare_attention_mask(attention_mask, input_dtype)
-        pe = self._prepare_positional_embeddings(pixel_coords, frame_rate, input_dtype)
-
-        # Process transformer blocks
-        x = self._process_transformer_blocks(
-            x, context, attention_mask, timestep, pe, transformer_options=transformer_options, **merged_args
-        )
-
-        # Process output
-        x = self._process_output(x, embedded_timestep, keyframe_idxs, **merged_args)
-        return x
-
-
-class LTXVModel(LTXBaseModel):
-    """LTXV model for video generation."""
-
-    def __init__(
-        self,
-        in_channels=128,
-        cross_attention_dim=2048,
-        attention_head_dim=64,
-        num_attention_heads=32,
-        caption_channels=4096,
-        num_layers=28,
-        positional_embedding_theta=10000.0,
-        positional_embedding_max_pos=[20, 2048, 2048],
-        causal_temporal_positioning=False,
-        vae_scale_factors=(8, 32, 32),
-        use_middle_indices_grid=False,
-        timestep_scale_multiplier = 1000.0,
-        dtype=None,
-        device=None,
-        operations=None,
-        **kwargs,
-    ):
-        """Supply LTXV defaults and hand every argument to the base constructor.
-
-        No extra state is set here; all arguments, including ``**kwargs``, are
-        forwarded unchanged to ``super().__init__``.
-
-        NOTE(review): ``positional_embedding_max_pos`` has a mutable list
-        default that is shared across instances -- confirm nothing mutates it.
-        """
-        super().__init__(
-            in_channels=in_channels,
-            cross_attention_dim=cross_attention_dim,
-            attention_head_dim=attention_head_dim,
-            num_attention_heads=num_attention_heads,
-            caption_channels=caption_channels,
-            num_layers=num_layers,
-            positional_embedding_theta=positional_embedding_theta,
-            positional_embedding_max_pos=positional_embedding_max_pos,
-            causal_temporal_positioning=causal_temporal_positioning,
-            vae_scale_factors=vae_scale_factors,
-            use_middle_indices_grid=use_middle_indices_grid,
-            timestep_scale_multiplier=timestep_scale_multiplier,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-            **kwargs,
-        )
-
-    def _init_model_components(self, device, dtype, **kwargs):
-        """Initialize LTXV-specific components.
-
-        Intentionally a no-op: all arguments are ignored.
-        """
-        # No additional components needed for LTXV beyond base class
-        pass
-
-    def _init_transformer_blocks(self, device, dtype, **kwargs):
-        """Initialize transformer blocks for LTXV."""
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    self.inner_dim,
-                    self.num_attention_heads,
-                    self.attention_head_dim,
-                    context_dim=self.cross_attention_dim,
-                    dtype=dtype,
-                    device=device,
-                    operations=self.operations,
+                    num_attention_heads,
+                    attention_head_dim,
+                    context_dim=cross_attention_dim,
+                    # attn_precision=attn_precision,
+                    dtype=dtype, device=device, operations=operations
                )
-                for _ in range(self.num_layers)
+                for d in range(num_layers)
            ]
        )

-    def _init_output_components(self, device, dtype):
-        """Initialize output components for LTXV."""
        self.scale_shift_table = nn.Parameter(torch.empty(2, self.inner_dim, dtype=dtype, device=device))
-        self.norm_out = self.operations.LayerNorm(
-            self.inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device
-        )
-        self.proj_out = self.operations.Linear(self.inner_dim, self.out_channels, dtype=dtype, device=device)
-        self.patchifier = SymmetricPatchifier(1, start_end=True)
+        self.norm_out = operations.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.proj_out = operations.Linear(self.inner_dim, self.out_channels, dtype=dtype, device=device)
+
+        self.patchifier = SymmetricPatchifier(1)
+
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
+        """Run ``_forward`` through the wrapper executor.
+
+        Wrappers registered for ``WrappersMP.DIFFUSION_MODEL`` are looked up in
+        ``transformer_options``; every argument is then passed on unchanged.
+        ``transformer_options`` has a mutable ``{}`` default that is only read
+        in this method.
+        """
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
+        ).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, **kwargs)
+
+    def _forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
+        patches_replace = transformer_options.get("patches_replace", {})
+
+        orig_shape = list(x.shape)

-    def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
-        """Process input for LTXV."""
-        additional_args = {"orig_shape": list(x.shape)}
        x, latent_coords = self.patchifier.patchify(x)
        pixel_coords = latent_to_pixel_coords(
            latent_coords=latent_coords,
@ -880,36 +439,50 @@ class LTXVModel(LTXBaseModel):
            causal_fix=self.causal_temporal_positioning,
        )

-        grid_mask = None
        if keyframe_idxs is not None:
-            additional_args.update({ "orig_patchified_shape": list(x.shape)})
-            denoise_mask = self.patchifier.patchify(denoise_mask)[0]
-            grid_mask = ~torch.any(denoise_mask < 0, dim=-1)[0]
-            additional_args.update({"grid_mask": grid_mask})
-            x = x[:, grid_mask, :]
-            pixel_coords = pixel_coords[:, :, grid_mask, ...]
+            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs

-            kf_grid_mask = grid_mask[-keyframe_idxs.shape[2]:]
-            keyframe_idxs = keyframe_idxs[..., kf_grid_mask, :]
-            pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)

        x = self.patchify_proj(x)
-        return x, pixel_coords, additional_args
+        timestep = timestep * 1000.0
+
+        if attention_mask is not None and not torch.is_floating_point(attention_mask):
+            attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
+
+        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
+
+        batch_size = x.shape[0]
+        timestep, embedded_timestep = self.adaln_single(
+            timestep.flatten(),
+            {"resolution": None, "aspect_ratio": None},
+            batch_size=batch_size,
+            hidden_dtype=x.dtype,
+        )
+        # Second dimension is 1 or number of tokens (if timestep_per_token)
+        timestep = timestep.view(batch_size, -1, timestep.shape[-1])
+        embedded_timestep = embedded_timestep.view(
+            batch_size, -1, embedded_timestep.shape[-1]
+        )
+
+        # 2. Blocks
+        if self.caption_projection is not None:
+            batch_size = x.shape[0]
+            context = self.caption_projection(context)
+            context = context.view(
+                batch_size, -1, x.shape[-1]
+            )

-    def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, transformer_options={}, **kwargs):
-        """Process transformer blocks for LTXV."""
-        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
-
        for i, block in enumerate(self.transformer_blocks):
            if ("double_block", i) in blocks_replace:
-
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"], transformer_options=args["transformer_options"])
+                    out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"])
                    return out

-                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe}, {"original_block": block_wrap})
                x = out["img"]
            else:
                x = block(
@ -917,32 +490,19 @@ class LTXVModel(LTXBaseModel):
                    context=context,
                    attention_mask=attention_mask,
                    timestep=timestep,
-                    pe=pe,
-                    transformer_options=transformer_options,
+                    pe=pe
                )

-        return x
-
-    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
-        """Process output for LTXV."""
-        # Apply scale-shift modulation
+        # 3. Output
        scale_shift_values = (
            self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + embedded_timestep[:, :, None]
        )
        shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
-
        x = self.norm_out(x)
+        # Modulation
        x = x * (1 + scale) + shift
        x = self.proj_out(x)

-        if keyframe_idxs is not None:
-            grid_mask = kwargs["grid_mask"]
-            orig_patchified_shape = kwargs["orig_patchified_shape"]
-            full_x = torch.zeros(orig_patchified_shape, dtype=x.dtype, device=x.device)
-            full_x[:, grid_mask, :] = x
-            x = full_x
-        # Unpatchify to restore original dimensions
-        orig_shape = kwargs["orig_shape"]
        x = self.patchifier.unpatchify(
            latents=x,
            output_height=orig_shape[3],
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@ -21,23 +21,20 @@ def latent_to_pixel_coords(
    Returns:
        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
    """
-    shape = [1] * latent_coords.ndim
-    shape[1] = -1
    pixel_coords = (
        latent_coords
-        * torch.tensor(scale_factors, device=latent_coords.device).view(*shape)
+        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
    )
    if causal_fix:
        # Fix temporal scale for first frame to 1 due to causality
-        pixel_coords[:, 0, ...] = (pixel_coords[:, 0, ...] + 1 - scale_factors[0]).clamp(min=0)
+        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
    return pixel_coords


 class Patchifier(ABC):
-    def __init__(self, patch_size: int, start_end: bool=False):
+    def __init__(self, patch_size: int):
        super().__init__()
        self._patch_size = (1, patch_size, patch_size)
-        self.start_end = start_end

    @abstractmethod
    def patchify(
@ -74,23 +71,11 @@ class Patchifier(ABC):
            torch.arange(0, latent_width, self._patch_size[2], device=device),
            indexing="ij",
        )
-        latent_sample_coords_start = torch.stack(latent_sample_coords, dim=0)
-        delta = torch.tensor(self._patch_size, device=latent_sample_coords_start.device, dtype=latent_sample_coords_start.dtype)[:, None, None, None]
-        latent_sample_coords_end = latent_sample_coords_start + delta
-
-        latent_sample_coords_start = latent_sample_coords_start.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-        latent_sample_coords_start = rearrange(
-            latent_sample_coords_start, "b c f h w -> b c (f h w)", b=batch_size
+        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
+        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+        latent_coords = rearrange(
+            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
        )
-        if self.start_end:
-            latent_sample_coords_end = latent_sample_coords_end.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-            latent_sample_coords_end = rearrange(
-                latent_sample_coords_end, "b c f h w -> b c (f h w)", b=batch_size
-            )
-
-            latent_coords = torch.stack((latent_sample_coords_start, latent_sample_coords_end), dim=-1)
-        else:
-            latent_coords = latent_sample_coords_start
        return latent_coords


@ -130,61 +115,3 @@ class SymmetricPatchifier(Patchifier):
            q=self._patch_size[2],
        )
        return latents
-
-
-class AudioPatchifier(Patchifier):
-    """Patchifier for audio latents laid out as (batch, channels, time, freq).
-
-    ``patchify`` flattens channels and frequency into one token per time step
-    and also returns a timing value for every token; ``unpatchify`` restores
-    the original layout.
-    """
-
-    def __init__(self, patch_size: int,
-        sample_rate=16000,
-        hop_length=160,
-        audio_latent_downsample_factor=4,
-        is_causal=True,
-        start_end=False,
-        shift = 0
-    ):
-        """Store the timing parameters used by ``_get_audio_latent_time_in_sec``.
-
-        ``shift`` offsets the latent frame indices used when computing timings
-        in ``patchify``; ``start_end`` makes ``patchify`` return both start and
-        end timings per token.
-        """
-        super().__init__(patch_size, start_end=start_end)
-        self.hop_length = hop_length
-        self.sample_rate = sample_rate
-        self.audio_latent_downsample_factor = audio_latent_downsample_factor
-        self.is_causal = is_causal
-        self.shift = shift
-
-    def copy_with_shift(self, shift):
-        """Return a new ``AudioPatchifier`` with the same settings and a new ``shift``."""
-        # NOTE(review): Patchifier.__init__ only sets self._patch_size, a tuple
-        # (1, patch_size, patch_size). self.patch_size is presumably a property
-        # defined elsewhere on Patchifier -- confirm that it exists and that
-        # passing its value back in as patch_size is intended.
-        return AudioPatchifier(
-            self.patch_size, self.sample_rate, self.hop_length, self.audio_latent_downsample_factor,
-            self.is_causal, self.start_end, shift
-        )
-
-    # NOTE(review): ``device=torch.device`` sets the default to the class itself,
-    # not an annotation; ``device: torch.device`` was probably meant. Both calls
-    # in this class pass device explicitly, so the default is never used here.
-    def _get_audio_latent_time_in_sec(self, start_latent, end_latent: int, dtype: torch.dtype, device=torch.device):
-        """Map latent frame indices ``[start_latent, end_latent)`` to timings.
-
-        Each index is multiplied by the latent downsample factor to get a mel
-        frame index. In causal mode this is shifted by ``1 - factor`` and clipped
-        at 0. The result is scaled by ``hop_length / sample_rate``.
-        """
-        audio_latent_frame = torch.arange(start_latent, end_latent, dtype=dtype, device=device)
-        audio_mel_frame = audio_latent_frame * self.audio_latent_downsample_factor
-        if self.is_causal:
-            audio_mel_frame = (audio_mel_frame + 1 - self.audio_latent_downsample_factor).clip(min=0)
-        return audio_mel_frame * self.hop_length / self.sample_rate
-
-
-    def patchify(self, audio_latents: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Flatten audio latents to tokens and compute their timings.
-
-        Returns:
-            Tuple ``(tokens, timings)``. ``tokens`` is ``(b, t, c * f)``.
-            ``timings`` is ``(b, 1, t)``, or ``(b, 1, t, 2)`` holding
-            (start, end) when ``self.start_end`` is set.
-        """
-        # audio_latents: (batch, channels, time, freq)
-        b, _, t, _ = audio_latents.shape
-        audio_latents = rearrange(
-            audio_latents,
-            "b c t f -> b t (c f)",
-        )
-
-        audio_latents_start_timings = self._get_audio_latent_time_in_sec(self.shift, t + self.shift, torch.float32, audio_latents.device)
-        audio_latents_start_timings = audio_latents_start_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
-
-        if self.start_end:
-            # End timing of frame i is the start timing of frame i + 1.
-            audio_latents_end_timings = self._get_audio_latent_time_in_sec(self.shift + 1, t + self.shift + 1, torch.float32, audio_latents.device)
-            audio_latents_end_timings = audio_latents_end_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
-
-            audio_latents_timings = torch.stack([audio_latents_start_timings, audio_latents_end_timings], dim=-1)
-        else:
-            audio_latents_timings = audio_latents_start_timings
-        return audio_latents, audio_latents_timings
-
-    def unpatchify(self, audio_latents: torch.Tensor, channels: int, freq: int) -> torch.Tensor:
-        """Inverse of the token flattening in ``patchify``: ``(b, t, c*f)`` -> ``(b, c, t, f)``."""
-        # audio_latents: (batch, time, freq * channels)
-        audio_latents = rearrange(
-            audio_latents, "b t (c f) -> b c t f", c=channels, f=freq
-        )
-        return audio_latents
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@ -1,286 +0,0 @@
-import json
-from dataclasses import dataclass
-import math
-import torch
-import torchaudio
-
-import comfy.model_management
-import comfy.model_patcher
-import comfy.utils as utils
-from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
-from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
-from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
-    CausalityAxis,
-    CausalAudioAutoencoder,
-)
-from comfy.ldm.lightricks.vocoders.vocoder import Vocoder
-
-LATENT_DOWNSAMPLE_FACTOR = 4
-
-
-@dataclass(frozen=True)
-class AudioVAEComponentConfig:
-    """Container for model component configuration extracted from metadata."""
-
-    # Value of the "audio_vae" entry of the parsed config.
-    autoencoder: dict
-    # Value of the "vocoder" entry of the parsed config.
-    vocoder: dict
-
-    @classmethod
-    def from_metadata(cls, metadata: dict) -> "AudioVAEComponentConfig":
-        """Build the config from ``metadata["config"]``.
-
-        ``metadata["config"]`` may be a JSON string (parsed with ``json.loads``)
-        or an already-parsed mapping. Both the ``audio_vae`` and ``vocoder``
-        entries are required.
-
-        Raises:
-            AssertionError: if ``metadata`` is None, lacks ``"config"``, or
-                either required entry is missing.
-
-        NOTE(review): validation relies on ``assert``, which is skipped when
-        Python runs with ``-O`` -- confirm that is acceptable here.
-        """
-        assert metadata is not None and "config" in metadata, "Metadata is required for audio VAE"
-
-        raw_config = metadata["config"]
-        if isinstance(raw_config, str):
-            parsed_config = json.loads(raw_config)
-        else:
-            parsed_config = raw_config
-
-        audio_config = parsed_config.get("audio_vae")
-        vocoder_config = parsed_config.get("vocoder")
-
-        assert audio_config is not None, "Audio VAE config is required for audio VAE"
-        assert vocoder_config is not None, "Vocoder config is required for audio VAE"
-
-        return cls(autoencoder=audio_config, vocoder=vocoder_config)
-
-
-class ModelDeviceManager:
-    """Manages device placement and GPU residency for the composed model."""
-
-    def __init__(self, module: torch.nn.Module):
-        """Wrap ``module`` in a ``ModelPatcher``.
-
-        The load device comes from ``get_torch_device()`` and the offload device
-        from ``vae_offload_device()``.
-        """
-        load_device = comfy.model_management.get_torch_device()
-        offload_device = comfy.model_management.vae_offload_device()
-        self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
-
-    def ensure_model_loaded(self) -> None:
-        """Request free memory for the model on the load device, then load it."""
-        comfy.model_management.free_memory(
-            self.patcher.model_size(),
-            self.patcher.load_device,
-        )
-        comfy.model_management.load_model_gpu(self.patcher)
-
-    def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
-        """Return ``tensor`` moved to the patcher's load device."""
-        return tensor.to(self.patcher.load_device)
-
-    @property
-    def load_device(self):
-        """The patcher's load device."""
-        return self.patcher.load_device
-
-
-class AudioLatentNormalizer:
-    """Applies per-channel statistics in patch space and restores original layout."""
-
-    # NOTE(review): the parameter is spelled ``patchfier`` (missing "i"). It is
-    # part of the signature, so renaming it could break keyword callers.
-    def __init__(self, patchfier: AudioPatchifier, statistics_processor: torch.nn.Module):
-        """Keep the patchifier and the module providing ``normalize``/``un_normalize``."""
-        self.patchifier = patchfier
-        self.statistics = statistics_processor
-
-    def normalize(self, latents: torch.Tensor) -> torch.Tensor:
-        """Patchify, apply ``statistics.normalize``, and restore the input layout.
-
-        ``channels`` and ``freq`` are read from dims 1 and 3 of ``latents``; the
-        timings returned by ``patchify`` are discarded.
-        """
-        channels = latents.shape[1]
-        freq = latents.shape[3]
-        patched, _ = self.patchifier.patchify(latents)
-        normalized = self.statistics.normalize(patched)
-        return self.patchifier.unpatchify(normalized, channels=channels, freq=freq)
-
-    def denormalize(self, latents: torch.Tensor) -> torch.Tensor:
-        """Same round trip as ``normalize`` but applying ``statistics.un_normalize``."""
-        channels = latents.shape[1]
-        freq = latents.shape[3]
-        patched, _ = self.patchifier.patchify(latents)
-        denormalized = self.statistics.un_normalize(patched)
-        return self.patchifier.unpatchify(denormalized, channels=channels, freq=freq)
-
-
-class AudioPreprocessor:
-    """Prepares raw waveforms for the autoencoder by matching training conditions."""
-
-    def __init__(self, target_sample_rate: int, mel_bins: int, mel_hop_length: int, n_fft: int):
-        """Store the resampling target and the mel-spectrogram parameters."""
-        self.target_sample_rate = target_sample_rate
-        self.mel_bins = mel_bins
-        self.mel_hop_length = mel_hop_length
-        self.n_fft = n_fft
-
-    def resample(self, waveform: torch.Tensor, source_rate: int) -> torch.Tensor:
-        """Resample to ``target_sample_rate``; returns the input itself when rates match."""
-        if source_rate == self.target_sample_rate:
-            return waveform
-        return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate)
-
-    @staticmethod
-    def normalize_amplitude(
-        waveform: torch.Tensor, max_amplitude: float = 0.5, eps: float = 1e-5
-    ) -> torch.Tensor:
-        """Remove the mean along dim 2 and cap the peak at ``max_amplitude``.
-
-        The peak is taken over the whole tensor, not per item. When it is at or
-        below ``max_amplitude`` the scale is 1 and only the mean is removed.
-        """
-        waveform = waveform - waveform.mean(dim=2, keepdim=True)
-        # eps keeps the division below well-defined for an all-zero waveform.
-        peak = torch.max(torch.abs(waveform)) + eps
-        scale = peak.clamp(max=max_amplitude) / peak
-        return waveform * scale
-
-    def waveform_to_mel(
-        self, waveform: torch.Tensor, waveform_sample_rate: int, device
-    ) -> torch.Tensor:
-        """Convert a waveform to a log-mel spectrogram.
-
-        Steps: resample, normalize amplitude, apply a ``MelSpectrogram``
-        transform built on ``device``, take the log of the result clamped at
-        1e-5, then swap the last two axes.
-        """
-        waveform = self.resample(waveform, waveform_sample_rate)
-        waveform = self.normalize_amplitude(waveform)
-
-        # The transform is rebuilt on every call rather than cached.
-        mel_transform = torchaudio.transforms.MelSpectrogram(
-            sample_rate=self.target_sample_rate,
-            n_fft=self.n_fft,
-            win_length=self.n_fft,
-            hop_length=self.mel_hop_length,
-            f_min=0.0,
-            f_max=self.target_sample_rate / 2.0,
-            n_mels=self.mel_bins,
-            window_fn=torch.hann_window,
-            center=True,
-            pad_mode="reflect",
-            power=1.0,
-            mel_scale="slaney",
-            norm="slaney",
-        ).to(device)
-
-        mel = mel_transform(waveform)
-        # Clamp before the log so zero-energy bins do not produce -inf.
-        mel = torch.log(torch.clamp(mel, min=1e-5))
-        return mel.permute(0, 1, 3, 2).contiguous()
-
-
-class AudioVAE(torch.nn.Module):
-    """High-level Audio VAE wrapper exposing encode and decode entry points."""
-
-    def __init__(self, state_dict: dict, metadata: dict):
-        """Build the autoencoder, vocoder, normalizer and preprocessor.
-
-        Component configs come from ``metadata`` via
-        ``AudioVAEComponentConfig.from_metadata``. Weights come from
-        ``state_dict`` under the ``audio_vae.`` and ``vocoder.`` prefixes.
-        """
-        super().__init__()
-
-        component_config = AudioVAEComponentConfig.from_metadata(metadata)
-
-        # presumably filter_keys=True keeps only keys carrying the prefix and
-        # strips it -- verify against comfy.utils.state_dict_prefix_replace.
-        vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
-        vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
-
-        self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
-        self.vocoder = Vocoder(config=component_config.vocoder)
-
-        # strict=False: missing or unexpected keys do not raise here.
-        self.autoencoder.load_state_dict(vae_sd, strict=False)
-        self.vocoder.load_state_dict(vocoder_sd, strict=False)
-
-        autoencoder_config = self.autoencoder.get_config()
-        self.normalizer = AudioLatentNormalizer(
-            AudioPatchifier(
-                patch_size=1,
-                audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
-                sample_rate=autoencoder_config["sampling_rate"],
-                hop_length=autoencoder_config["mel_hop_length"],
-                is_causal=autoencoder_config["is_causal"],
-            ),
-            self.autoencoder.per_channel_statistics,
-        )
-
-        self.preprocessor = AudioPreprocessor(
-            target_sample_rate=autoencoder_config["sampling_rate"],
-            mel_bins=autoencoder_config["mel_bins"],
-            mel_hop_length=autoencoder_config["mel_hop_length"],
-            n_fft=autoencoder_config["n_fft"],
-        )
-
-        # Created last so the patcher wraps the fully assembled module.
-        self.device_manager = ModelDeviceManager(self)
-
-    def encode(self, audio: dict) -> torch.Tensor:
-        """Encode a waveform dictionary into normalized latent tensors.
-
-        Args:
-            audio: Mapping with ``"waveform"`` (tensor) and ``"sample_rate"``.
-
-        Returns:
-            Normalized latents, moved back to the waveform's original device.
-
-        Raises:
-            ValueError: if ``waveform.shape[1]`` differs from the encoder's
-                ``in_channels``.
-        """
-
-        waveform = audio["waveform"]
-        waveform_sample_rate = audio["sample_rate"]
-        input_device = waveform.device
-        # Ensure that Audio VAE is loaded on the correct device.
-        self.device_manager.ensure_model_loaded()
-
-        waveform = self.device_manager.move_to_load_device(waveform)
-        expected_channels = self.autoencoder.encoder.in_channels
-        if waveform.shape[1] != expected_channels:
-            raise ValueError(
-                f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}"
-            )
-
-        mel_spec = self.preprocessor.waveform_to_mel(
-            waveform, waveform_sample_rate, device=self.device_manager.load_device
-        )
-
-        latents = self.autoencoder.encode(mel_spec)
-        posterior = DiagonalGaussianDistribution(latents)
-        # Use the distribution's mode rather than drawing a sample.
-        latent_mode = posterior.mode()
-
-        normalized = self.normalizer.normalize(latent_mode)
-        return normalized.to(input_device)
-
-    def decode(self, latents: torch.Tensor) -> torch.Tensor:
-        """Decode normalized latent tensors into an audio waveform."""
-        original_shape = latents.shape
-
-        # Ensure that Audio VAE is loaded on the correct device.
-        self.device_manager.ensure_model_loaded()
-
-        latents = self.device_manager.move_to_load_device(latents)
-        latents = self.normalizer.denormalize(latents)
-
-        target_shape = self.target_shape_from_latents(original_shape)
-        mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
-
-        waveform = self.run_vocoder(mel_spec)
-        return self.device_manager.move_to_load_device(waveform)
-
-    def target_shape_from_latents(self, latents_shape):
-        batch, _, time, _ = latents_shape
-        target_length = time * LATENT_DOWNSAMPLE_FACTOR
-        if self.autoencoder.causality_axis != CausalityAxis.NONE:
-            target_length -= LATENT_DOWNSAMPLE_FACTOR - 1
-        return (
-            batch,
-            self.autoencoder.decoder.out_ch,
-            target_length,
-            self.autoencoder.mel_bins,
-        )
-
-    def num_of_latents_from_frames(self, frames_number: int, frame_rate: int) -> int:
-        return math.ceil((float(frames_number) / frame_rate) * self.latents_per_second)
-
-    def run_vocoder(self, mel_spec: torch.Tensor) -> torch.Tensor:
-        audio_channels = self.autoencoder.decoder.out_ch
-        vocoder_input = mel_spec.transpose(2, 3)
-
-        if audio_channels == 1:
-            vocoder_input = vocoder_input.squeeze(1)
-        elif audio_channels != 2:
-            raise ValueError(f"Unsupported audio_channels: {audio_channels}")
-
-        return self.vocoder(vocoder_input)
-
-    @property
-    def sample_rate(self) -> int:
-        return int(self.autoencoder.sampling_rate)
-
-    @property
-    def mel_hop_length(self) -> int:
-        return int(self.autoencoder.mel_hop_length)
-
-    @property
-    def mel_bins(self) -> int:
-        return int(self.autoencoder.mel_bins)
-
-    @property
-    def latent_channels(self) -> int:
-        return int(self.autoencoder.decoder.z_channels)
-
-    @property
-    def latent_frequency_bins(self) -> int:
-        return int(self.mel_bins // LATENT_DOWNSAMPLE_FACTOR)
-
-    @property
-    def latents_per_second(self) -> float:
-        return self.sample_rate / self.mel_hop_length / LATENT_DOWNSAMPLE_FACTOR
-
-    @property
-    def output_sample_rate(self) -> int:
-        output_rate = getattr(self.vocoder, "output_sample_rate", None)
-        if output_rate is not None:
-            return int(output_rate)
-        upsample_factor = getattr(self.vocoder, "upsample_factor", None)
-        if upsample_factor is None:
-            raise AttributeError(
-                "Vocoder is missing upsample_factor; cannot infer output sample rate"
-            )
-        return int(self.sample_rate * upsample_factor / self.mel_hop_length)
-
-    def memory_required(self, input_shape):
-        return self.device_manager.patcher.model_size()
--- a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
@ -1,909 +0,0 @@
-from __future__ import annotations
-import torch
-from torch import nn
-from torch.nn import functional as F
-from typing import Optional
-from enum import Enum
-from .pixel_norm import PixelNorm
-import comfy.ops
-import logging
-
-ops = comfy.ops.disable_weight_init
-
-
-class StringConvertibleEnum(Enum):
-    """
-    Base enum class that provides string-to-enum conversion functionality.
-
-    This mixin adds a str_to_enum() class method that handles conversion from
-    strings, None, or existing enum instances with case-insensitive matching.
-    """
-
-    @classmethod
-    def str_to_enum(cls, value):
-        """
-        Convert a string, enum instance, or None to the appropriate enum member.
-
-        Args:
-            value: Can be an enum instance of this class, a string, or None
-
-        Returns:
-            Enum member of this class
-
-        Raises:
-            ValueError: If the value cannot be converted to a valid enum member
-        """
-        # Already an enum instance of this class
-        if isinstance(value, cls):
-            return value
-
-        # None maps to NONE member if it exists
-        if value is None:
-            if hasattr(cls, "NONE"):
-                return cls.NONE
-            raise ValueError(f"{cls.__name__} does not have a NONE member to map None to")
-
-        # String conversion (case-insensitive)
-        if isinstance(value, str):
-            value_lower = value.lower()
-
-            # Try to match against enum values
-            for member in cls:
-                # Handle members with None values
-                if member.value is None:
-                    if value_lower == "none":
-                        return member
-                # Handle members with string values
-                elif isinstance(member.value, str) and member.value.lower() == value_lower:
-                    return member
-
-            # Build helpful error message with valid values
-            valid_values = []
-            for member in cls:
-                if member.value is None:
-                    valid_values.append("none")
-                elif isinstance(member.value, str):
-                    valid_values.append(member.value)
-
-            raise ValueError(f"Invalid {cls.__name__} string: '{value}'. " f"Valid values are: {valid_values}")
-
-        raise ValueError(
-            f"Cannot convert type {type(value).__name__} to {cls.__name__} enum. "
-            f"Expected string, None, or {cls.__name__} instance."
-        )
-
-
-class AttentionType(StringConvertibleEnum):
-    """Enum for specifying the attention mechanism type."""
-
-    VANILLA = "vanilla"
-    LINEAR = "linear"
-    NONE = "none"
-
-
-class CausalityAxis(StringConvertibleEnum):
-    """Enum for specifying the causality axis in causal convolutions."""
-
-    NONE = None
-    WIDTH = "width"
-    HEIGHT = "height"
-    WIDTH_COMPATIBILITY = "width-compatibility"
-
-
-def Normalize(in_channels, *, num_groups=32, normtype="group"):
-    if normtype == "group":
-        return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-    elif normtype == "pixel":
-        return PixelNorm(dim=1, eps=1e-6)
-    else:
-        raise ValueError(f"Invalid normalization type: {normtype}")
-
-
-class CausalConv2d(nn.Module):
-    """
-    A causal 2D convolution.
-
-    This layer ensures that the output at time `t` only depends on inputs
-    at time `t` and earlier. It achieves this by applying asymmetric padding
-    to the time dimension (width) before the convolution.
-    """
-
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        stride=1,
-        dilation=1,
-        groups=1,
-        bias=True,
-        causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
-    ):
-        super().__init__()
-
-        self.causality_axis = causality_axis
-
-        # Ensure kernel_size and dilation are tuples
-        kernel_size = nn.modules.utils._pair(kernel_size)
-        dilation = nn.modules.utils._pair(dilation)
-
-        # Calculate padding dimensions
-        pad_h = (kernel_size[0] - 1) * dilation[0]
-        pad_w = (kernel_size[1] - 1) * dilation[1]
-
-        # The padding tuple for F.pad is (pad_left, pad_right, pad_top, pad_bottom)
-        match self.causality_axis:
-            case CausalityAxis.NONE:
-                self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
-            case CausalityAxis.WIDTH | CausalityAxis.WIDTH_COMPATIBILITY:
-                self.padding = (pad_w, 0, pad_h // 2, pad_h - pad_h // 2)
-            case CausalityAxis.HEIGHT:
-                self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h, 0)
-            case _:
-                raise ValueError(f"Invalid causality_axis: {causality_axis}")
-
-        # The internal convolution layer uses no padding, as we handle it manually
-        self.conv = ops.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=0,
-            dilation=dilation,
-            groups=groups,
-            bias=bias,
-        )
-
-    def forward(self, x):
-        # Apply causal padding before convolution
-        x = F.pad(x, self.padding)
-        return self.conv(x)
-
-
-def make_conv2d(
-    in_channels,
-    out_channels,
-    kernel_size,
-    stride=1,
-    padding=None,
-    dilation=1,
-    groups=1,
-    bias=True,
-    causality_axis: Optional[CausalityAxis] = None,
-):
-    """
-    Create a 2D convolution layer that can be either causal or non-causal.
-
-    Args:
-        in_channels: Number of input channels
-        out_channels: Number of output channels
-        kernel_size: Size of the convolution kernel
-        stride: Convolution stride
-        padding: Padding (if None, will be calculated based on causal flag)
-        dilation: Dilation rate
-        groups: Number of groups for grouped convolution
-        bias: Whether to use bias
-        causality_axis: Dimension along which to apply causality.
-
-    Returns:
-        Either a regular Conv2d or CausalConv2d layer
-    """
-    if causality_axis is not None:
-        # For causal convolution, padding is handled internally by CausalConv2d
-        return CausalConv2d(in_channels, out_channels, kernel_size, stride, dilation, groups, bias, causality_axis)
-    else:
-        # For non-causal convolution, use symmetric padding if not specified
-        if padding is None:
-            if isinstance(kernel_size, int):
-                padding = kernel_size // 2
-            else:
-                padding = tuple(k // 2 for k in kernel_size)
-        return ops.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-        )
-
-
-class Upsample(nn.Module):
-    def __init__(self, in_channels, with_conv, causality_axis: CausalityAxis = CausalityAxis.HEIGHT):
-        super().__init__()
-        self.with_conv = with_conv
-        self.causality_axis = causality_axis
-        if self.with_conv:
-            self.conv = make_conv2d(in_channels, in_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
-
-    def forward(self, x):
-        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
-        if self.with_conv:
-            x = self.conv(x)
-            # Drop FIRST element in the causal axis to undo encoder's padding, while keeping the length 1 + 2 * n.
-            # For example, if the input is [0, 1, 2], after interpolation, the output is [0, 0, 1, 1, 2, 2].
-            # The causal convolution will pad the first element as [-, -, 0, 0, 1, 1, 2, 2],
-            # So the output elements rely on the following windows:
-            # 0: [-,-,0]
-            # 1: [-,0,0]
-            # 2: [0,0,1]
-            # 3: [0,1,1]
-            # 4: [1,1,2]
-            # 5: [1,2,2]
-            # Notice that the first and second elements in the output rely only on the first element in the input,
-            # while all other elements rely on two elements in the input.
-            # So we can drop the first element to undo the padding (rather than the last element).
-            # This is a no-op for non-causal convolutions.
-            match self.causality_axis:
-                case CausalityAxis.NONE:
-                    pass  # x remains unchanged
-                case CausalityAxis.HEIGHT:
-                    x = x[:, :, 1:, :]
-                case CausalityAxis.WIDTH:
-                    x = x[:, :, :, 1:]
-                case CausalityAxis.WIDTH_COMPATIBILITY:
-                    pass  # x remains unchanged
-                case _:
-                    raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
-
-        return x
-
-
-class Downsample(nn.Module):
-    """
-    A downsampling layer that can use either a strided convolution
-    or average pooling. Supports standard and causal padding for the
-    convolutional mode.
-    """
-
-    def __init__(self, in_channels, with_conv, causality_axis: CausalityAxis = CausalityAxis.WIDTH):
-        super().__init__()
-        self.with_conv = with_conv
-        self.causality_axis = causality_axis
-
-        if self.causality_axis != CausalityAxis.NONE and not self.with_conv:
-            raise ValueError("causality is only supported when `with_conv=True`.")
-
-        if self.with_conv:
-            # Do time downsampling here
-            # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = ops.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
-
-    def forward(self, x):
-        if self.with_conv:
-            # (pad_left, pad_right, pad_top, pad_bottom)
-            match self.causality_axis:
-                case CausalityAxis.NONE:
-                    pad = (0, 1, 0, 1)
-                case CausalityAxis.WIDTH:
-                    pad = (2, 0, 0, 1)
-                case CausalityAxis.HEIGHT:
-                    pad = (0, 1, 2, 0)
-                case CausalityAxis.WIDTH_COMPATIBILITY:
-                    pad = (1, 0, 0, 1)
-                case _:
-                    raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
-
-            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
-            x = self.conv(x)
-        else:
-            # This branch is only taken if with_conv=False, which implies causality_axis is NONE.
-            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
-
-        return x
-
-
-class ResnetBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout,
-        temb_channels=512,
-        norm_type="group",
-        causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
-    ):
-        super().__init__()
-        self.causality_axis = causality_axis
-
-        if self.causality_axis != CausalityAxis.NONE and norm_type == "group":
-            raise ValueError("Causal ResnetBlock with GroupNorm is not supported.")
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-
-        self.norm1 = Normalize(in_channels, normtype=norm_type)
-        self.non_linearity = nn.SiLU()
-        self.conv1 = make_conv2d(in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
-        if temb_channels > 0:
-            self.temb_proj = ops.Linear(temb_channels, out_channels)
-        self.norm2 = Normalize(out_channels, normtype=norm_type)
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = make_conv2d(out_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = make_conv2d(
-                    in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
-                )
-            else:
-                self.nin_shortcut = make_conv2d(
-                    in_channels, out_channels, kernel_size=1, stride=1, causality_axis=causality_axis
-                )
-
-    def forward(self, x, temb):
-        h = x
-        h = self.norm1(h)
-        h = self.non_linearity(h)
-        h = self.conv1(h)
-
-        if temb is not None:
-            h = h + self.temb_proj(self.non_linearity(temb))[:, :, None, None]
-
-        h = self.norm2(h)
-        h = self.non_linearity(h)
-        h = self.dropout(h)
-        h = self.conv2(h)
-
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                x = self.conv_shortcut(x)
-            else:
-                x = self.nin_shortcut(x)
-
-        return x + h
-
-
-class AttnBlock(nn.Module):
-    def __init__(self, in_channels, norm_type="group"):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = Normalize(in_channels, normtype=norm_type)
-        self.q = ops.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
-        self.k = ops.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
-        self.v = ops.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
-        self.proj_out = ops.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b, c, h, w = q.shape
-        q = q.reshape(b, c, h * w).contiguous()
-        q = q.permute(0, 2, 1).contiguous()  # b,hw,c
-        k = k.reshape(b, c, h * w).contiguous()  # b,c,hw
-        w_ = torch.bmm(q, k).contiguous()  # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
-        w_ = w_ * (int(c) ** (-0.5))
-        w_ = torch.nn.functional.softmax(w_, dim=2)
-
-        # attend to values
-        v = v.reshape(b, c, h * w).contiguous()
-        w_ = w_.permute(0, 2, 1).contiguous()  # b,hw,hw (first hw of k, second of q)
-        h_ = torch.bmm(v, w_).contiguous()  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
-        h_ = h_.reshape(b, c, h, w).contiguous()
-
-        h_ = self.proj_out(h_)
-
-        return x + h_
-
-
-def make_attn(in_channels, attn_type="vanilla", norm_type="group"):
-    # Convert string to enum if needed
-    attn_type = AttentionType.str_to_enum(attn_type)
-
-    if attn_type != AttentionType.NONE:
-        logging.info(f"making attention of type '{attn_type.value}' with {in_channels} in_channels")
-    else:
-        logging.info(f"making identity attention with {in_channels} in_channels")
-
-    match attn_type:
-        case AttentionType.VANILLA:
-            return AttnBlock(in_channels, norm_type=norm_type)
-        case AttentionType.NONE:
-            return nn.Identity(in_channels)
-        case AttentionType.LINEAR:
-            raise NotImplementedError(f"Attention type {attn_type.value} is not supported yet.")
-        case _:
-            raise ValueError(f"Unknown attention type: {attn_type}")
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        *,
-        ch,
-        out_ch,
-        ch_mult=(1, 2, 4, 8),
-        num_res_blocks,
-        attn_resolutions,
-        dropout=0.0,
-        resamp_with_conv=True,
-        in_channels,
-        resolution,
-        z_channels,
-        double_z=True,
-        attn_type="vanilla",
-        mid_block_add_attention=True,
-        norm_type="group",
-        causality_axis=CausalityAxis.WIDTH.value,
-        **ignore_kwargs,
-    ):
-        super().__init__()
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.z_channels = z_channels
-        self.double_z = double_z
-        self.norm_type = norm_type
-        # Convert string to enum if needed (for config loading)
-        causality_axis = CausalityAxis.str_to_enum(causality_axis)
-        self.attn_type = AttentionType.str_to_enum(attn_type)
-
-        # downsampling
-        self.conv_in = make_conv2d(
-            in_channels,
-            self.ch,
-            kernel_size=3,
-            stride=1,
-            causality_axis=causality_axis,
-        )
-
-        self.non_linearity = nn.SiLU()
-
-        curr_res = resolution
-        in_ch_mult = (1,) + tuple(ch_mult)
-        self.in_ch_mult = in_ch_mult
-        self.down = nn.ModuleList()
-
-        for i_level in range(self.num_resolutions):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_in = ch * in_ch_mult[i_level]
-            block_out = ch * ch_mult[i_level]
-
-            for _ in range(self.num_res_blocks):
-                block.append(
-                    ResnetBlock(
-                        in_channels=block_in,
-                        out_channels=block_out,
-                        temb_channels=self.temb_ch,
-                        dropout=dropout,
-                        norm_type=self.norm_type,
-                        causality_axis=causality_axis,
-                    )
-                )
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=self.attn_type, norm_type=self.norm_type))
-
-            down = nn.Module()
-            down.block = block
-            down.attn = attn
-            if i_level != self.num_resolutions - 1:
-                down.downsample = Downsample(block_in, resamp_with_conv, causality_axis=causality_axis)
-                curr_res = curr_res // 2
-            self.down.append(down)
-
-        # middle
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(
-            in_channels=block_in,
-            out_channels=block_in,
-            temb_channels=self.temb_ch,
-            dropout=dropout,
-            norm_type=self.norm_type,
-            causality_axis=causality_axis,
-        )
-        if mid_block_add_attention:
-            self.mid.attn_1 = make_attn(block_in, attn_type=self.attn_type, norm_type=self.norm_type)
-        else:
-            self.mid.attn_1 = nn.Identity()
-        self.mid.block_2 = ResnetBlock(
-            in_channels=block_in,
-            out_channels=block_in,
-            temb_channels=self.temb_ch,
-            dropout=dropout,
-            norm_type=self.norm_type,
-            causality_axis=causality_axis,
-        )
-
-        # end
-        self.norm_out = Normalize(block_in, normtype=self.norm_type)
-        self.conv_out = make_conv2d(
-            block_in,
-            2 * z_channels if double_z else z_channels,
-            kernel_size=3,
-            stride=1,
-            causality_axis=causality_axis,
-        )
-
-    def forward(self, x):
-        """
-        Forward pass through the encoder.
-
-        Args:
-            x: Input tensor of shape [batch, channels, time, n_mels]
-
-        Returns:
-            Encoded latent representation
-        """
-        feature_maps = [self.conv_in(x)]
-
-        # Process each resolution level (from high to low resolution)
-        for resolution_level in range(self.num_resolutions):
-            # Apply residual blocks at current resolution level
-            for block_idx in range(self.num_res_blocks):
-                # Apply ResNet block with optional timestep embedding
-                current_features = self.down[resolution_level].block[block_idx](feature_maps[-1], temb=None)
-
-                # Apply attention if configured for this resolution level
-                if len(self.down[resolution_level].attn) > 0:
-                    current_features = self.down[resolution_level].attn[block_idx](current_features)
-
-                # Store processed features
-                feature_maps.append(current_features)
-
-            # Downsample spatial dimensions (except at the final resolution level)
-            if resolution_level != self.num_resolutions - 1:
-                downsampled_features = self.down[resolution_level].downsample(feature_maps[-1])
-                feature_maps.append(downsampled_features)
-
-        # === MIDDLE PROCESSING PHASE ===
-        # Take the lowest resolution features for middle processing
-        bottleneck_features = feature_maps[-1]
-
-        # Apply first middle ResNet block
-        bottleneck_features = self.mid.block_1(bottleneck_features, temb=None)
-
-        # Apply middle attention block
-        bottleneck_features = self.mid.attn_1(bottleneck_features)
-
-        # Apply second middle ResNet block
-        bottleneck_features = self.mid.block_2(bottleneck_features, temb=None)
-
-        # === OUTPUT PHASE ===
-        # Normalize the bottleneck features
-        output_features = self.norm_out(bottleneck_features)
-
-        # Apply non-linearity (SiLU activation)
-        output_features = self.non_linearity(output_features)
-
-        # Final convolution to produce latent representation
-        # [batch, channels, time, n_mels] -> [batch, 2 * z_channels if double_z else z_channels, time, n_mels]
-        return self.conv_out(output_features)
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        *,
-        ch,
-        out_ch,
-        ch_mult=(1, 2, 4, 8),
-        num_res_blocks,
-        attn_resolutions,
-        dropout=0.0,
-        resamp_with_conv=True,
-        in_channels,
-        resolution,
-        z_channels,
-        give_pre_end=False,
-        tanh_out=False,
-        attn_type="vanilla",
-        mid_block_add_attention=True,
-        norm_type="group",
-        causality_axis=CausalityAxis.WIDTH.value,
-        **ignorekwargs,
-    ):
-        super().__init__()
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.out_ch = out_ch
-        self.give_pre_end = give_pre_end
-        self.tanh_out = tanh_out
-        self.norm_type = norm_type
-        self.z_channels = z_channels
-        # Convert string to enum if needed (for config loading)
-        causality_axis = CausalityAxis.str_to_enum(causality_axis)
-        self.attn_type = AttentionType.str_to_enum(attn_type)
-
-        # compute block_in and curr_res at lowest res
-        block_in = ch * ch_mult[self.num_resolutions - 1]
-        curr_res = resolution // 2 ** (self.num_resolutions - 1)
-        self.z_shape = (1, z_channels, curr_res, curr_res)
-
-        # z to block_in
-        self.conv_in = make_conv2d(z_channels, block_in, kernel_size=3, stride=1, causality_axis=causality_axis)
-
-        self.non_linearity = nn.SiLU()
-
-        # middle
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(
-            in_channels=block_in,
-            out_channels=block_in,
-            temb_channels=self.temb_ch,
-            dropout=dropout,
-            norm_type=self.norm_type,
-            causality_axis=causality_axis,
-        )
-        if mid_block_add_attention:
-            self.mid.attn_1 = make_attn(block_in, attn_type=self.attn_type, norm_type=self.norm_type)
-        else:
-            self.mid.attn_1 = nn.Identity()
-        self.mid.block_2 = ResnetBlock(
-            in_channels=block_in,
-            out_channels=block_in,
-            temb_channels=self.temb_ch,
-            dropout=dropout,
-            norm_type=self.norm_type,
-            causality_axis=causality_axis,
-        )
-
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_resolutions)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = ch * ch_mult[i_level]
-            for _ in range(self.num_res_blocks + 1):
-                block.append(
-                    ResnetBlock(
-                        in_channels=block_in,
-                        out_channels=block_out,
-                        temb_channels=self.temb_ch,
-                        dropout=dropout,
-                        norm_type=self.norm_type,
-                        causality_axis=causality_axis,
-                    )
-                )
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=self.attn_type, norm_type=self.norm_type))
-            up = nn.Module()
-            up.block = block
-            up.attn = attn
-            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv, causality_axis=causality_axis)
-                curr_res = curr_res * 2
-            self.up.insert(0, up)  # prepend to get consistent order
-
-        # end
-        self.norm_out = Normalize(block_in, normtype=self.norm_type)
-        self.conv_out = make_conv2d(block_in, out_ch, kernel_size=3, stride=1, causality_axis=causality_axis)
-
-    def _adjust_output_shape(self, decoded_output, target_shape):
-        """
-        Adjust output shape to match target dimensions for variable-length audio.
-
-        This function handles the common case where decoded audio spectrograms need to be
-        resized to match a specific target shape.
-
-        Args:
-            decoded_output: Tensor of shape (batch, channels, time, frequency)
-            target_shape: Target shape tuple (batch, channels, time, frequency)
-
-        Returns:
-            Tensor adjusted to match target_shape exactly
-        """
-        # Current output shape: (batch, channels, time, frequency)
-        _, _, current_time, current_freq = decoded_output.shape
-        _, target_channels, target_time, target_freq = target_shape
-
-        # Step 1: Crop first to avoid exceeding target dimensions
-        decoded_output = decoded_output[
-            :, :target_channels, : min(current_time, target_time), : min(current_freq, target_freq)
-        ]
-
-        # Step 2: Calculate padding needed for time and frequency dimensions
-        time_padding_needed = target_time - decoded_output.shape[2]
-        freq_padding_needed = target_freq - decoded_output.shape[3]
-
-        # Step 3: Apply padding if needed
-        if time_padding_needed > 0 or freq_padding_needed > 0:
-            # PyTorch padding format: (pad_left, pad_right, pad_top, pad_bottom)
-            # For audio: pad_left/right = frequency, pad_top/bottom = time
-            padding = (
-                0,
-                max(freq_padding_needed, 0),  # frequency padding (left, right)
-                0,
-                max(time_padding_needed, 0),  # time padding (top, bottom)
-            )
-            decoded_output = F.pad(decoded_output, padding)
-
-        # Step 4: Final safety crop to ensure exact target shape
-        decoded_output = decoded_output[:, :target_channels, :target_time, :target_freq]
-
-        return decoded_output
-
-    def get_config(self):
-        return {
-            "ch": self.ch,
-            "out_ch": self.out_ch,
-            "ch_mult": self.ch_mult,
-            "num_res_blocks": self.num_res_blocks,
-            "in_channels": self.in_channels,
-            "resolution": self.resolution,
-            "z_channels": self.z_channels,
-        }
-
-    def forward(self, latent_features, target_shape=None):
-        """
-        Decode latent features back to audio spectrograms.
-
-        Args:
-            latent_features: Encoded latent representation of shape (batch, channels, height, width)
-            target_shape: Optional target output shape (batch, channels, time, frequency)
-                         If provided, output will be cropped/padded to match this shape
-
-        Returns:
-            Reconstructed audio spectrogram of shape (batch, channels, time, frequency)
-        """
-        assert target_shape is not None, "Target shape is required for CausalAudioAutoencoder Decoder"
-
-        # Transform latent features to decoder's internal feature dimension
-        hidden_features = self.conv_in(latent_features)
-
-        # Middle processing
-        hidden_features = self.mid.block_1(hidden_features, temb=None)
-        hidden_features = self.mid.attn_1(hidden_features)
-        hidden_features = self.mid.block_2(hidden_features, temb=None)
-
-        # Upsampling
-        # Progressively increase spatial resolution from lowest to highest
-        for resolution_level in reversed(range(self.num_resolutions)):
-            # Apply residual blocks at current resolution level
-            for block_index in range(self.num_res_blocks + 1):
-                hidden_features = self.up[resolution_level].block[block_index](hidden_features, temb=None)
-
-                if len(self.up[resolution_level].attn) > 0:
-                    hidden_features = self.up[resolution_level].attn[block_index](hidden_features)
-
-            if resolution_level != 0:
-                hidden_features = self.up[resolution_level].upsample(hidden_features)
-
-        # Output
-        if self.give_pre_end:
-            # Return intermediate features before final processing (for debugging/analysis)
-            decoded_output = hidden_features
-        else:
-            # Standard output path: normalize, activate, and convert to output channels
-            # Final normalization layer
-            hidden_features = self.norm_out(hidden_features)
-
-            # Apply SiLU (Swish) activation function
-            hidden_features = self.non_linearity(hidden_features)
-
-            # Final convolution to map to output channels (typically 2 for stereo audio)
-            decoded_output = self.conv_out(hidden_features)
-
-            # Optional tanh activation to bound output values to [-1, 1] range
-            if self.tanh_out:
-                decoded_output = torch.tanh(decoded_output)
-
-        # Adjust shape for audio data
-        if target_shape is not None:
-            decoded_output = self._adjust_output_shape(decoded_output, target_shape)
-
-        return decoded_output
-
-
-class processor(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.register_buffer("std-of-means", torch.empty(128))
-        self.register_buffer("mean-of-means", torch.empty(128))
-
-    def un_normalize(self, x):
-        return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
-
-    def normalize(self, x):
-        return (x - self.get_buffer("mean-of-means").to(x)) / self.get_buffer("std-of-means").to(x)
-
-
-class CausalAudioAutoencoder(nn.Module):
-    def __init__(self, config=None):
-        super().__init__()
-
-        if config is None:
-            config = self._guess_config()
-
-        # Extract encoder and decoder configs from the new format
-        model_config = config.get("model", {}).get("params", {})
-        variables_config = config.get("variables", {})
-
-        self.sampling_rate = variables_config.get(
-            "sampling_rate",
-            model_config.get("sampling_rate", config.get("sampling_rate", 16000)),
-        )
-        encoder_config = model_config.get("encoder", model_config.get("ddconfig", {}))
-        decoder_config = model_config.get("decoder", encoder_config)
-
-        # Load mel spectrogram parameters
-        self.mel_bins = encoder_config.get("mel_bins", 64)
-        self.mel_hop_length = model_config.get("preprocessing", {}).get("stft", {}).get("hop_length", 160)
-        self.n_fft = model_config.get("preprocessing", {}).get("stft", {}).get("filter_length", 1024)
-
-        # Store causality configuration at VAE level (not just in encoder internals)
-        causality_axis_value = encoder_config.get("causality_axis", CausalityAxis.WIDTH.value)
-        self.causality_axis = CausalityAxis.str_to_enum(causality_axis_value)
-        self.is_causal = self.causality_axis == CausalityAxis.HEIGHT
-
-        self.encoder = Encoder(**encoder_config)
-        self.decoder = Decoder(**decoder_config)
-
-        self.per_channel_statistics = processor()
-
-    def _guess_config(self):
-        encoder_config = {
-            # Required parameters - based on ltx-video-av-1679000 model metadata
-            "ch": 128,
-            "out_ch": 8,
-            "ch_mult": [1, 2, 4],  # Based on metadata: [1, 2, 4] not [1, 2, 4, 8]
-            "num_res_blocks": 2,
-            "attn_resolutions": [],  # Based on metadata: empty list, no attention
-            "dropout": 0.0,
-            "resamp_with_conv": True,
-            "in_channels": 2,  # stereo
-            "resolution": 256,
-            "z_channels": 8,
-            "double_z": True,
-            "attn_type": "vanilla",
-            "mid_block_add_attention": False,  # Based on metadata: false
-            "norm_type": "pixel",
-            "causality_axis": "height",  # Based on metadata
-            "mel_bins": 64,  # Based on metadata: mel_bins = 64
-        }
-
-        decoder_config = {
-            # Inherits encoder config, can override specific params
-            **encoder_config,
-            "out_ch": 2,  # Stereo audio output (2 channels)
-            "give_pre_end": False,
-            "tanh_out": False,
-        }
-
-        config = {
-            "_class_name": "CausalAudioAutoencoder",
-            "sampling_rate": 16000,
-            "model": {
-                "params": {
-                    "encoder": encoder_config,
-                    "decoder": decoder_config,
-                }
-            },
-        }
-
-        return config
-
-    def get_config(self):
-        return {
-            "sampling_rate": self.sampling_rate,
-            "mel_bins": self.mel_bins,
-            "mel_hop_length": self.mel_hop_length,
-            "n_fft": self.n_fft,
-            "causality_axis": self.causality_axis.value,
-            "is_causal": self.is_causal,
-        }
-
-    def encode(self, x):
-        return self.encoder(x)
-
-    def decode(self, x, target_shape=None):
-        return self.decoder(x, target_shape=target_shape)
--- a/comfy/ldm/lightricks/vocoders/vocoder.py
+++ b/comfy/ldm/lightricks/vocoders/vocoder.py
@ -1,213 +0,0 @@
-import torch
-import torch.nn.functional as F
-import torch.nn as nn
-import comfy.ops
-import numpy as np
-
-ops = comfy.ops.disable_weight_init
-
-LRELU_SLOPE = 0.1
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.convs1 = nn.ModuleList(
-            [
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=dilation[0],
-                    padding=get_padding(kernel_size, dilation[0]),
-                ),
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=dilation[1],
-                    padding=get_padding(kernel_size, dilation[1]),
-                ),
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=dilation[2],
-                    padding=get_padding(kernel_size, dilation[2]),
-                ),
-            ]
-        )
-
-        self.convs2 = nn.ModuleList(
-            [
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=1,
-                    padding=get_padding(kernel_size, 1),
-                ),
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=1,
-                    padding=get_padding(kernel_size, 1),
-                ),
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=1,
-                    padding=get_padding(kernel_size, 1),
-                ),
-            ]
-        )
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList(
-            [
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=dilation[0],
-                    padding=get_padding(kernel_size, dilation[0]),
-                ),
-                ops.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    1,
-                    dilation=dilation[1],
-                    padding=get_padding(kernel_size, dilation[1]),
-                ),
-            ]
-        )
-
-    def forward(self, x):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c(xt)
-            x = xt + x
-        return x
-
-
-class Vocoder(torch.nn.Module):
-    """
-    Vocoder model for synthesizing audio from spectrograms, based on: https://github.com/jik876/hifi-gan.
-
-    """
-
-    def __init__(self, config=None):
-        super(Vocoder, self).__init__()
-
-        if config is None:
-            config = self.get_default_config()
-
-        resblock_kernel_sizes = config.get("resblock_kernel_sizes", [3, 7, 11])
-        upsample_rates = config.get("upsample_rates", [6, 5, 2, 2, 2])
-        upsample_kernel_sizes = config.get("upsample_kernel_sizes", [16, 15, 8, 4, 4])
-        resblock_dilation_sizes = config.get("resblock_dilation_sizes", [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
-        upsample_initial_channel = config.get("upsample_initial_channel", 1024)
-        stereo = config.get("stereo", True)
-        resblock = config.get("resblock", "1")
-
-        self.output_sample_rate = config.get("output_sample_rate")
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        in_channels = 128 if stereo else 64
-        self.conv_pre = ops.Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
-        resblock_class = ResBlock1 if resblock == "1" else ResBlock2
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(
-                ops.ConvTranspose1d(
-                    upsample_initial_channel // (2**i),
-                    upsample_initial_channel // (2 ** (i + 1)),
-                    k,
-                    u,
-                    padding=(k - u) // 2,
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-                self.resblocks.append(resblock_class(ch, k, d))
-
-        out_channels = 2 if stereo else 1
-        self.conv_post = ops.Conv1d(ch, out_channels, 7, 1, padding=3)
-
-        self.upsample_factor = np.prod([self.ups[i].stride[0] for i in range(len(self.ups))])
-
-    def get_default_config(self):
-        """Generate default configuration for the vocoder."""
-
-        config = {
-            "resblock_kernel_sizes": [3, 7, 11],
-            "upsample_rates": [6, 5, 2, 2, 2],
-            "upsample_kernel_sizes": [16, 15, 8, 4, 4],
-            "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-            "upsample_initial_channel": 1024,
-            "stereo": True,
-            "resblock": "1",
-        }
-
-        return config
-
-    def forward(self, x):
-        """
-        Forward pass of the vocoder.
-
-        Args:
-            x: Input spectrogram tensor. Can be:
-               - 3D: (batch_size, channels, time_steps) for mono
-               - 4D: (batch_size, 2, channels, time_steps) for stereo
-
-        Returns:
-            Audio tensor of shape (batch_size, out_channels, audio_length)
-        """
-        if x.dim() == 4:  # stereo
-            assert x.shape[1] == 2, "Input must have 2 channels for stereo"
-            x = torch.cat((x[:, 0, :, :], x[:, 1, :, :]), dim=1)
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
--- a/comfy/ldm/lumina/controlnet.py
+++ b/comfy/ldm/lumina/controlnet.py
@ -1,160 +0,0 @@
-import torch
-from torch import nn
-
-from .model import JointTransformerBlock
-
-class ZImageControlTransformerBlock(JointTransformerBlock):
-    def __init__(
-        self,
-        layer_id: int,
-        dim: int,
-        n_heads: int,
-        n_kv_heads: int,
-        multiple_of: int,
-        ffn_dim_multiplier: float,
-        norm_eps: float,
-        qk_norm: bool,
-        modulation=True,
-        block_id=0,
-        operation_settings=None,
-    ):
-        super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings)
-        self.block_id = block_id
-        if block_id == 0:
-            self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, c, x, **kwargs):
-        if self.block_id == 0:
-            c = self.before_proj(c) + x
-        c = super().forward(c, **kwargs)
-        c_skip = self.after_proj(c)
-        return c_skip, c
-
-class ZImage_Control(torch.nn.Module):
-    def __init__(
-        self,
-        dim: int = 3840,
-        n_heads: int = 30,
-        n_kv_heads: int = 30,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: float = (8.0 / 3.0),
-        norm_eps: float = 1e-5,
-        qk_norm: bool = True,
-        n_control_layers=6,
-        control_in_dim=16,
-        additional_in_dim=0,
-        broken=False,
-        refiner_control=False,
-        dtype=None,
-        device=None,
-        operations=None,
-        **kwargs
-    ):
-        super().__init__()
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-
-        self.broken = broken
-        self.additional_in_dim = additional_in_dim
-        self.control_in_dim = control_in_dim
-        n_refiner_layers = 2
-        self.n_control_layers = n_control_layers
-        self.control_layers = nn.ModuleList(
-            [
-                ZImageControlTransformerBlock(
-                    i,
-                    dim,
-                    n_heads,
-                    n_kv_heads,
-                    multiple_of,
-                    ffn_dim_multiplier,
-                    norm_eps,
-                    qk_norm,
-                    block_id=i,
-                    operation_settings=operation_settings,
-                )
-                for i in range(self.n_control_layers)
-            ]
-        )
-
-        all_x_embedder = {}
-        patch_size = 2
-        f_patch_size = 1
-        x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * (self.control_in_dim + self.additional_in_dim), dim, bias=True, device=device, dtype=dtype)
-        all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
-
-        self.refiner_control = refiner_control
-
-        self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
-        if self.refiner_control:
-            self.control_noise_refiner = nn.ModuleList(
-                [
-                    ZImageControlTransformerBlock(
-                        layer_id,
-                        dim,
-                        n_heads,
-                        n_kv_heads,
-                        multiple_of,
-                        ffn_dim_multiplier,
-                        norm_eps,
-                        qk_norm,
-                        block_id=layer_id,
-                        operation_settings=operation_settings,
-                    )
-                    for layer_id in range(n_refiner_layers)
-                ]
-            )
-        else:
-            self.control_noise_refiner = nn.ModuleList(
-                [
-                    JointTransformerBlock(
-                        layer_id,
-                        dim,
-                        n_heads,
-                        n_kv_heads,
-                        multiple_of,
-                        ffn_dim_multiplier,
-                        norm_eps,
-                        qk_norm,
-                        modulation=True,
-                        z_image_modulation=True,
-                        operation_settings=operation_settings,
-                    )
-                    for layer_id in range(n_refiner_layers)
-                ]
-            )
-
-    def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input):
-        patch_size = 2
-        f_patch_size = 1
-        pH = pW = patch_size
-        B, C, H, W = control_context.shape
-        control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
-
-        x_attn_mask = None
-        if not self.refiner_control:
-            for layer in self.control_noise_refiner:
-                control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input)
-
-        return control_context
-
-    def forward_noise_refiner_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
-        if self.refiner_control:
-            if self.broken:
-                if layer_id == 0:
-                    return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
-                if layer_id > 0:
-                    out = None
-                    for i in range(1, len(self.control_layers)):
-                        o, control_context = self.control_layers[i](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
-                        if out is None:
-                            out = o
-
-                    return (out, control_context)
-            else:
-                return self.control_noise_refiner[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
-        else:
-            return (None, control_context)
-
-    def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
-        return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@ -11,7 +11,6 @@ import comfy.ldm.common_dit
 from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
 from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
-from comfy.ldm.flux.math import apply_rope
 import comfy.patcher_extension


@ -22,10 +21,6 @@ def modulate(x, scale):
 #                               Core NextDiT Model                              #
 #############################################################################

-def clamp_fp16(x):
-    if x.dtype == torch.float16:
-        return torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
-    return x

 class JointAttention(nn.Module):
    """Multi-head attention module."""
@ -36,7 +31,6 @@ class JointAttention(nn.Module):
        n_heads: int,
        n_kv_heads: Optional[int],
        qk_norm: bool,
-        out_bias: bool = False,
        operation_settings={},
    ):
        """
@ -65,7 +59,7 @@ class JointAttention(nn.Module):
        self.out = operation_settings.get("operations").Linear(
            n_heads * self.head_dim,
            dim,
-            bias=out_bias,
+            bias=False,
            device=operation_settings.get("device"),
            dtype=operation_settings.get("dtype"),
        )
@ -76,12 +70,40 @@ class JointAttention(nn.Module):
        else:
            self.q_norm = self.k_norm = nn.Identity()

+    @staticmethod
+    def apply_rotary_emb(
+        x_in: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Apply rotary embeddings to input tensors using the given frequency
+        tensor.
+
+        This function applies rotary embeddings to a single query or key
+        tensor 'x_in' using the provided frequency tensor 'freqs_cis'. The
+        last dimension of the input is regrouped into pairs, and each pair is
+        combined with components 0 and 1 along the last axis of 'freqs_cis'.
+        The result is reshaped back to the shape of 'x_in' and returned.
+
+        Args:
+            x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
+            freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
+                exponentials.
+
+        Returns:
+            torch.Tensor: The input tensor with rotary embeddings applied,
+                with the same shape as 'x_in'.
+        """
+
+        t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2)
+        t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
+        return t_out.reshape(*x_in.shape)
+
    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
-        transformer_options={},
    ) -> torch.Tensor:
        """

@ -111,13 +133,14 @@ class JointAttention(nn.Module):
        xq = self.q_norm(xq)
        xk = self.k_norm(xk)

-        xq, xk = apply_rope(xq, xk, freqs_cis)
+        xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
+        xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis)

        n_rep = self.n_local_heads // self.n_local_kv_heads
        if n_rep >= 1:
            xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
            xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
-        output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True, transformer_options=transformer_options)
+        output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True)

        return self.out(output)

@ -173,7 +196,7 @@ class FeedForward(nn.Module):

    # @torch.compile
    def _forward_silu_gating(self, x1, x3):
-        return clamp_fp16(F.silu(x1) * x3)
+        return F.silu(x1) * x3

    def forward(self, x):
        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
@ -191,8 +214,6 @@ class JointTransformerBlock(nn.Module):
        norm_eps: float,
        qk_norm: bool,
        modulation=True,
-        z_image_modulation=False,
-        attn_out_bias=False,
        operation_settings={},
    ) -> None:
        """
@ -213,10 +234,10 @@ class JointTransformerBlock(nn.Module):
        super().__init__()
        self.dim = dim
        self.head_dim = dim // n_heads
-        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, out_bias=attn_out_bias, operation_settings=operation_settings)
+        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings)
        self.feed_forward = FeedForward(
            dim=dim,
-            hidden_dim=dim,
+            hidden_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
            operation_settings=operation_settings,
@ -230,27 +251,16 @@ class JointTransformerBlock(nn.Module):

        self.modulation = modulation
        if modulation:
-            if z_image_modulation:
-                self.adaLN_modulation = nn.Sequential(
-                    operation_settings.get("operations").Linear(
-                        min(dim, 256),
-                        4 * dim,
-                        bias=True,
-                        device=operation_settings.get("device"),
-                        dtype=operation_settings.get("dtype"),
-                    ),
-                )
-            else:
-                self.adaLN_modulation = nn.Sequential(
-                    nn.SiLU(),
-                    operation_settings.get("operations").Linear(
-                        min(dim, 1024),
-                        4 * dim,
-                        bias=True,
-                        device=operation_settings.get("device"),
-                        dtype=operation_settings.get("dtype"),
-                    ),
-                )
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                operation_settings.get("operations").Linear(
+                    min(dim, 1024),
+                    4 * dim,
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )

    def forward(
        self,
@ -258,7 +268,6 @@ class JointTransformerBlock(nn.Module):
        x_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        adaln_input: Optional[torch.Tensor]=None,
-        transformer_options={},
    ):
        """
        Perform a forward pass through the TransformerBlock.
@ -277,27 +286,25 @@ class JointTransformerBlock(nn.Module):
            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)

            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
-                clamp_fp16(self.attention(
+                self.attention(
                    modulate(self.attention_norm1(x), scale_msa),
                    x_mask,
                    freqs_cis,
-                    transformer_options=transformer_options,
-                ))
+                )
            )
            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
-                clamp_fp16(self.feed_forward(
+                self.feed_forward(
                    modulate(self.ffn_norm1(x), scale_mlp),
-                ))
+                )
            )
        else:
            assert adaln_input is None
            x = x + self.attention_norm2(
-                clamp_fp16(self.attention(
+                self.attention(
                    self.attention_norm1(x),
                    x_mask,
                    freqs_cis,
-                    transformer_options=transformer_options,
-                ))
+                )
            )
            x = x + self.ffn_norm2(
                self.feed_forward(
@ -312,7 +319,7 @@ class FinalLayer(nn.Module):
    The final layer of NextDiT.
    """

-    def __init__(self, hidden_size, patch_size, out_channels, z_image_modulation=False, operation_settings={}):
+    def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}):
        super().__init__()
        self.norm_final = operation_settings.get("operations").LayerNorm(
            hidden_size,
@ -329,15 +336,10 @@ class FinalLayer(nn.Module):
            dtype=operation_settings.get("dtype"),
        )

-        if z_image_modulation:
-            min_mod = 256
-        else:
-            min_mod = 1024
-
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            operation_settings.get("operations").Linear(
-                min(hidden_size, min_mod),
+                min(hidden_size, 1024),
                hidden_size,
                bias=True,
                device=operation_settings.get("device"),
@ -367,17 +369,12 @@ class NextDiT(nn.Module):
        n_heads: int = 32,
        n_kv_heads: Optional[int] = None,
        multiple_of: int = 256,
-        ffn_dim_multiplier: float = 4.0,
+        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: float = 1e-5,
        qk_norm: bool = False,
        cap_feat_dim: int = 5120,
        axes_dims: List[int] = (16, 56, 56),
        axes_lens: List[int] = (1, 512, 512),
-        rope_theta=10000.0,
-        z_image_modulation=False,
-        time_scale=1.0,
-        pad_tokens_multiple=None,
-        clip_text_dim=None,
        image_model=None,
        device=None,
        dtype=None,
@ -389,8 +386,6 @@ class NextDiT(nn.Module):
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.patch_size = patch_size
-        self.time_scale = time_scale
-        self.pad_tokens_multiple = pad_tokens_multiple

        self.x_embedder = operation_settings.get("operations").Linear(
            in_features=patch_size * patch_size * in_channels,
@ -412,7 +407,6 @@ class NextDiT(nn.Module):
                    norm_eps,
                    qk_norm,
                    modulation=True,
-                    z_image_modulation=z_image_modulation,
                    operation_settings=operation_settings,
                )
                for layer_id in range(n_refiner_layers)
@ -436,7 +430,7 @@ class NextDiT(nn.Module):
            ]
        )

-        self.t_embedder = TimestepEmbedder(min(dim, 1024), output_size=256 if z_image_modulation else None, **operation_settings)
+        self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
        self.cap_embedder = nn.Sequential(
            operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
            operation_settings.get("operations").Linear(
@ -448,31 +442,6 @@ class NextDiT(nn.Module):
            ),
        )

-        self.clip_text_pooled_proj = None
-
-        if clip_text_dim is not None:
-            self.clip_text_dim = clip_text_dim
-            self.clip_text_pooled_proj = nn.Sequential(
-                operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-                operation_settings.get("operations").Linear(
-                    clip_text_dim,
-                    clip_text_dim,
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-            self.time_text_embed = nn.Sequential(
-                nn.SiLU(),
-                operation_settings.get("operations").Linear(
-                    min(dim, 1024) + clip_text_dim,
-                    min(dim, 1024),
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-
        self.layers = nn.ModuleList(
            [
                JointTransformerBlock(
@ -484,25 +453,18 @@ class NextDiT(nn.Module):
                    ffn_dim_multiplier,
                    norm_eps,
                    qk_norm,
-                    z_image_modulation=z_image_modulation,
-                    attn_out_bias=False,
                    operation_settings=operation_settings,
                )
                for layer_id in range(n_layers)
            ]
        )
-        # This norm final is in the lumina 2.0 code but isn't actually used for anything.
-        # self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings)
-
-        if self.pad_tokens_multiple is not None:
-            self.x_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
-            self.cap_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
+        self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)

        assert (dim // n_heads) == sum(axes_dims)
        self.axes_dims = axes_dims
        self.axes_lens = axes_lens
-        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=rope_theta, axes_dim=axes_dims)
+        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims)
        self.dim = dim
        self.n_heads = n_heads

@ -532,68 +494,101 @@ class NextDiT(nn.Module):
        return imgs

    def patchify_and_embed(
-        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={}
+        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens
    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
        bsz = len(x)
        pH = pW = self.patch_size
        device = x[0].device
-        orig_x = x
+        dtype = x[0].dtype

-        if self.pad_tokens_multiple is not None:
-            pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
-            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)
+        if cap_mask is not None:
+            l_effective_cap_len = cap_mask.sum(dim=1).tolist()
+        else:
+            l_effective_cap_len = [num_tokens] * bsz

-        cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
-        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0
+        if cap_mask is not None and not torch.is_floating_point(cap_mask):
+            cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max

-        B, C, H, W = x.shape
-        x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
+        img_sizes = [(img.size(1), img.size(2)) for img in x]
+        l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes]

-        rope_options = transformer_options.get("rope_options", None)
-        h_scale = 1.0
-        w_scale = 1.0
-        h_start = 0
-        w_start = 0
-        if rope_options is not None:
-            h_scale = rope_options.get("scale_y", 1.0)
-            w_scale = rope_options.get("scale_x", 1.0)
+        max_seq_len = max(
+            (cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len))
+        )
+        max_cap_len = max(l_effective_cap_len)
+        max_img_len = max(l_effective_img_len)

-            h_start = rope_options.get("shift_y", 0.0)
-            w_start = rope_options.get("shift_x", 0.0)
+        position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device)

-        H_tokens, W_tokens = H // pH, W // pW
-        x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
-        x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
-        x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
-        x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+            H, W = img_sizes[i]
+            H_tokens, W_tokens = H // pH, W // pW
+            assert H_tokens * W_tokens == img_len

-        if self.pad_tokens_multiple is not None:
-            pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
-            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
-            x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))
+            position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device)
+            position_ids[i, cap_len:cap_len+img_len, 0] = cap_len
+            row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten()
+            col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten()
+            position_ids[i, cap_len:cap_len+img_len, 1] = row_ids
+            position_ids[i, cap_len:cap_len+img_len, 2] = col_ids

-        freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)
+        freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype)

-        patches = transformer_options.get("patches", {})
+        # build freqs_cis for cap and image individually
+        cap_freqs_cis_shape = list(freqs_cis.shape)
+        # cap_freqs_cis_shape[1] = max_cap_len
+        cap_freqs_cis_shape[1] = cap_feats.shape[1]
+        cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
+
+        img_freqs_cis_shape = list(freqs_cis.shape)
+        img_freqs_cis_shape[1] = max_img_len
+        img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
+
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+            cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len]
+            img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len]

        # refine context
        for layer in self.context_refiner:
-            cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)
+            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis)

-        padded_img_mask = None
-        x_input = x
-        for i, layer in enumerate(self.noise_refiner):
-            x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
-            if "noise_refiner" in patches:
-                for p in patches["noise_refiner"]:
-                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
-                    if "img" in out:
-                        x = out["img"]
+        # refine image
+        flat_x = []
+        for i in range(bsz):
+            img = x[i]
+            C, H, W = img.size()
+            img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1)
+            flat_x.append(img)
+        x = flat_x
+        padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype)
+        padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device)
+        for i in range(bsz):
+            padded_img_embed[i, :l_effective_img_len[i]] = x[i]
+            padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max
+
+        padded_img_embed = self.x_embedder(padded_img_embed)
+        padded_img_mask = padded_img_mask.unsqueeze(1)
+        for layer in self.noise_refiner:
+            padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t)
+
+        if cap_mask is not None:
+            mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
+            mask[:, :max_cap_len] = cap_mask[:, :max_cap_len]
+        else:
+            mask = None
+
+        padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype)
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+
+            padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len]
+            padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len]

-        padded_full_embed = torch.cat((cap_feats, x), dim=1)
-        mask = None
-        img_sizes = [(H, W)] * bsz
-        l_effective_cap_len = [cap_feats.shape[1]] * bsz
        return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis

    def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
@ -604,7 +599,7 @@ class NextDiT(nn.Module):
        ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)

    # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
        t = 1.0 - timesteps
        cap_feats = context
        cap_mask = attention_mask
@ -616,41 +611,20 @@ class NextDiT(nn.Module):
        y: (N,) tensor of text tokens/features
        """

-        t = self.t_embedder(t * self.time_scale, dtype=x.dtype)  # (N, D)
+        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        adaln_input = t

        cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute

-        if self.clip_text_pooled_proj is not None:
-            pooled = kwargs.get("clip_text_pooled", None)
-            if pooled is not None:
-                pooled = self.clip_text_pooled_proj(pooled)
-            else:
-                pooled = torch.zeros((x.shape[0], self.clip_text_dim), device=x.device, dtype=x.dtype)
-
-            adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
-
-        patches = transformer_options.get("patches", {})
        x_is_tensor = isinstance(x, torch.Tensor)
-        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
-        freqs_cis = freqs_cis.to(img.device)
+        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens)
+        freqs_cis = freqs_cis.to(x.device)

-        transformer_options["total_blocks"] = len(self.layers)
-        transformer_options["block_type"] = "double"
-        img_input = img
-        for i, layer in enumerate(self.layers):
-            transformer_options["block_index"] = i
-            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
-            if "double_block" in patches:
-                for p in patches["double_block"]:
-                    out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
-                    if "img" in out:
-                        img[:, cap_size[0]:] = out["img"]
-                    if "txt" in out:
-                        img[:, :cap_size[0]] = out["txt"]
+        for layer in self.layers:
+            x = layer(x, mask, freqs_cis, adaln_input)

-        img = self.final_layer(img, adaln_input)
-        img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
+        x = self.final_layer(x, adaln_input)
+        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]

-        return -img
+        return -x

--- a/comfy/ldm/mmaudio/vae/init.py
+++ b/comfy/ldm/mmaudio/vae/init.py
--- a/comfy/ldm/mmaudio/vae/activations.py
+++ b/comfy/ldm/mmaudio/vae/activations.py
@ -1,120 +0,0 @@
-# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
-#   LICENSE is in incl_licenses directory.
-
-import torch
-from torch import nn, sin, pow
-from torch.nn import Parameter
-import comfy.model_management
-
-class Snake(nn.Module):
-    '''
-    Implementation of a sine-based periodic activation function
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter
-    References:
-        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snake(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    '''
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha: trainable parameter
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            alpha will be trained along with the rest of your model.
-        '''
-        super(Snake, self).__init__()
-        self.in_features = in_features
-
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:
-            self.alpha = Parameter(torch.empty(in_features))
-        else:
-            self.alpha = Parameter(torch.empty(in_features))
-
-        self.alpha.requires_grad = alpha_trainable
-
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self, x):
-        '''
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        Snake ∶= x + 1/a * sin^2 (xa)
-        '''
-        alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-
-        return x
-
-
-class SnakeBeta(nn.Module):
-    '''
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snakebeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    '''
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        '''
-        super(SnakeBeta, self).__init__()
-        self.in_features = in_features
-
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:
-            self.alpha = Parameter(torch.empty(in_features))
-            self.beta = Parameter(torch.empty(in_features))
-        else:
-            self.alpha = Parameter(torch.empty(in_features))
-            self.beta = Parameter(torch.empty(in_features))
-
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self, x):
-        '''
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta ∶= x + 1/b * sin^2 (xa)
-        '''
-        alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
-        beta = comfy.model_management.cast_to(self.beta, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1)
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-            beta = torch.exp(beta)
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-
-        return x
--- a/comfy/ldm/mmaudio/vae/alias_free_torch.py
+++ b/comfy/ldm/mmaudio/vae/alias_free_torch.py
@ -1,157 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import math
-import comfy.model_management
-
-if 'sinc' in dir(torch):
-    sinc = torch.sinc
-else:
-    # This code is adopted from adefossez's julius.core.sinc under the MIT License
-    # https://adefossez.github.io/julius/julius/core.html
-    #   LICENSE is in incl_licenses directory.
-    def sinc(x: torch.Tensor):
-        """
-        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
-        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
-        """
-        return torch.where(x == 0,
-                           torch.tensor(1., device=x.device, dtype=x.dtype),
-                           torch.sin(math.pi * x) / math.pi / x)
-
-
-# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
-# https://adefossez.github.io/julius/julius/lowpass.html
-#   LICENSE is in incl_licenses directory.
-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
-    even = (kernel_size % 2 == 0)
-    half_size = kernel_size // 2
-
-    #For kaiser window
-    delta_f = 4 * half_width
-    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-    if A > 50.:
-        beta = 0.1102 * (A - 8.7)
-    elif A >= 21.:
-        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
-    else:
-        beta = 0.
-    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
-
-    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
-    if even:
-        time = (torch.arange(-half_size, half_size) + 0.5)
-    else:
-        time = torch.arange(kernel_size) - half_size
-    if cutoff == 0:
-        filter_ = torch.zeros_like(time)
-    else:
-        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
-        # Normalize filter to have sum = 1, otherwise we will have a small leakage
-        # of the constant component in the input signal.
-        filter_ /= filter_.sum()
-        filter = filter_.view(1, 1, kernel_size)
-
-    return filter
-
-
-class LowPassFilter1d(nn.Module):
-    def __init__(self,
-                 cutoff=0.5,
-                 half_width=0.6,
-                 stride: int = 1,
-                 padding: bool = True,
-                 padding_mode: str = 'replicate',
-                 kernel_size: int = 12):
-        # kernel_size should be even number for stylegan3 setup,
-        # in this implementation, odd number is also possible.
-        super().__init__()
-        if cutoff < -0.:
-            raise ValueError("Minimum cutoff must be larger than zero.")
-        if cutoff > 0.5:
-            raise ValueError("A cutoff above 0.5 does not make sense.")
-        self.kernel_size = kernel_size
-        self.even = (kernel_size % 2 == 0)
-        self.pad_left = kernel_size // 2 - int(self.even)
-        self.pad_right = kernel_size // 2
-        self.stride = stride
-        self.padding = padding
-        self.padding_mode = padding_mode
-        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
-        self.register_buffer("filter", filter)
-
-    #input [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        if self.padding:
-            x = F.pad(x, (self.pad_left, self.pad_right),
-                      mode=self.padding_mode)
-        out = F.conv1d(x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device),
-                       stride=self.stride, groups=C)
-
-        return out
-
-
-class UpSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.stride = ratio
-        self.pad = self.kernel_size // ratio - 1
-        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
-        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
-                                      half_width=0.6 / ratio,
-                                      kernel_size=self.kernel_size)
-        self.register_buffer("filter", filter)
-
-    # x: [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        x = F.pad(x, (self.pad, self.pad), mode='replicate')
-        x = self.ratio * F.conv_transpose1d(
-            x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), stride=self.stride, groups=C)
-        x = x[..., self.pad_left:-self.pad_right]
-
-        return x
-
-
-class DownSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
-                                       half_width=0.6 / ratio,
-                                       stride=ratio,
-                                       kernel_size=self.kernel_size)
-
-    def forward(self, x):
-        xx = self.lowpass(x)
-
-        return xx
-
-class Activation1d(nn.Module):
-    def __init__(self,
-                 activation,
-                 up_ratio: int = 2,
-                 down_ratio: int = 2,
-                 up_kernel_size: int = 12,
-                 down_kernel_size: int = 12):
-        super().__init__()
-        self.up_ratio = up_ratio
-        self.down_ratio = down_ratio
-        self.act = activation
-        self.upsample = UpSample1d(up_ratio, up_kernel_size)
-        self.downsample = DownSample1d(down_ratio, down_kernel_size)
-
-    # x: [B,C,T]
-    def forward(self, x):
-        x = self.upsample(x)
-        x = self.act(x)
-        x = self.downsample(x)
-
-        return x
--- a/comfy/ldm/mmaudio/vae/autoencoder.py
+++ b/comfy/ldm/mmaudio/vae/autoencoder.py
@ -1,156 +0,0 @@
-from typing import Literal
-
-import torch
-import torch.nn as nn
-
-from .distributions import DiagonalGaussianDistribution
-from .vae import VAE_16k
-from .bigvgan import BigVGANVocoder
-import logging
-
-try:
-    import torchaudio
-except:
-    logging.warning("torchaudio missing, MMAudio VAE model will be broken")
-
-def dynamic_range_compression_torch(x, C=1, clip_val=1e-5, *, norm_fn):
-    return norm_fn(torch.clamp(x, min=clip_val) * C)
-
-
-def spectral_normalize_torch(magnitudes, norm_fn):
-    output = dynamic_range_compression_torch(magnitudes, norm_fn=norm_fn)
-    return output
-
-class MelConverter(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        sampling_rate: float,
-        n_fft: int,
-        num_mels: int,
-        hop_size: int,
-        win_size: int,
-        fmin: float,
-        fmax: float,
-        norm_fn,
-    ):
-        super().__init__()
-        self.sampling_rate = sampling_rate
-        self.n_fft = n_fft
-        self.num_mels = num_mels
-        self.hop_size = hop_size
-        self.win_size = win_size
-        self.fmin = fmin
-        self.fmax = fmax
-        self.norm_fn = norm_fn
-
-        # mel = librosa_mel_fn(sr=self.sampling_rate,
-        #                      n_fft=self.n_fft,
-        #                      n_mels=self.num_mels,
-        #                      fmin=self.fmin,
-        #                      fmax=self.fmax)
-        # mel_basis = torch.from_numpy(mel).float()
-        mel_basis = torch.empty((num_mels, 1 + n_fft // 2))
-        hann_window = torch.hann_window(self.win_size)
-
-        self.register_buffer('mel_basis', mel_basis)
-        self.register_buffer('hann_window', hann_window)
-
-    @property
-    def device(self):
-        return self.mel_basis.device
-
-    def forward(self, waveform: torch.Tensor, center: bool = False) -> torch.Tensor:
-        waveform = waveform.clamp(min=-1., max=1.).to(self.device)
-
-        waveform = torch.nn.functional.pad(
-            waveform.unsqueeze(1),
-            [int((self.n_fft - self.hop_size) / 2),
-             int((self.n_fft - self.hop_size) / 2)],
-            mode='reflect')
-        waveform = waveform.squeeze(1)
-
-        spec = torch.stft(waveform,
-                          self.n_fft,
-                          hop_length=self.hop_size,
-                          win_length=self.win_size,
-                          window=self.hann_window,
-                          center=center,
-                          pad_mode='reflect',
-                          normalized=False,
-                          onesided=True,
-                          return_complex=True)
-
-        spec = torch.view_as_real(spec)
-        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
-        spec = torch.matmul(self.mel_basis, spec)
-        spec = spectral_normalize_torch(spec, self.norm_fn)
-
-        return spec
-
-class AudioAutoencoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        # ckpt_path: str,
-        mode=Literal['16k', '44k'],
-        need_vae_encoder: bool = True,
-    ):
-        super().__init__()
-
-        assert mode == "16k", "Only 16k mode is supported currently."
-        self.mel_converter = MelConverter(sampling_rate=16_000,
-                            n_fft=1024,
-                            num_mels=80,
-                            hop_size=256,
-                            win_size=1024,
-                            fmin=0,
-                            fmax=8_000,
-                            norm_fn=torch.log10)
-
-        self.vae = VAE_16k().eval()
-
-        bigvgan_config = {
-            "resblock": "1",
-            "num_mels": 80,
-            "upsample_rates": [4, 4, 2, 2, 2, 2],
-            "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
-            "upsample_initial_channel": 1536,
-            "resblock_kernel_sizes": [3, 7, 11],
-            "resblock_dilation_sizes": [
-                [1, 3, 5],
-                [1, 3, 5],
-                [1, 3, 5],
-            ],
-            "activation": "snakebeta",
-            "snake_logscale": True,
-        }
-
-        self.vocoder = BigVGANVocoder(
-            bigvgan_config
-        ).eval()
-
-    @torch.inference_mode()
-    def encode_audio(self, x) -> DiagonalGaussianDistribution:
-        # x: (B * L)
-        mel = self.mel_converter(x)
-        dist = self.vae.encode(mel)
-
-        return dist
-
-    @torch.no_grad()
-    def decode(self, z):
-        mel_decoded = self.vae.decode(z)
-        audio = self.vocoder(mel_decoded)
-
-        audio = torchaudio.functional.resample(audio, 16000, 44100)
-        return audio
-
-    @torch.no_grad()
-    def encode(self, audio):
-        audio = audio.mean(dim=1)
-        audio = torchaudio.functional.resample(audio, 44100, 16000)
-        dist = self.encode_audio(audio)
-        return dist.mean
--- a/comfy/ldm/mmaudio/vae/bigvgan.py
+++ b/comfy/ldm/mmaudio/vae/bigvgan.py
@ -1,219 +0,0 @@
-# Copyright (c) 2022 NVIDIA CORPORATION.
-#   Licensed under the MIT license.
-
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-#   LICENSE is in incl_licenses directory.
-
-import torch
-import torch.nn as nn
-from types import SimpleNamespace
-from . import activations
-from .alias_free_torch import Activation1d
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-class AMPBlock1(torch.nn.Module):
-
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
-        super(AMPBlock1, self).__init__()
-        self.h = h
-
-        self.convs1 = nn.ModuleList([
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=dilation[0],
-                       padding=get_padding(kernel_size, dilation[0])),
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=dilation[1],
-                       padding=get_padding(kernel_size, dilation[1])),
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=dilation[2],
-                       padding=get_padding(kernel_size, dilation[2]))
-        ])
-
-        self.convs2 = nn.ModuleList([
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=1,
-                       padding=get_padding(kernel_size, 1)),
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=1,
-                       padding=get_padding(kernel_size, 1)),
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=1,
-                       padding=get_padding(kernel_size, 1))
-        ])
-
-        self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
-
-        if activation == 'snake':  # periodic nonlinearity with snake function and anti-aliasing
-            self.activations = nn.ModuleList([
-                Activation1d(
-                    activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
-                for _ in range(self.num_layers)
-            ])
-        elif activation == 'snakebeta':  # periodic nonlinearity with snakebeta function and anti-aliasing
-            self.activations = nn.ModuleList([
-                Activation1d(
-                    activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
-                for _ in range(self.num_layers)
-            ])
-        else:
-            raise NotImplementedError(
-                "activation incorrectly specified. check the config file and look for 'activation'."
-            )
-
-    def forward(self, x):
-        acts1, acts2 = self.activations[::2], self.activations[1::2]
-        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
-            xt = a1(x)
-            xt = c1(xt)
-            xt = a2(xt)
-            xt = c2(xt)
-            x = xt + x
-
-        return x
-
-
-class AMPBlock2(torch.nn.Module):
-
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None):
-        super(AMPBlock2, self).__init__()
-        self.h = h
-
-        self.convs = nn.ModuleList([
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=dilation[0],
-                       padding=get_padding(kernel_size, dilation[0])),
-                ops.Conv1d(channels,
-                       channels,
-                       kernel_size,
-                       1,
-                       dilation=dilation[1],
-                       padding=get_padding(kernel_size, dilation[1]))
-        ])
-
-        self.num_layers = len(self.convs)  # total number of conv layers
-
-        if activation == 'snake':  # periodic nonlinearity with snake function and anti-aliasing
-            self.activations = nn.ModuleList([
-                Activation1d(
-                    activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
-                for _ in range(self.num_layers)
-            ])
-        elif activation == 'snakebeta':  # periodic nonlinearity with snakebeta function and anti-aliasing
-            self.activations = nn.ModuleList([
-                Activation1d(
-                    activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
-                for _ in range(self.num_layers)
-            ])
-        else:
-            raise NotImplementedError(
-                "activation incorrectly specified. check the config file and look for 'activation'."
-            )
-
-    def forward(self, x):
-        for c, a in zip(self.convs, self.activations):
-            xt = a(x)
-            xt = c(xt)
-            x = xt + x
-
-        return x
-
-
-class BigVGANVocoder(torch.nn.Module):
-    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
-    def __init__(self, h):
-        super().__init__()
-        if isinstance(h, dict):
-            h = SimpleNamespace(**h)
-        self.h = h
-
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-
-        # pre conv
-        self.conv_pre = ops.Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
-
-        # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
-        resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2
-
-        # transposed conv-based upsamplers. does not apply anti-aliasing
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            self.ups.append(
-                nn.ModuleList([
-                        ops.ConvTranspose1d(h.upsample_initial_channel // (2**i),
-                                        h.upsample_initial_channel // (2**(i + 1)),
-                                        k,
-                                        u,
-                                        padding=(k - u) // 2)
-                ]))
-
-        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2**(i + 1))
-            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
-                self.resblocks.append(resblock(h, ch, k, d, activation=h.activation))
-
-        # post conv
-        if h.activation == "snake":  # periodic nonlinearity with snake function and anti-aliasing
-            activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale)
-            self.activation_post = Activation1d(activation=activation_post)
-        elif h.activation == "snakebeta":  # periodic nonlinearity with snakebeta function and anti-aliasing
-            activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
-            self.activation_post = Activation1d(activation=activation_post)
-        else:
-            raise NotImplementedError(
-                "activation incorrectly specified. check the config file and look for 'activation'."
-            )
-
-        self.conv_post = ops.Conv1d(ch, 1, 7, 1, padding=3)
-
-
-    def forward(self, x):
-        # pre conv
-        x = self.conv_pre(x)
-
-        for i in range(self.num_upsamples):
-            # upsampling
-            for i_up in range(len(self.ups[i])):
-                x = self.ups[i][i_up](x)
-            # AMP blocks
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-
-        # post conv
-        x = self.activation_post(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
--- a/comfy/ldm/mmaudio/vae/distributions.py
+++ b/comfy/ldm/mmaudio/vae/distributions.py
@ -1,92 +0,0 @@
-import torch
-import numpy as np
-
-
-class AbstractDistribution:
-    def sample(self):
-        raise NotImplementedError()
-
-    def mode(self):
-        raise NotImplementedError()
-
-
-class DiracDistribution(AbstractDistribution):
-    def __init__(self, value):
-        self.value = value
-
-    def sample(self):
-        return self.value
-
-    def mode(self):
-        return self.value
-
-
-class DiagonalGaussianDistribution(object):
-    def __init__(self, parameters, deterministic=False):
-        self.parameters = parameters
-        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
-        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
-        self.deterministic = deterministic
-        self.std = torch.exp(0.5 * self.logvar)
-        self.var = torch.exp(self.logvar)
-        if self.deterministic:
-            self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device)
-
-    def sample(self):
-        x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device)
-        return x
-
-    def kl(self, other=None):
-        if self.deterministic:
-            return torch.Tensor([0.])
-        else:
-            if other is None:
-                return 0.5 * torch.sum(torch.pow(self.mean, 2)
-                                       + self.var - 1.0 - self.logvar,
-                                       dim=[1, 2, 3])
-            else:
-                return 0.5 * torch.sum(
-                    torch.pow(self.mean - other.mean, 2) / other.var
-                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
-                    dim=[1, 2, 3])
-
-    def nll(self, sample, dims=[1,2,3]):
-        if self.deterministic:
-            return torch.Tensor([0.])
-        logtwopi = np.log(2.0 * np.pi)
-        return 0.5 * torch.sum(
-            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
-            dim=dims)
-
-    def mode(self):
-        return self.mean
-
-
-def normal_kl(mean1, logvar1, mean2, logvar2):
-    """
-    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
-    Compute the KL divergence between two gaussians.
-    Shapes are automatically broadcasted, so batches can be compared to
-    scalars, among other use cases.
-    """
-    tensor = None
-    for obj in (mean1, logvar1, mean2, logvar2):
-        if isinstance(obj, torch.Tensor):
-            tensor = obj
-            break
-    assert tensor is not None, "at least one argument must be a Tensor"
-
-    # Force variances to be Tensors. Broadcasting helps convert scalars to
-    # Tensors, but it does not work for torch.exp().
-    logvar1, logvar2 = [
-        x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
-        for x in (logvar1, logvar2)
-    ]
-
-    return 0.5 * (
-        -1.0
-        + logvar2
-        - logvar1
-        + torch.exp(logvar1 - logvar2)
-        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
-    )
--- a/comfy/ldm/mmaudio/vae/vae.py
+++ b/comfy/ldm/mmaudio/vae/vae.py
@ -1,358 +0,0 @@
-import logging
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from .vae_modules import (AttnBlock1D, Downsample1D, ResnetBlock1D,
-                                                 Upsample1D, nonlinearity)
-from .distributions import DiagonalGaussianDistribution
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-log = logging.getLogger()
-
-DATA_MEAN_80D = [
-    -1.6058, -1.3676, -1.2520, -1.2453, -1.2078, -1.2224, -1.2419, -1.2439, -1.2922, -1.2927,
-    -1.3170, -1.3543, -1.3401, -1.3836, -1.3907, -1.3912, -1.4313, -1.4152, -1.4527, -1.4728,
-    -1.4568, -1.5101, -1.5051, -1.5172, -1.5623, -1.5373, -1.5746, -1.5687, -1.6032, -1.6131,
-    -1.6081, -1.6331, -1.6489, -1.6489, -1.6700, -1.6738, -1.6953, -1.6969, -1.7048, -1.7280,
-    -1.7361, -1.7495, -1.7658, -1.7814, -1.7889, -1.8064, -1.8221, -1.8377, -1.8417, -1.8643,
-    -1.8857, -1.8929, -1.9173, -1.9379, -1.9531, -1.9673, -1.9824, -2.0042, -2.0215, -2.0436,
-    -2.0766, -2.1064, -2.1418, -2.1855, -2.2319, -2.2767, -2.3161, -2.3572, -2.3954, -2.4282,
-    -2.4659, -2.5072, -2.5552, -2.6074, -2.6584, -2.7107, -2.7634, -2.8266, -2.8981, -2.9673
-]
-
-DATA_STD_80D = [
-    1.0291, 1.0411, 1.0043, 0.9820, 0.9677, 0.9543, 0.9450, 0.9392, 0.9343, 0.9297, 0.9276, 0.9263,
-    0.9242, 0.9254, 0.9232, 0.9281, 0.9263, 0.9315, 0.9274, 0.9247, 0.9277, 0.9199, 0.9188, 0.9194,
-    0.9160, 0.9161, 0.9146, 0.9161, 0.9100, 0.9095, 0.9145, 0.9076, 0.9066, 0.9095, 0.9032, 0.9043,
-    0.9038, 0.9011, 0.9019, 0.9010, 0.8984, 0.8983, 0.8986, 0.8961, 0.8962, 0.8978, 0.8962, 0.8973,
-    0.8993, 0.8976, 0.8995, 0.9016, 0.8982, 0.8972, 0.8974, 0.8949, 0.8940, 0.8947, 0.8936, 0.8939,
-    0.8951, 0.8956, 0.9017, 0.9167, 0.9436, 0.9690, 1.0003, 1.0225, 1.0381, 1.0491, 1.0545, 1.0604,
-    1.0761, 1.0929, 1.1089, 1.1196, 1.1176, 1.1156, 1.1117, 1.1070
-]
-
-DATA_MEAN_128D = [
-    -3.3462, -2.6723, -2.4893, -2.3143, -2.2664, -2.3317, -2.1802, -2.4006, -2.2357, -2.4597,
-    -2.3717, -2.4690, -2.5142, -2.4919, -2.6610, -2.5047, -2.7483, -2.5926, -2.7462, -2.7033,
-    -2.7386, -2.8112, -2.7502, -2.9594, -2.7473, -3.0035, -2.8891, -2.9922, -2.9856, -3.0157,
-    -3.1191, -2.9893, -3.1718, -3.0745, -3.1879, -3.2310, -3.1424, -3.2296, -3.2791, -3.2782,
-    -3.2756, -3.3134, -3.3509, -3.3750, -3.3951, -3.3698, -3.4505, -3.4509, -3.5089, -3.4647,
-    -3.5536, -3.5788, -3.5867, -3.6036, -3.6400, -3.6747, -3.7072, -3.7279, -3.7283, -3.7795,
-    -3.8259, -3.8447, -3.8663, -3.9182, -3.9605, -3.9861, -4.0105, -4.0373, -4.0762, -4.1121,
-    -4.1488, -4.1874, -4.2461, -4.3170, -4.3639, -4.4452, -4.5282, -4.6297, -4.7019, -4.7960,
-    -4.8700, -4.9507, -5.0303, -5.0866, -5.1634, -5.2342, -5.3242, -5.4053, -5.4927, -5.5712,
-    -5.6464, -5.7052, -5.7619, -5.8410, -5.9188, -6.0103, -6.0955, -6.1673, -6.2362, -6.3120,
-    -6.3926, -6.4797, -6.5565, -6.6511, -6.8130, -6.9961, -7.1275, -7.2457, -7.3576, -7.4663,
-    -7.6136, -7.7469, -7.8815, -8.0132, -8.1515, -8.3071, -8.4722, -8.7418, -9.3975, -9.6628,
-    -9.7671, -9.8863, -9.9992, -10.0860, -10.1709, -10.5418, -11.2795, -11.3861
-]
-
-DATA_STD_128D = [
-    2.3804, 2.4368, 2.3772, 2.3145, 2.2803, 2.2510, 2.2316, 2.2083, 2.1996, 2.1835, 2.1769, 2.1659,
-    2.1631, 2.1618, 2.1540, 2.1606, 2.1571, 2.1567, 2.1612, 2.1579, 2.1679, 2.1683, 2.1634, 2.1557,
-    2.1668, 2.1518, 2.1415, 2.1449, 2.1406, 2.1350, 2.1313, 2.1415, 2.1281, 2.1352, 2.1219, 2.1182,
-    2.1327, 2.1195, 2.1137, 2.1080, 2.1179, 2.1036, 2.1087, 2.1036, 2.1015, 2.1068, 2.0975, 2.0991,
-    2.0902, 2.1015, 2.0857, 2.0920, 2.0893, 2.0897, 2.0910, 2.0881, 2.0925, 2.0873, 2.0960, 2.0900,
-    2.0957, 2.0958, 2.0978, 2.0936, 2.0886, 2.0905, 2.0845, 2.0855, 2.0796, 2.0840, 2.0813, 2.0817,
-    2.0838, 2.0840, 2.0917, 2.1061, 2.1431, 2.1976, 2.2482, 2.3055, 2.3700, 2.4088, 2.4372, 2.4609,
-    2.4731, 2.4847, 2.5072, 2.5451, 2.5772, 2.6147, 2.6529, 2.6596, 2.6645, 2.6726, 2.6803, 2.6812,
-    2.6899, 2.6916, 2.6931, 2.6998, 2.7062, 2.7262, 2.7222, 2.7158, 2.7041, 2.7485, 2.7491, 2.7451,
-    2.7485, 2.7233, 2.7297, 2.7233, 2.7145, 2.6958, 2.6788, 2.6439, 2.6007, 2.4786, 2.2469, 2.1877,
-    2.1392, 2.0717, 2.0107, 1.9676, 1.9140, 1.7102, 0.9101, 0.7164
-]
-
-
-class VAE(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        data_dim: int,
-        embed_dim: int,
-        hidden_dim: int,
-    ):
-        super().__init__()
-
-        if data_dim == 80:
-            self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_80D, dtype=torch.float32))
-            self.data_std = nn.Buffer(torch.tensor(DATA_STD_80D, dtype=torch.float32))
-        elif data_dim == 128:
-            self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_128D, dtype=torch.float32))
-            self.data_std = nn.Buffer(torch.tensor(DATA_STD_128D, dtype=torch.float32))
-
-        self.data_mean = self.data_mean.view(1, -1, 1)
-        self.data_std = self.data_std.view(1, -1, 1)
-
-        self.encoder = Encoder1D(
-            dim=hidden_dim,
-            ch_mult=(1, 2, 4),
-            num_res_blocks=2,
-            attn_layers=[3],
-            down_layers=[0],
-            in_dim=data_dim,
-            embed_dim=embed_dim,
-        )
-        self.decoder = Decoder1D(
-            dim=hidden_dim,
-            ch_mult=(1, 2, 4),
-            num_res_blocks=2,
-            attn_layers=[3],
-            down_layers=[0],
-            in_dim=data_dim,
-            out_dim=data_dim,
-            embed_dim=embed_dim,
-        )
-
-        self.embed_dim = embed_dim
-        # self.quant_conv = nn.Conv1d(2 * embed_dim, 2 * embed_dim, 1)
-        # self.post_quant_conv = nn.Conv1d(embed_dim, embed_dim, 1)
-
-        self.initialize_weights()
-
-    def initialize_weights(self):
-        pass
-
-    def encode(self, x: torch.Tensor, normalize: bool = True) -> DiagonalGaussianDistribution:
-        if normalize:
-            x = self.normalize(x)
-        moments = self.encoder(x)
-        posterior = DiagonalGaussianDistribution(moments)
-        return posterior
-
-    def decode(self, z: torch.Tensor, unnormalize: bool = True) -> torch.Tensor:
-        dec = self.decoder(z)
-        if unnormalize:
-            dec = self.unnormalize(dec)
-        return dec
-
-    def normalize(self, x: torch.Tensor) -> torch.Tensor:
-        return (x - comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device)) / comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device)
-
-    def unnormalize(self, x: torch.Tensor) -> torch.Tensor:
-        return x * comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device) + comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        sample_posterior: bool = True,
-        rng: Optional[torch.Generator] = None,
-        normalize: bool = True,
-        unnormalize: bool = True,
-    ) -> tuple[torch.Tensor, DiagonalGaussianDistribution]:
-
-        posterior = self.encode(x, normalize=normalize)
-        if sample_posterior:
-            z = posterior.sample(rng)
-        else:
-            z = posterior.mode()
-        dec = self.decode(z, unnormalize=unnormalize)
-        return dec, posterior
-
-    def load_weights(self, src_dict) -> None:
-        self.load_state_dict(src_dict, strict=True)
-
-    @property
-    def device(self) -> torch.device:
-        return next(self.parameters()).device
-
-    def get_last_layer(self):
-        return self.decoder.conv_out.weight
-
-    def remove_weight_norm(self):
-        return self
-
-
-class Encoder1D(nn.Module):
-
-    def __init__(self,
-                 *,
-                 dim: int,
-                 ch_mult: tuple[int] = (1, 2, 4, 8),
-                 num_res_blocks: int,
-                 attn_layers: list[int] = [],
-                 down_layers: list[int] = [],
-                 resamp_with_conv: bool = True,
-                 in_dim: int,
-                 embed_dim: int,
-                 double_z: bool = True,
-                 kernel_size: int = 3,
-                 clip_act: float = 256.0):
-        super().__init__()
-        self.dim = dim
-        self.num_layers = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.in_channels = in_dim
-        self.clip_act = clip_act
-        self.down_layers = down_layers
-        self.attn_layers = attn_layers
-        self.conv_in = ops.Conv1d(in_dim, self.dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False)
-
-        in_ch_mult = (1, ) + tuple(ch_mult)
-        self.in_ch_mult = in_ch_mult
-        # downsampling
-        self.down = nn.ModuleList()
-        for i_level in range(self.num_layers):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_in = dim * in_ch_mult[i_level]
-            block_out = dim * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks):
-                block.append(
-                    ResnetBlock1D(in_dim=block_in,
-                                  out_dim=block_out,
-                                  kernel_size=kernel_size,
-                                  use_norm=True))
-                block_in = block_out
-                if i_level in attn_layers:
-                    attn.append(AttnBlock1D(block_in))
-            down = nn.Module()
-            down.block = block
-            down.attn = attn
-            if i_level in down_layers:
-                down.downsample = Downsample1D(block_in, resamp_with_conv)
-            self.down.append(down)
-
-        # middle
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock1D(in_dim=block_in,
-                                         out_dim=block_in,
-                                         kernel_size=kernel_size,
-                                         use_norm=True)
-        self.mid.attn_1 = AttnBlock1D(block_in)
-        self.mid.block_2 = ResnetBlock1D(in_dim=block_in,
-                                         out_dim=block_in,
-                                         kernel_size=kernel_size,
-                                         use_norm=True)
-
-        # end
-        self.conv_out = ops.Conv1d(block_in,
-                                 2 * embed_dim if double_z else embed_dim,
-                                 kernel_size=kernel_size, padding=kernel_size // 2, bias=False)
-
-        self.learnable_gain = nn.Parameter(torch.zeros([]))
-
-    def forward(self, x):
-
-        # downsampling
-        h = self.conv_in(x)
-        for i_level in range(self.num_layers):
-            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](h)
-                if len(self.down[i_level].attn) > 0:
-                    h = self.down[i_level].attn[i_block](h)
-                h = h.clamp(-self.clip_act, self.clip_act)
-            if i_level in self.down_layers:
-                h = self.down[i_level].downsample(h)
-
-        # middle
-        h = self.mid.block_1(h)
-        h = self.mid.attn_1(h)
-        h = self.mid.block_2(h)
-        h = h.clamp(-self.clip_act, self.clip_act)
-
-        # end
-        h = nonlinearity(h)
-        h = self.conv_out(h) * (self.learnable_gain + 1)
-        return h
-
-
-class Decoder1D(nn.Module):
-
-    def __init__(self,
-                 *,
-                 dim: int,
-                 out_dim: int,
-                 ch_mult: tuple[int] = (1, 2, 4, 8),
-                 num_res_blocks: int,
-                 attn_layers: list[int] = [],
-                 down_layers: list[int] = [],
-                 kernel_size: int = 3,
-                 resamp_with_conv: bool = True,
-                 in_dim: int,
-                 embed_dim: int,
-                 clip_act: float = 256.0):
-        super().__init__()
-        self.ch = dim
-        self.num_layers = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.in_channels = in_dim
-        self.clip_act = clip_act
-        self.down_layers = [i + 1 for i in down_layers]  # each downlayer add one
-
-        # compute in_ch_mult, block_in and curr_res at lowest res
-        block_in = dim * ch_mult[self.num_layers - 1]
-
-        # z to block_in
-        self.conv_in = ops.Conv1d(embed_dim, block_in, kernel_size=kernel_size, padding=kernel_size // 2, bias=False)
-
-        # middle
-        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True)
-        self.mid.attn_1 = AttnBlock1D(block_in)
-        self.mid.block_2 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True)
-
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_layers)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = dim * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks + 1):
-                block.append(ResnetBlock1D(in_dim=block_in, out_dim=block_out, use_norm=True))
-                block_in = block_out
-                if i_level in attn_layers:
-                    attn.append(AttnBlock1D(block_in))
-            up = nn.Module()
-            up.block = block
-            up.attn = attn
-            if i_level in self.down_layers:
-                up.upsample = Upsample1D(block_in, resamp_with_conv)
-            self.up.insert(0, up)  # prepend to get consistent order
-
-        # end
-        self.conv_out = ops.Conv1d(block_in, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False)
-        self.learnable_gain = nn.Parameter(torch.zeros([]))
-
-    def forward(self, z):
-        # z to block_in
-        h = self.conv_in(z)
-
-        # middle
-        h = self.mid.block_1(h)
-        h = self.mid.attn_1(h)
-        h = self.mid.block_2(h)
-        h = h.clamp(-self.clip_act, self.clip_act)
-
-        # upsampling
-        for i_level in reversed(range(self.num_layers)):
-            for i_block in range(self.num_res_blocks + 1):
-                h = self.up[i_level].block[i_block](h)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](h)
-                h = h.clamp(-self.clip_act, self.clip_act)
-            if i_level in self.down_layers:
-                h = self.up[i_level].upsample(h)
-
-        h = nonlinearity(h)
-        h = self.conv_out(h) * (self.learnable_gain + 1)
-        return h
-
-
-def VAE_16k(**kwargs) -> VAE:
-    return VAE(data_dim=80, embed_dim=20, hidden_dim=384, **kwargs)
-
-
-def VAE_44k(**kwargs) -> VAE:
-    return VAE(data_dim=128, embed_dim=40, hidden_dim=512, **kwargs)
-
-
-def get_my_vae(name: str, **kwargs) -> VAE:
-    if name == '16k':
-        return VAE_16k(**kwargs)
-    if name == '44k':
-        return VAE_44k(**kwargs)
-    raise ValueError(f'Unknown model: {name}')
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jedrzej Kosinski	3b54b0256d	Merge branch 'master' into fix-context-window-slicing	2025-09-11 20:23:31 -07:00
Jedrzej Kosinski	2835f7f63e	Apply cond slice fix	2025-09-09 17:45:35 -07:00