Compare commits

..

653 Commits

Author SHA1 Message Date
bf995febf2 Add plan for aligning local asset/tag endpoints with cloud API
Amp-Thread-ID: https://ampcode.com/threads/T-019befd9-1a77-70eb-808d-c83aa0c26515
Co-authored-by: Amp <amp@ampcode.com>
2026-01-24 04:22:03 -08:00
ce9fe98de9 Add input + output to seed_assets calls 2026-01-24 03:50:42 -08:00
aea55cad99 Merge branch 'master' into assets-redo-part2 2026-01-24 03:43:01 -08:00
4e6a1b66a9 speed up and reduce VRAM of QWEN VAE and WAN (less so) (#12036)
* ops: introduce autopad for conv3d

This works around PyTorch's missing ability to do causal padding as part of the
kernel and avoids massive weight duplication for padding.

* wan-vae: rework causal padding

This currently uses F.pad which takes a full deep copy and is liable to
be the VRAM peak. Instead, kick spatial padding back to the op and
consolidate the temporal padding with the cat for the cache.

* wan-vae: implement zero pad fast path

The WAN VAE is also the QWEN VAE, where it is used single-image. These
convolutions are, however, zero-padded 3D convolutions, which means the
VAE is actually just 2D down the last element of the conv weight in
the temporal dimension. Fast path this, to avoid adding zeros that
then just evaporate in convolution math but cost computation.
2026-01-23 19:56:14 -05:00
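A minimal sketch of the causal padding and single-image zero-pad fast path described above, assuming a plain PyTorch wrapper; the class, names, and shapes are illustrative, not the actual ComfyUI ops:

```python
import torch
import torch.nn.functional as F


class CausalConv3d(torch.nn.Module):
    """Illustrative causal 3D conv: spatial padding stays inside the kernel,
    temporal padding is done manually so it can be causal (past frames only)."""

    def __init__(self, c_in, c_out, kernel=(3, 3, 3)):
        super().__init__()
        self.conv = torch.nn.Conv3d(
            c_in, c_out, kernel, padding=(0, kernel[1] // 2, kernel[2] // 2)
        )
        self.t_pad = kernel[0] - 1

    def forward(self, x):
        # x: (batch, channels, time, height, width)
        if x.shape[2] == 1:
            # Single-image (QWEN-style) fast path: with zero padding, every
            # temporal tap except the last only ever sees zeros, so slice the
            # weight down to a purely spatial kernel instead of materialising
            # padded zeros that evaporate in the convolution math.
            w2d = self.conv.weight[:, :, -1:]
            return F.conv3d(x, w2d, self.conv.bias,
                            padding=(0, self.conv.padding[1], self.conv.padding[2]))
        # Video path: pad only the past side of the time axis (causal),
        # instead of letting F.pad deep-copy the whole tensor for all axes.
        x = F.pad(x, (0, 0, 0, 0, self.t_pad, 0))
        return self.conv(x)
```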
9cf299a9f9 Make regular empty latent node work properly on flux 2 variants. (#12050) 2026-01-23 19:50:48 -05:00
e89b22993a Support ModelScope-Trainer/DiffSynth LoRA format for Flux.2 Klein models (#12042) 2026-01-23 15:27:49 -05:00
55bd606e92 LTX2: Refactor forward function for better VRAM efficiency and fix spatial inpainting (#12046)
* Disable timestep embed compression when inpainting

Spatial inpainting not compatible with the compression

* Reduce crossattn peak VRAM

* LTX2: Refactor forward function for better VRAM efficiency
2026-01-23 15:26:38 -05:00
79cdbc81cb feat: Improve ResizeImageMaskNode UX with tooltips and search aliases (#12040)
- Add search_aliases for discoverability: resize, scale, dimensions, etc.
- Add node description for hover tooltip
- Add tooltips to all inputs explaining their behavior
- Reorder options: most common (scale dimensions) first, most technical (scale to multiple) last

Addresses user feedback that 'resize' search returned nothing useful and
options like 'match size' and 'scale to multiple' were not self-explanatory.
2026-01-22 22:04:27 -08:00
f443b9f2ca Revert "feat: Improve ResizeImageMaskNode UX with tooltips and search aliases…" (#12038)
This reverts commit 4e3038114a.
2026-01-22 23:02:37 -05:00
4e3038114a feat: Improve ResizeImageMaskNode UX with tooltips and search aliases (#12013)
- Add search_aliases for discoverability: resize, scale, dimensions, etc.
- Add node description for hover tooltip
- Add tooltips to all inputs explaining their behavior
- Reorder options: most common (scale dimensions) first, most technical (scale to multiple) last

Addresses user feedback that 'resize' search returned nothing useful and
options like 'match size' and 'scale to multiple' were not self-explanatory.
2026-01-22 18:46:55 -08:00
bbb8864778 add search aliases to all nodes (#12035)
* feat: Add search_aliases field to node schema

Adds `search_aliases` field to improve node discoverability. Users can define alternative search terms for nodes (e.g., "text concat" → StringConcatenate).

Changes:
- Add `search_aliases: list[str]` to V3 Schema
- Add `SEARCH_ALIASES` support for V1 nodes
- Include field in `/object_info` response
- Add aliases to high-priority core nodes

V1 usage:
```python
class MyNode:
    SEARCH_ALIASES = ["alt name", "synonym"]
```

V3 usage:
```python
io.Schema(
    node_id="MyNode",
    search_aliases=["alt name", "synonym"],
    ...
)
```

## Related PRs
- Frontend: Comfy-Org/ComfyUI_frontend#XXXX (draft - merge after this)
- Docs: Comfy-Org/docs#XXXX (draft - merge after stable)

* Propagate search_aliases through V3 Schema.get_v1_info to NodeInfoV1

* feat: add SEARCH_ALIASES for core nodes (#12016)

Add search aliases to 22 core nodes in nodes.py to improve node discoverability:
- Checkpoint/model loaders: CheckpointLoader, DiffusersLoader
- Conditioning nodes: ConditioningAverage, ConditioningSetArea, ConditioningSetMask, ConditioningZeroOut
- Style nodes: StyleModelApply
- Image nodes: LoadImageMask, LoadImageOutput, ImageBatch, ImageInvert, ImagePadForOutpaint
- Latent nodes: LoadLatent, SaveLatent, LatentBlend, LatentComposite, LatentCrop, LatentFlip, LatentFromBatch, LatentUpscale, LatentUpscaleBy, RepeatLatentBatch

* feat: add SEARCH_ALIASES for image, mask, and string nodes (#12017)

Add search aliases to nodes in comfy_extras for better discoverability:
- nodes_mask.py: mask manipulation nodes
- nodes_images.py: image processing nodes
- nodes_post_processing.py: post-processing effect nodes
- nodes_string.py: string manipulation nodes
- nodes_compositing.py: compositing nodes
- nodes_morphology.py: morphological operation nodes
- nodes_latent.py: latent space nodes

Uses search_aliases parameter in io.Schema() for v3 nodes.

* feat: add SEARCH_ALIASES for audio and video nodes (#12018)

Add search aliases to audio and video nodes for better discoverability:
- nodes_audio.py: audio loading, saving, and processing nodes
- nodes_video.py: video loading and processing nodes
- nodes_wan.py: WAN model nodes

Uses search_aliases parameter in io.Schema() for v3 nodes.

* feat: add SEARCH_ALIASES for model and misc nodes (#12019)

Add search aliases to model-related and miscellaneous nodes:
- Model nodes: nodes_model_merging.py, nodes_model_advanced.py, nodes_lora_extract.py
- Sampler nodes: nodes_custom_sampler.py, nodes_align_your_steps.py
- Control nodes: nodes_controlnet.py, nodes_attention_multiply.py, nodes_hooks.py
- Training nodes: nodes_train.py, nodes_dataset.py
- Utility nodes: nodes_logic.py, nodes_canny.py, nodes_differential_diffusion.py
- Architecture-specific: nodes_sd3.py, nodes_pixart.py, nodes_lumina2.py, nodes_kandinsky5.py, nodes_hidream.py, nodes_fresca.py, nodes_hunyuan3d.py
- Media nodes: nodes_load_3d.py, nodes_webcam.py, nodes_preview_any.py, nodes_wanmove.py

Uses search_aliases parameter in io.Schema() for v3 nodes, SEARCH_ALIASES class attribute for legacy nodes.
2026-01-22 18:36:58 -08:00
d7f3241bf6 qwen_image: propagate attention mask. (#11966) 2026-01-22 20:02:31 -05:00
09a2e67151 Support loading flux 2 klein checkpoints saved with SaveCheckpoint. (#12033) 2026-01-22 18:20:48 -05:00
0fd1b78736 Reduce LTX2 VAE VRAM consumption (#12028)
* causal_video_ae: Remove attention ResNet

This attention_head_dim argument does not exist on this constructor, so
this is dead code. Remove it, as generic mid-VAE attention conflicts with
the temporal roll.

* ltx-vae: consolidate causal/non-causal code paths

* ltx-vae: add cache rolling adder

* ltx-vae: use cached adder for resnet

* ltx-vae: Implement rolling VAE

Implement a temporal rolling VAE for the LTX2 VAE.

Usually when doing temporal rolling VAEs you can just chunk over time, relying
on causality, and cache behind you as you go. The LTX VAE, however, is
non-causal.

So go whole hog and implement per-layer run-ahead and back-pressure between
the decoder layers using recursive state between the layers.

Operations are amended with a temporal_cache_state{} which they can use to
hold any state they need for partial execution. Convolutions cache up to
N-1 trailing input frames, and skip connections need to cache the
mismatch between convolution input and output that arises from missing
future (non-causal) input.

Each call to run_up() processes a layer across a range of input that
may or may not be complete. It goes depth first to process as much as
possible, trying to digest frames to the final output ASAP. If layers run
out of input due to convolution losses, they simply return without action,
effectively applying back-pressure to the earlier layers. As the earlier
layers do more work and call deeper, the partial states are reconciled
and output continues to be digested depth first as much as possible.

Chunking is done using a size quota rather than a fixed frame length; any
layer can initiate chunking, and multiple layers can chunk at different
granularities. This removes the old limitation of always having to process
1 latent frame in its entirety and having to hold 8 full decoded frames as
the VRAM peak.
2026-01-22 16:54:18 -05:00
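A minimal sketch of the per-layer temporal caching idea described above, reduced to a single 1D-over-time convolution; temporal_cache_state, the chunk sizes, and all names are illustrative rather than the actual LTX2 decoder code:

```python
import torch


def conv_time_chunk(conv: torch.nn.Conv1d, chunk: torch.Tensor,
                    temporal_cache_state: dict) -> torch.Tensor:
    """Run a temporal conv over one chunk of frames, carrying the trailing
    kernel_size - 1 input frames across calls so the concatenated chunked
    output matches a single full-sequence pass."""
    tail = temporal_cache_state.get("tail")   # frames cached by the last call
    if tail is not None:
        chunk = torch.cat([tail, chunk], dim=-1)
    keep = conv.kernel_size[0] - 1
    temporal_cache_state["tail"] = chunk[..., -keep:] if keep else None
    if chunk.shape[-1] < conv.kernel_size[0]:
        # Not enough input yet: return nothing, which effectively applies
        # back-pressure until earlier layers deliver more frames.
        return chunk.new_zeros(chunk.shape[0], conv.out_channels, 0)
    return conv(chunk)


# Chunked output concatenated over calls equals one full pass.
conv = torch.nn.Conv1d(4, 4, kernel_size=3)
x = torch.randn(1, 4, 16)
state = {}
chunks = [conv_time_chunk(conv, c, state) for c in x.split(5, dim=-1)]
assert torch.allclose(torch.cat(chunks, dim=-1), conv(x), atol=1e-5)
```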
8490eedadf add ply & 3dgs format in 3d node (#11474) 2026-01-22 09:46:56 -08:00
72f6be1690 chore(api-nodes): rename BriaImage and OpenAIGImage nodes (#12022) 2026-01-21 23:42:04 -08:00
16b9aabd52 Support Multi/InfiniteTalk (#10179)
* re-init

* Update model_multitalk.py

* whitespace...

* Update model_multitalk.py

* remove print

* this is redundant

* remove import

* Restore preview functionality

* Move block_idx to transformer_options

* Remove LoopingSamplerCustomAdvanced

* Remove looping functionality, keep extension functionality

* Update model_multitalk.py

* Handle ref_attn_mask with separate patch to avoid having to always return q and k from self_attn

* Chunk attention map calculation for multiple speakers to reduce peak VRAM usage

* Update model_multitalk.py

* Add ModelPatch type back

* Fix for latest upstream

* Use DynamicCombo for cleaner node

Basically just so that single_speaker mode hides mask inputs and 2nd audio input

* Update nodes_wan.py
2026-01-21 23:09:48 -05:00
245f6139b6 More targeted embedding_connector loading for LTX2 text encoder (#11992)
Reduces errors
2026-01-21 23:05:06 -05:00
3365ad18a5 Support LTX2 tiny vae (taeltx_2) (#11929) 2026-01-21 23:03:51 -05:00
f09904720d Fix for edge case of EasyCache when conditionings change during a sampling run (like with timestep scheduling) (#12020) 2026-01-21 23:01:35 -05:00
abe2ec26a6 Support the Anima model. (#12012) 2026-01-21 19:44:28 -05:00
bdeac8897e feat: Add search_aliases field to node schema (#12010)
* feat: Add search_aliases field to node schema

Adds `search_aliases` field to improve node discoverability. Users can define alternative search terms for nodes (e.g., "text concat" → StringConcatenate).

Changes:
- Add `search_aliases: list[str]` to V3 Schema
- Add `SEARCH_ALIASES` support for V1 nodes
- Include field in `/object_info` response
- Add aliases to high-priority core nodes

V1 usage:
```python
class MyNode:
    SEARCH_ALIASES = ["alt name", "synonym"]
```

V3 usage:
```python
io.Schema(
    node_id="MyNode",
    search_aliases=["alt name", "synonym"],
    ...
)
```

## Related PRs
- Frontend: Comfy-Org/ComfyUI_frontend#XXXX (draft - merge after this)
- Docs: Comfy-Org/docs#XXXX (draft - merge after stable)

* Propagate search_aliases through V3 Schema.get_v1_info to NodeInfoV1
2026-01-21 15:36:02 -08:00
451af70154 fix(api-nodes-Vidu): allow passing up to 7 subjects in Vidu Reference node (#12002) 2026-01-21 04:03:45 -08:00
0fc15700be Add LyCoris LoKr MLP layer support for Flux2 (#11997) 2026-01-20 23:18:33 -05:00
e755268e7b Config for Qwen 3 0.6B model. (#11998) 2026-01-20 23:08:31 -05:00
c4a14df9a3 Dynamically detect chroma radiance patch size (#11991) 2026-01-20 18:46:11 -05:00
965d0ed509 fix: remove normalization of audio in LTX Mel spectrogram creation (#11990)
For LTX Audio VAE, remove normalization of audio during MEL spectrogram creation.
This aligns inference with training and prevents loud audio from being attenuated.
2026-01-20 18:44:28 -05:00
ddc541ffda feat(api-nodes): add WaveSpeed nodes (#11945) 2026-01-20 13:05:40 -08:00
8ccc0c94fa Make omni stuff work on regular z image for easier testing. (#11985) 2026-01-20 00:32:00 -05:00
4edb87aa50 Bump comfyui-frontend-package to 1.37.11 (#11976) 2026-01-19 23:57:50 -05:00
0fc3b6e3a6 chore: update workflow templates to v0.8.15 (#11984) 2026-01-19 23:17:56 -05:00
2108167f9f Support zimage omni base model. (#11979) 2026-01-19 23:17:38 -05:00
9d273d3ab1 ComfyUI v0.10.0 2026-01-19 22:40:18 -05:00
70c91b8248 Fix #11963 (#11982) 2026-01-19 22:32:40 -05:00
0da5a0fe58 Convert mono audio to fake stereo for LTXV VAE encoding (#11965) 2026-01-19 22:12:02 -05:00
e0eacb0688 Simpler way to implement the #11980 loras. (#11981) 2026-01-19 22:00:36 -05:00
7458e20465 Make Autogrow validation work properly (#11977)
* In-progress autogrow validation fixes - properly looks at required/optional inputs, now working on the edge case that all inputs are optional and nothing is plugged in (should just be an empty dictionary passed into node)

* Allow autogrow to work with all inputs being optional

* Revert accidentally pushed changes to nodes_logic.py
2026-01-19 16:58:30 -08:00
b931b37e30 feat(api-nodes): add Bria Edit node (#11978)
Co-authored-by: Alexander Piskun <bigcat88@icloud.com>
2026-01-19 16:47:14 -08:00
866a4619db chore: update workflow templates to v0.8.14 (#11974) 2026-01-19 14:21:35 -08:00
1a72bf2046 Readme update. (#11957) 2026-01-18 19:53:43 -08:00
034fac7054 chore(api-nodes): auto-discover all nodes_*.py files to avoid merge conflicts when adding new API nodes (#11943) 2026-01-17 22:40:39 -08:00
a498556d0d feat: add advanced parameter to Input classes for advanced widgets support (#11939)
Add 'advanced' boolean parameter to Input and WidgetInput base classes
and propagate to all typed Input subclasses (Boolean, Int, Float, String,
Combo, MultiCombo, Webcam, MultiType, MatchType, ImageCompare).

When set to True, the frontend will hide these inputs by default in a
collapsible 'Advanced Inputs' section in the right side panel, reducing
visual clutter for power-user options.

This enables nodes to expose advanced configuration options (like encoding
parameters, quality settings, etc.) without overwhelming typical users.

Frontend support: ComfyUI_frontend PR #7812
2026-01-17 19:06:03 -08:00
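A hypothetical V3 schema using the new advanced flag, written in the io.Schema style quoted elsewhere in this log; the node, its input names, and the import path are illustrative assumptions, not code from this commit:

```python
from comfy_api.latest import io  # import path is an assumption

schema = io.Schema(
    node_id="ExampleSaveVideo",  # hypothetical node
    inputs=[
        io.Image.Input("images"),
        # Power-user options hidden under the collapsible "Advanced Inputs"
        # section in the right side panel:
        io.Int.Input("crf", default=23, min=0, max=51, advanced=True),
        io.Combo.Input("pix_fmt", options=["yuv420p", "yuv444p"], advanced=True),
    ],
)
```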
f7ca41ff62 chore(api-nodes): remove check for pyav>=14.2 in code (it was added to requirements.txt long ago) (#11934) 2026-01-17 18:57:57 -08:00
ac26065e61 chore(api-nodes): remove non-used; extract model to separate files (#11927)
* chore(api-nodes): remove non-used; extract model to separate files

* chore(api-nodes): remove non-needed prefix in filenames
2026-01-17 18:52:45 -08:00
190c4416cc Bump comfy-kitchen dependency to version 0.2.7 (#11941) 2026-01-17 21:20:35 -05:00
0fd10ffa09 fix: use .cpu() for waveform conversion in AudioFrame creation (#11787) 2026-01-17 20:18:24 -05:00
00c775950a Update readme rdna3 nightly url (#11937) 2026-01-17 20:18:04 -05:00
7ac999bf30 Add image sizes to clip vision outputs. (#11923) 2026-01-16 23:02:28 -05:00
0c6b36c6ac chore: update workflow templates to v0.8.11 (#11918) 2026-01-16 17:22:50 -05:00
facda426b4 Remove extra whitespace at end of routes.py 2026-01-16 01:04:26 -08:00
65a5992f2d Remove unnecessary logging statement used for testing 2026-01-16 01:02:40 -08:00
287da646e5 Finished @ROUTES.post("/api/assets/scan/seed") 2026-01-16 01:01:49 -08:00
63f9f1b11b Finish @ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}/tags") 2026-01-16 00:50:13 -08:00
9e3f559189 Finished @ROUTES.post(f"/api/assets/{{id:{UUID_RE}}}/tags") 2026-01-16 00:45:36 -08:00
63c98d0c75 Finished @ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}") 2026-01-16 00:31:06 -08:00
e69a5aa1be Finished @ROUTES.put(f"/api/assets/{{id:{UUID_RE}}}/preview") 2026-01-16 00:14:03 -08:00
e0c063f93e Finished @ROUTES.put(f"/api/assets/{{id:{UUID_RE}}}") 2026-01-15 23:57:23 -08:00
6db4f4e3f1 Finished @ROUTES.post("/api/assets") 2026-01-15 23:41:19 -08:00
41d364030b Finished @ROUTES.post("/api/assets/from-hash") 2026-01-15 23:09:54 -08:00
9125613b53 feat(api-nodes): extend ByteDance nodes with seedance-1-5-pro model (#11871) 2026-01-15 22:09:07 -08:00
fab9b71f5d Finished @ROUTES.head("/api/assets/hash/{hash}") 2026-01-15 21:13:34 -08:00
e5c1de4777 Finished @ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}/content") 2026-01-15 21:00:35 -08:00
a5ed151e51 Merge branch 'master' into assets-redo-part2 2026-01-15 20:34:44 -08:00
732b707397 Added try-except around seed_assets call in get_object_info with a logging statement (#11901) 2026-01-15 23:15:15 -05:00
e527b72b09 more progress 2026-01-15 18:16:00 -08:00
4c816d5c69 Adjust memory usage factor calculation for flux2 klein. (#11900) 2026-01-15 20:06:40 -05:00
6125b3a5e7 Update workflow templates to v0.8.10 (#11899)
* chore: update workflow templates to v0.8.9

* Update requirements.txt
2026-01-15 13:12:13 -08:00
12918a5f78 chore: update workflow templates to v0.8.7 (#11896) 2026-01-15 11:08:21 -08:00
8f40b43e02 ComfyUI v0.9.2 2026-01-15 10:57:35 -05:00
3b832231bb Flux2 Klein support. (#11890) 2026-01-15 10:33:15 -05:00
f14129947c in progress GET /api/assets/{uuid}/content endpoint support 2026-01-14 22:54:21 -08:00
be518db5a7 Remove extraneous clip missing warnings when loading LTX2 embeddings_connector weights (#11874) 2026-01-14 17:54:04 -05:00
80441eb15e utils: fix lanczos grayscale upscaling (#11873) 2026-01-14 17:53:16 -05:00
07f2462eae feat(api-nodes): add Meshy 3D nodes (#11843)
* feat(api-nodes): add Meshy 3D nodes

* rebased, added JSONata price badges
2026-01-14 11:25:38 -08:00
d150440466 Fix VAELoader (#11880) 2026-01-14 10:54:50 -08:00
6165c38cb5 Optimize nvfp4 lora applying. (#11866)
This changes results a bit but it also speeds up things a lot.
2026-01-14 00:49:38 -05:00
712cca36a1 feat: throttle ProgressBar updates to reduce WebSocket flooding (#11504) 2026-01-13 22:41:44 -05:00
ac4d8ea9b3 feat: add CI container version bump automation (#11692)
* feat: add CI container version bump automation

Adds a workflow that triggers on releases to create PRs in the
comfyui-ci-container repo, updating the ComfyUI version in the Dockerfile.

Supports both release events and manual workflow dispatch for testing.

* feat: add CI container version bump automation

Adds a workflow that triggers on releases to create PRs in the
comfyui-ci-container repo, updating the ComfyUI version in the Dockerfile.

Supports both release events and manual workflow dispatch for testing.

* ci: update CI container repository owner

* refactor: rename `update-ci-container.yaml` workflow to `update-ci-container.yml`

* Remove post-merge instructions from the CI container update workflow.
2026-01-13 22:39:22 -05:00
c9196f355e Fix scale_shorter_dimension portrait check (#11862) 2026-01-13 18:25:09 -08:00
7eb959ce93 fix: update ComfyUI repo reference to Comfy-Org/ComfyUI (#11858) 2026-01-13 21:03:16 -05:00
469dd9c16a Adds crop to multiple mode to ResizeImageMaskNode. (#11838)
* Add crop-to-multiple resize mode

* Make scale-to-multiple shape handling explicit
2026-01-13 16:48:10 -08:00
eff2b9d412 Optimize nvfp4 lora applying. (#11856) 2026-01-13 19:37:19 -05:00
15b312de7a Optimize nvfp4 lora applying. (#11854) 2026-01-13 19:23:58 -05:00
1419047fdb [Api Nodes]: Improve Price Badge Declarations (#11582)
* api nodes: price badges moved to nodes code

* added price badges for 4 more node-packs

* added price badges for 10 more node-packs

* added new price badges for Omni STD mode

* add support for autogrow groups

* use full names for "widgets", "inputs" and "groups"

* add strict typing for JSONata rules

* add price badge for WanReferenceVideoApi node

* add support for DynamicCombo

* sync price badges changes (https://github.com/Comfy-Org/ComfyUI_frontend/pull/7900)

* sync badges for Vidu2 nodes

* fixed incorrect price for RecraftCrispUpscaleNode

* fixed incorrect price badges for LTXV nodes

* fixed price badge for MinimaxHailuoVideoNode

* fixed price badges for PixVerse nodes
2026-01-13 16:18:28 -08:00
79f6bb5e4f add blueprints dir for built-in blueprints (#11853) 2026-01-13 16:14:40 -08:00
e4b4fb3479 Load metadata on VAELoader (#11846)
Needed to load the proper LTX2 VAE if separated from checkpoint
2026-01-13 17:37:21 -05:00
d9dc02a7d6 Support "lite" version of alibaba-pai Z-Image Controlnet (#11849)
* reduced number of control layers (3) compared to full model
2026-01-13 15:03:53 -05:00
c543ad81c3 fix(api-nodes-gemini): raise exception when no candidates due to safety block (#11848) 2026-01-13 08:30:13 -08:00
5ac1372533 ComfyUI v0.9.1 2026-01-13 01:44:06 -05:00
1dcbd9efaf Bump ltxav mem estimation a bit. (#11842) 2026-01-13 01:42:07 -05:00
db9e6edfa1 ComfyUI v0.9.0 2026-01-13 01:23:31 -05:00
8af13b439b Update requirements.txt (#11841) 2026-01-13 01:22:25 -05:00
acd0e53653 Make bulk_ops not use .returning to be compatible with python 3.10 and 3.11 sqlalchemy (#11839) 2026-01-13 00:15:24 -05:00
117e7a5853 Refactor to try to lower mem usage. (#11840) 2026-01-12 21:01:52 -08:00
b3c0e4de57 Make loras work on nvfp4 models. (#11837)
The initial application is a bit slow but will probably be sped up in the
future.
2026-01-12 22:33:54 -05:00
ecaeeb990d chore: update workflow templates to v0.8.4 (#11835) 2026-01-12 19:18:01 -08:00
c2b65e2fce Update workflow templates to v0.8.0 (#11828) 2026-01-12 17:29:25 -05:00
fd5c0755af Reduce LTX2 VRAM use by more efficient timestep embed handling (#11829) 2026-01-12 17:28:59 -05:00
c881a1d689 Support the siglip 2 naflex model as a clip vision model. (#11831)
Not useful yet.
2026-01-12 17:05:54 -05:00
a3b5d4996a Support ModelScope-Trainer DiffSynth lora for Z Image. (#11805) 2026-01-12 15:38:46 -05:00
c6238047ee Put more details about portable in readme. (#11816) 2026-01-11 21:11:53 -05:00
5cd1113236 fix(api-nodes): use a unique name for uploading audio files (#11778) 2026-01-11 03:07:11 -08:00
2f642d5d9b Fix chroma fp8 te being treated as fp16. (#11795) 2026-01-10 14:40:42 -08:00
cd912963f1 Fix issue with t5 text encoder in fp4. (#11794) 2026-01-10 17:31:31 -05:00
6e4b1f9d00 pythorch_attn_by_def_on_gfx1200 (#11793) 2026-01-10 16:51:05 -05:00
dc202a2e51 Properly save mixed ops. (#11772) 2026-01-10 02:03:57 -05:00
153bc524bf chore: update embedded docs to v0.4.0 (#11776) 2026-01-10 01:29:30 -05:00
393d2880dd feat(api-nodes): added nodes for Vidu2 (#11760) 2026-01-09 12:59:38 -08:00
4484b93d61 fix(api-nodes): do not downscale the input image for Topaz Enhance (#11768) 2026-01-09 12:25:56 -08:00
bd0e6825e8 Be less strict when loading mixed ops weights. (#11769) 2026-01-09 14:21:06 -05:00
ec0a832acb Add workaround for hacky nodepack(s) that edit folder_names_and_paths to have values with tuples of more than 2. Other things could potentially break with those nodepack(s), so I will hunt for the guilty nodepack(s) now. (#11755) 2026-01-08 22:49:12 -08:00
04c49a29b4 feat: add cancelled filter to /jobs (#11680) 2026-01-08 21:57:36 -08:00
4609fcd260 add node - image compare (#11343) 2026-01-08 21:31:19 -08:00
6207f86c18 Fix VAEEncodeForInpaint to support WAN VAE tuple downscale_ratio (#11572)
Use vae.spacial_compression_encode() instead of directly accessing
downscale_ratio to handle both standard VAEs (int) and WAN VAEs (tuple).

Addresses reviewer feedback on PR #11259.

Co-authored-by: ChrisFab16 <christopher@fabritius.dk>
2026-01-08 23:34:48 -05:00
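A sketch of the change described above; the crop/mask handling is paraphrased, not the exact VAEEncodeForInpaint code, and pixels is assumed to be (batch, height, width, channels):

```python
import torch


def crop_to_latent_grid(vae, pixels: torch.Tensor, mask: torch.Tensor):
    # Ask the VAE for its spatial compression factor instead of reading
    # downscale_ratio directly (an int for standard VAEs, a tuple for WAN VAEs).
    ds = vae.spacial_compression_encode()
    h = (pixels.shape[1] // ds) * ds   # snap height to the latent grid
    w = (pixels.shape[2] // ds) * ds   # snap width to the latent grid
    return pixels[:, :h, :w, :], mask[..., :h, :w]
```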
1dc3da6314 Add most basic Asset support for models (#11315)
* Brought over minimal elements from PR 10045 to reproduce seed_assets and register_assets_system without adding anything to the DB or server routes yet, for now making everything sync (can introduce async once everything is cleaned up and brought over)

* Added db script to insert assets stuff, cleaned up some code; assets (models) now get added/rescanned

* Added support for 5 http endpoints for assets

* Replaced Optional with | None in schemas_in.py and schemas_out.py

* Remove two routes that will not be relevant yet in this PR: HEAD /api/assets/hash/<hash> and PUT /api/assets/<id>/preview

* Remove some functions the two deleted endpoints were using

* Don't show assets scan message upon calling /object_info endpoint

* removed unused import to satisfy ruff

* Simplified hashing function type hint and _hash_file_obj

* Satisfied ruff
2026-01-08 22:21:51 -05:00
114fc73685 Bump comfyui-frontend-package to 1.36.13 (#11645) 2026-01-08 22:16:15 -05:00
b48d6a83d4 Fix csp error in frontend when forcing offline. (#11749) 2026-01-08 22:15:50 -05:00
027042db68 Add node: JoinAudioChannels (#11728) 2026-01-08 22:14:06 -05:00
1a20656448 Fix import issue. (#11746) 2026-01-08 17:23:59 -05:00
0f11869d55 Better detection if AMD torch compiled with efficient attention. (#11745) 2026-01-08 17:16:58 -05:00
5943fbf457 bump comfyui_manager version to the 4.0.5 (#11732) 2026-01-08 08:15:42 -08:00
a60b7b86c5 Revert "Force sequential execution in CI test jobs (#11687)" (#11725)
This reverts commit ce0000c4f2.
2026-01-07 21:41:57 -08:00
2e9d51680a ComfyUI version v0.8.2 2026-01-07 23:50:02 -05:00
50d6e1caf4 Tweak ltxv vae mem estimation. (#11722) 2026-01-07 23:07:05 -05:00
ac12f77bed ComfyUI version v0.8.1 2026-01-07 22:10:08 -05:00
fcd9a236b0 Update template to 0.7.69 (#11719) 2026-01-07 18:22:23 -08:00
21e8425087 Add warning for old pytorch. (#11718) 2026-01-07 21:07:26 -05:00
b6c79a648a ops: Fix offloading with FP8MM performance (#11697)
This logic was checking comfy_cast_weights and going straight to
the forward_comfy_cast_weights implementation without
attempting to downscale the input to fp8 in the event comfy_cast_weights
is set.

The main reason comfy_cast_weights would be set would be for async
offload, which is not a good reason to nix FP8MM.

So instead, AND together the underlying exclusions for FP8MM, which
are:

* having a weight_function (usually LowVramPatch)
* force_cast_weights (compute dtype override)
* the weight is not Quantized
* the input is already quantized
* the model or layer has MM explicitly disabled.

If you get past all of those exclusions, quantize the input tensor.
Then hand the new input, quantized or not, off to
forward_comfy_cast_weights to handle it. If the weight is offloaded
but the input is quantized you will get an offloaded MM8.
2026-01-07 21:01:16 -05:00
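A paraphrase of the decision flow listed above, with illustrative stand-in classes rather than the real ops-layer fields:

```python
import torch
from dataclasses import dataclass, field


@dataclass
class FakeLayer:                       # stand-in for the real ops layer
    weight: torch.Tensor
    weight_function: list = field(default_factory=list)  # e.g. LowVramPatch
    force_cast_weights: bool = False   # compute dtype override
    fp8_matmul_disabled: bool = False  # model/layer opted out of MM8


def is_fp8(t: torch.Tensor) -> bool:
    return t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2)


def should_quantize_input(layer: FakeLayer, x: torch.Tensor) -> bool:
    """AND together the exclusions: quantize the input only when none apply.
    Either way the (possibly quantized) input then goes down the normal
    forward_comfy_cast_weights path."""
    return not (
        bool(layer.weight_function)
        or layer.force_cast_weights
        or not is_fp8(layer.weight)    # weight is not quantized
        or is_fp8(x)                   # input is already quantized
        or layer.fp8_matmul_disabled
    )
```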
25bc1b5b57 Add memory estimation function to ltxav text encoder. (#11716) 2026-01-07 20:11:22 -05:00
3cd19e99c1 Increase ltxav mem estimation by a bit. (#11715) 2026-01-07 20:04:56 -05:00
007b87e7ac Bump required comfy-kitchen version. (#11714) 2026-01-07 19:48:47 -05:00
34751fe9f9 Lower ltxv text encoder vram use. (#11713) 2026-01-07 19:12:15 -05:00
1c705f7bfb Add device selection for LTXAVTextEncoderLoader (#11700) 2026-01-07 18:39:59 -05:00
48e5ea1dfd model_patcher: Remove confusing load stat (#11710)
If the loader passes 1e32 as the usable memory size, it means force
the full load. This happens with CPU loads and a few other misc cases.
Remove the confusing number and just leave the other details.
2026-01-07 18:39:20 -05:00
3cd7b32f1b Support gemma 12B with quant weights. (#11696) 2026-01-07 05:15:14 -05:00
c0c9720d77 Fix stable release workflow not pulling latest comfy kitchen. (#11695) 2026-01-07 04:48:28 -05:00
fc0cb10bcb ComfyUI v0.8.0 2026-01-07 04:07:31 -05:00
b7d7cc1d49 Fix fp8 fast issue. (#11688) 2026-01-07 01:39:06 -05:00
79e94544bd feat(api-nodes): add WAN2.6 ReferenceToVideo (#11644) 2026-01-06 22:04:50 -08:00
ce0000c4f2 Force sequential execution in CI test jobs (#11687)
Added max-parallel setting to enforce sequential execution in test jobs.
2026-01-07 00:57:31 -05:00
c5cfb34c07 Update comfy-kitchen version to 0.2.3 (#11685) 2026-01-06 23:51:45 -05:00
edee33f55e Disable comfy kitchen cuda if pytorch cuda less than 13 (#11681) 2026-01-06 22:13:43 -05:00
2c03884f5f Skip fp4 matrix mult on devices that don't support it. (#11677) 2026-01-06 18:07:26 -05:00
6e9ee55cdd Disable ltxav previews. (#11676) 2026-01-06 17:41:27 -05:00
023cf13721 Fix lowvram issue with ltxv2 text encoder. (#11675) 2026-01-06 17:33:03 -05:00
c3566c0d76 chore: update workflow templates to v0.7.67 (#11667) 2026-01-06 14:28:29 -08:00
c3c3e93c5b Use rope functions from comfy kitchen. (#11674) 2026-01-06 16:57:50 -05:00
6ffc159bdd Update comfy-kitchen version to 0.2.1 (#11672) 2026-01-06 15:53:43 -05:00
96e0d0924e Add helpful message to portable. (#11671) 2026-01-06 14:43:24 -05:00
e14f3b6610 chore: update workflow templates to v0.7.66 (#11652) 2026-01-05 22:37:11 -08:00
1618002411 Revert "Use rope functions from comfy kitchen. (#11647)" (#11648)
This reverts commit 6ef85c4915.
2026-01-05 23:07:39 -05:00
6ef85c4915 Use rope functions from comfy kitchen. (#11647) 2026-01-05 22:50:35 -05:00
6da00dd899 Initial ops changes to use comfy_kitchen: Initial nvfp4 checkpoint support. (#11635)
---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
2026-01-05 21:48:58 -05:00
4f3f9e72a9 Fix name. (#11638) 2026-01-05 02:41:23 -08:00
d157c3299d Refactor module_size function. (#11637) 2026-01-05 03:48:31 -05:00
d1b9822f74 Add LTXAVTextEncoderLoader node. (#11634) 2026-01-05 02:27:31 -05:00
f2b002372b Support the LTXV 2 model. (#11632) 2026-01-05 01:58:59 -05:00
38d0493825 Fix case where upscale model wouldn't be moved to cpu. (#11633) 2026-01-04 19:13:50 -05:00
acbf08cd60 feat(api-nodes): add support for 720p resolution for Kling Omni nodes (#11604) 2026-01-03 23:05:02 -08:00
53e762a3af Print memory summary on OOM to help with debugging. (#11613) 2026-01-03 22:28:38 -05:00
9a552df898 Remove leftover scaled_fp8 key. (#11603) 2026-01-02 17:28:10 -08:00
f2fda021ab Tripo3D: pass face_limit parameter only when it differs from default (#11601) 2026-01-02 03:18:43 -08:00
303b1735f8 Give Mahiro CFG a more appropriate display name (#11580) 2026-01-02 00:37:37 -08:00
9e5f677746 Ignore all frames except the first one for MPO format. (#11569) 2026-01-02 00:35:34 -08:00
65cfcf5b1b New Year ruff cleanup. (#11595) 2026-01-01 22:06:14 -05:00
1bdc9a947f Remove duplicate import of model_management (#11587) 2025-12-31 19:29:55 -05:00
d622a61874 Refactor: move clip_preprocess to comfy.clip_model (#11586) 2025-12-31 17:38:36 -05:00
236b9e211d chore: update workflow templates to v0.7.65 (#11579) 2025-12-31 13:38:39 -08:00
6ca3d5c011 fix(api-nodes-vidu): preserve percent-encoding for signed URLs (#11564) 2025-12-30 20:12:38 -08:00
0be8a76c93 V3 Improvements + DynamicCombo + Autogrow exposed in public API (#11345)
* Support Combo outputs in a more sane way

* Remove test validate_inputs function on test node

* Make curr_prefix be a list of strings instead of string for easier parsing as keys get added to dynamic types

* Start to account for id prefixes from frontend, need to fix bug with nested dynamics

* Ensure inputs/outputs/hidden are lists in schema finalize function, remove no longer needed 'is not None' checks

* Add raw_link and extra_dict to all relevant Inputs

* Make nested DynamicCombos work properly with prefixed keys on latest frontend; breaks old Autogrow, but is pretty much ready for upcoming Autogrow keys

* Replace ... usage with a MISSING sentinel for clarity in nodes_logic.py

* Added CustomCombo node in backend to reflect frontend node

* Prepare Autogrow's expand_schema_for_dynamic to work with upcoming frontend changes

* Prepare for look up table for dynamic input stuff

* More progress towards dynamic input lookup function stuff

* Finished converting _expand_schema_for_dynamic to be done via lookup instead of OOP to guarantee working with process isolation, did refactoring to remove old implementation + cleaning INPUT_TYPES definition including v3 hidden definition

* Change order of functions

* Removed some unneeded functions after dynamic refactor

* Make MatchType's output default displayname "MATCHTYPE"

* Fix DynamicSlot get_all

* Removed redundant code - dynamic stuff no longer happens in OOP way

* Natively support AnyType (*) without __ne__ hacks

* Remove stray code that made it in

* Remove expand_schema_for_dynamic left over on DynamicInput class

* get_dynamic() on DynamicInput/Output was not doing anything anymore, so removed it

* Make validate_inputs validate combo input correctly

* Temporarily comment out conversion to 'new' (9 month old) COMBO format in get_input_info

* Remove references to the resources feature scrapped from V3

* Expose DynamicCombo in public API

* satisfy ruff after some code got commented out

* Make missing input error prettier for dynamic types

* Created a Switch2 node as a side-by-side test, will likely go with Switch2 as the initial switch node

* Figured out Switch situation

* Pass in v3_data in IsChangedCache.get function's fingerprint_inputs, add a from_v3_data helper method to HiddenHolder

* Switch order of Switch and Soft Switch nodes in file

* Temp test node for MatchType

* Fix missing v3_data for v1 nodes in validation

* For now, remove checking duplicate id's for dynamic types

* Add Resize Image/Mask node that thanks to MatchType+DynamicCombo is 16-nodes-in-1

* Made DynamicCombo references in DCTestNode use public interface

* Add an AnyTypeTestNode

* Make lazy status for specific inputs on DynamicInputs work by having the values of the dictionary for check_lazy_status be a tuple, where the second element is the key of the input that can be returned

* Comment out test logic nodes

* Make primitive float's step make more sense

* Add (and leave commented out) some potential logic nodes

* Change default crop option to "center" on Resize Image/Mask node

* Changed copy.copy(d) to d.copy()

* Autogrow is available in the stable frontend, so expose it in the public API

* Use outputs id as display_name if no display_name present, remove v3 outputs id restriction that made them have to have unique IDs from the inputs

* Enable Custom Combo node as stable frontend now supports it

* Make id properly act like display_name on outputs

* Add Batch Images/Masks/Latents node

* Comment out Batch Images/Masks/Latents node for now, as Autogrow has a bug with MatchType where top connection is disconnected upon refresh

* Removed code for a couple test nodes in nodes_logic.py

* Add Batch Images, Batch Masks, and Batch Latents nodes with Autogrow, deprecate old Batch Images + LatentBatch nodes
2025-12-30 23:09:55 -05:00
0357ed7ec4 Add support for sage attention 3 in comfyui, enable via new cli arg (#11026)
* Add support for sage attention 3 in comfyui, enable via new cli arg
--use-sage-attiention3

* Fix some bugs found in PR review. The N dimension at which Sage
Attention 3 takes effect is reduced to 1024 (although the improvement is
not significant at this scale).

* Remove the Sage Attention3 switch, but retain the attention function
registration.

* Fix a ruff check issue in attention.py
2025-12-30 22:53:52 -05:00
f59f71cf34 ComfyUI version v0.7.0 2025-12-30 22:41:22 -05:00
178bdc5e14 Add handling for vace_context in context windows (#11386)
Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com>
2025-12-30 14:40:42 -08:00
25a1bfab4e chore(api-nodes-bytedance): mark "seededit" as deprecated, adjust display name of Seedream (#11490) 2025-12-30 08:33:34 -08:00
d7111e426a ResizeByLongerSide: support video (#11555)
(cherry picked from commit 98c6840aa4e5fd5407ba9ab113d209011e474bf6)
2025-12-29 17:07:29 -08:00
0e6221cc79 Add some warnings for pin and unpin errors. (#11561) 2025-12-29 18:26:42 -05:00
9ca7e143af mm: discard async errors from pinning failures (#10738)
Pretty much every error cudaHostRegister can throw also queues the same
error on the async GPU queue. This was fixed for the repinning error case,
but the bad-mmap and plain ENOMEM cases are harder to detect.

Do some dummy GPU work to clean the error state.
2025-12-29 18:19:34 -05:00
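A sketch of the idea, assuming a torch-level pinning helper; the exact exception type and the model-management call sites are assumptions:

```python
import torch


def pin_host_tensor(t: torch.Tensor) -> torch.Tensor:
    """Try to pin host memory; if cudaHostRegister fails, flush the matching
    error it queued on the async GPU queue so it doesn't resurface later as
    an unrelated CUDA failure."""
    try:
        return t.pin_memory()
    except RuntimeError as e:
        print(f"Warning: pinning failed ({e}); flushing queued async error")
        if torch.cuda.is_available():
            try:
                torch.zeros(1, device="cuda")   # dummy GPU work
                torch.cuda.synchronize()        # surfaces the queued error here
            except RuntimeError:
                pass                            # discard the duplicate error
        return t
```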
8fd07170f1 Comment out unused norm_final in lumina/z image model. (#11545) 2025-12-28 22:07:25 -05:00
2943093a53 Enable async offload by default for AMD. (#11534) 2025-12-27 18:54:15 -05:00
36deef2c57 chore(api-nodes): switch to credits instead of $ (#11489) 2025-12-26 19:56:52 -08:00
0d2e4bdd44 fix(api-nodes-gemini): always force enhance_prompt to be True (#11503) 2025-12-26 19:55:30 -08:00
eff4ea0b62 [V3] converted nodes_images.py to V3 schema (#11206)
* converted nodes_images.py to V3 schema

* fix test
2025-12-26 19:39:02 -08:00
865568b7fc feat(api-nodes): add Kling Motion Control node (#11493) 2025-12-26 19:16:21 -08:00
1e4e342f54 Fix noise with ancestral samplers when inferencing on cpu. (#11528) 2025-12-26 22:03:01 -05:00
16fb6849d2 bump comfyui_manager version to the 4.0.4 (#11521) 2025-12-27 08:55:59 +09:00
d9a76cf66e Specify in readme that we only support pytorch 2.4 and up. (#11512) 2025-12-25 23:46:51 -05:00
532e285079 Add a ManualSigmas node. (#11499)
Can be used to manually set the sigmas for a model.

This node accepts a list of integer and floating-point numbers separated
by any non-numeric character.
2025-12-24 19:09:37 -05:00
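A sketch of the parsing rule the description implies, splitting on anything that is not part of a number; this is not the node's actual code:

```python
import re
import torch


def parse_sigmas(text: str) -> torch.Tensor:
    values = [float(m) for m in
              re.findall(r"-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?", text)]
    return torch.tensor(values, dtype=torch.float32)


# Any non-numeric separator works:
print(parse_sigmas("14.61, 7.8 / 3.5 ; 1.0 0.0"))
# tensor([14.6100,  7.8000,  3.5000,  1.0000,  0.0000])
```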
4f067b07fb chore: update workflow templates to v0.7.64 (#11496) 2025-12-24 18:54:21 -05:00
650e716dda Bump comfyui-frontend-package to 1.35.9 (#11470)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-12-23 21:29:41 -08:00
e4c61d7555 ComfyUI v0.6.0 2025-12-23 20:50:02 -05:00
22ff1bbfcb chore: update workflow templates to v0.7.63 (#11482) 2025-12-23 20:48:45 -05:00
f4f44bb807 api-nodes: use new custom endpoint for Nano Banana (#11311) 2025-12-23 12:10:27 -08:00
33aa808713 Make denoised output on custom sampler nodes work with nested tensors. (#11471) 2025-12-22 16:43:24 -05:00
eb0e10aec4 Update workflow templates to v0.7.62 (#11467) 2025-12-22 16:02:41 -05:00
c176b214cc extend possible duration range for Kling O1 StartEndFrame node (#11451) 2025-12-21 22:44:49 -08:00
91bf6b6aa3 Add node to create empty latents for qwen image layered model. (#11460) 2025-12-21 19:59:40 -05:00
807538fe6c Core release process. (#11447) 2025-12-20 20:02:02 -05:00
bbb11e2608 fix(api-nodes): Topaz 4k video upscaling (#11438) 2025-12-20 08:48:28 -08:00
0899012ad6 chore(api-nodes): by default set Watermark generation to False (#11437) 2025-12-19 22:24:37 -08:00
fb478f679a Only apply gemma quant config to gemma model for newbie. (#11436) 2025-12-20 01:02:43 -05:00
4c432c11ed Implement Jina CLIP v2 and NewBie dual CLIP (#11415)
* Implement Jina CLIP v2

* Support quantized Gemma in NewBie dual CLIP
2025-12-20 00:57:22 -05:00
31e961736a Fix issue with batches and newbie. (#11435) 2025-12-20 00:23:51 -05:00
767ee30f21 ZImageFunControlNet: Fix mask concatenation in --gpu-only (#11421)
This operation trades in latents which, with --gpu-only, may not be on the GPU.
The two VAE results will follow the --gpu-only defined behaviour, so follow
the inpaint image's device when calculating the mask in this path.
2025-12-20 00:22:17 -05:00
3ab9748903 Disable prompt weights on newbie te. (#11434) 2025-12-20 00:19:47 -05:00
0aa7fa464e Implement sliding attention in Gemma3 (#11409) 2025-12-20 00:16:46 -05:00
514c24d756 Fix error from logging line (#11423)
Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com>
2025-12-19 20:22:45 -08:00
809ce68749 Support nested tensor denoise masks. (#11431) 2025-12-19 19:59:25 -05:00
cc4ddba1b6 Allow enabling use of MIOpen by setting COMFYUI_ENABLE_MIOPEN=1 as an env var (#11366) 2025-12-19 17:01:50 -05:00
8376ff6831 bump comfyui_manager version to the 4.0.3b7 (#11422) 2025-12-19 10:41:56 -08:00
5b4d0664c8 add Flux2MaxImage API Node (#11420) 2025-12-19 10:02:49 -08:00
894802b0f9 Add LatentCutToBatch node. (#11411) 2025-12-18 22:21:40 -05:00
28eaab608b Diffusion model part of Qwen Image Layered. (#11408)
Only thing missing after this is some nodes to make using it easier.
2025-12-18 20:21:14 -05:00
6a2678ac65 Trim/pad channels in VAE code. (#11406) 2025-12-18 18:22:38 -05:00
e4fb3a3572 Support loading Wan/Qwen VAEs with different in/out channels. (#11405) 2025-12-18 17:45:33 -05:00
e8ebbe668e chore: update workflow templates to v0.7.60 (#11403) 2025-12-18 17:09:29 -05:00
1ca89b810e Add unified jobs API with /api/jobs endpoints (#11054)
* feat: create a /jobs api to return queue and history jobs

* update unused vars

* include priority

* create jobs helper file

* fix ruff

* update how we set error message

* include execution error in both responses

* rename error -> failed, fix output shape

* re-use queue and history functions

* set workflow id

* allow sort by exec duration

* fix tests

* send priority and remove error msg

* use ws messages to get start and end times

* revert main.py fully

* refactor: move all /jobs business logic to jobs.py

* fix failing test

* remove some tests

* fix non dict nodes

* address comments

* filter by workflow id and remove null fields

* add clearer typing - remove get("..") or ..

* refactor query params to top get_job(s) doc, add remove_sensitive_from_queue

* add brief comment explaining why we skip animated

* comment that format field is for frontend backward compatibility

* fix whitespace

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
Co-authored-by: guill <jacob.e.segal@gmail.com>
2025-12-17 21:44:31 -08:00
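A hypothetical client call against the new endpoint; the query-parameter names and response shape are assumptions drawn from the bullet points above, not documented fields:

```python
import requests  # assumed available; any HTTP client works

resp = requests.get(
    "http://127.0.0.1:8188/api/jobs",
    # Hypothetical filters: the changelog mentions filtering by workflow id
    # and sorting by execution duration, but not the exact parameter names.
    params={"workflow_id": "example", "sort_by": "execution_duration"},
)
resp.raise_for_status()
print(resp.json())  # queue + history jobs in one unified listing
```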
bf7dc63bd6 skip_load_model -> force_full_load (#11390)
This should be a bit more clear and less prone to potential breakage if the
logic of the load models changes a bit.
2025-12-17 23:29:32 -05:00
86dbb89fc9 Resolution bucketing and Trainer implementation refactoring (#11117) 2025-12-17 22:15:27 -05:00
ba6080bbab ComfyUI v0.5.1 2025-12-17 21:04:50 -05:00
16d85ea133 Better handle torch being imported by prestartup nodes. (#11383) 2025-12-17 19:43:18 -05:00
5d9ad0c6bf Fix the last step with non-zero sigma in sa_solver (#11380) 2025-12-17 13:57:40 -05:00
c08f97f344 fix regression in V3 nodes processing (#11375) 2025-12-17 10:24:25 -08:00
887143854b feat(api-nodes): add GPT-Image-1.5 (#11368) 2025-12-17 09:43:41 -08:00
3a5f239cb6 ComfyUI v0.5.0 2025-12-17 03:46:11 -05:00
827bb1512b Add exp_heun_2_x0 sampler series (#11360) 2025-12-16 23:35:43 -05:00
ffdd53b327 Check state dict key to auto enable the index_timestep_zero ref method. (#11362) 2025-12-16 17:03:17 -05:00
65e2103b09 feat(api-nodes): add Wan2.6 model to video nodes (#11357) 2025-12-16 13:51:48 -08:00
9304e47351 Update workflows for new release process (#11064)
* Update release workflows for branch process

* Adjust branch order in workflow triggers

* Revert changes in test workflows
2025-12-15 23:24:18 -08:00
bc606d7d64 Add a way to set the default ref method in the qwen image code. (#11349) 2025-12-16 01:26:55 -05:00
645ee1881e Inpainting for z image fun control. Use the ZImageFunControlnet node. (#11346)
image -> control image ex: pose
inpaint_image -> image for inpainting
mask -> inpaint mask
2025-12-15 23:38:12 -05:00
3d082c3206 bump comfyui-frontend-package to 1.34.9 (patch) (#11342) 2025-12-15 23:35:37 -05:00
683569de55 Only enable fp16 on ZImage on newer pytorch. (#11344) 2025-12-15 22:33:27 -05:00
ea2c117bc3 [BlockInfo] Wan (#10845)
* block info

* animate

* tensor

* device

* revert
2025-12-15 17:59:16 -08:00
fc4af86068 [BlockInfo] Lumina (#11227)
* block info

* device

* Make tensor int again

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
2025-12-15 17:57:28 -08:00
41bcf0619d Add code to detect if a z image fun controlnet is broken or not. (#11341) 2025-12-15 20:51:06 -05:00
d02d0e5744 [add] tripo3.0 (#10663)
* [add] tripo3.0

* [tripo] change parameter order

* change order

---------

Co-authored-by: liangd <liangding@vastai3d.com>
2025-12-15 17:38:46 -08:00
70541d4e77 Support the new qwen edit 2511 reference method. (#11340)
index_timestep_zero can be selected in the
FluxKontextMultiReferenceLatentMethod now with the display name set to the
more generic "Edit Model Reference Method" node.
2025-12-15 19:20:34 -05:00
77b2f7c228 Add context windows callback for custom cond handling (#11208)
Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com>
2025-12-15 16:06:32 -08:00
43e0d4e3cc comfy_api: remove usage of "Type","List" and "Dict" types (#11238) 2025-12-15 16:01:10 -08:00
dbd330454a feat(preview): add per-queue live preview method override (#11261)
- Add set_preview_method() to override live preview method per queue item
- Read extra_data.preview_method from /prompt request
- Support values: taesd, latent2rgb, none, auto, default
- "default" or unset uses server's CLI --preview-method setting
- Add 44 tests (37 unit + 7 E2E)
2025-12-15 15:57:39 -08:00
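A minimal request showing the per-queue override described above; the /prompt body shape follows the existing API, and the preview_method values are the ones listed in the commit:

```python
import requests  # assumed available; any HTTP client works

payload = {
    "prompt": {},  # your API-format workflow goes here
    # taesd | latent2rgb | none | auto | default ("default" uses the server's
    # --preview-method CLI setting)
    "extra_data": {"preview_method": "taesd"},
}
requests.post("http://127.0.0.1:8188/prompt", json=payload)
```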
33c7f1179d drop Pika API nodes (#11306) 2025-12-15 15:32:29 -08:00
af91eb6c99 api-nodes: drop Kling v1 model (#11307) 2025-12-15 15:30:24 -08:00
5cb1e0c9a0 Disable guards on transformer_options when torch.compile (#11317) 2025-12-15 16:49:29 -05:00
51347f9fb8 chore: update workflow templates to v0.7.59 (#11337) 2025-12-15 16:28:55 -05:00
a5e85017d8 bump manager requirments to the 4.0.3b5 (#11324) 2025-12-15 14:24:01 -05:00
5ac3b26a7d Update warning for old pytorch version. (#11319)
Versions below 2.4 are no longer supported. We will not break support on purpose but will not fix it if we do.
2025-12-14 04:02:50 -05:00
6592bffc60 seeds_2: add phi_2 variant and sampler node (#11309)
* Add phi_2 solver type to seeds_2

* Add sampler node of seeds_2
2025-12-14 00:03:29 -05:00
971cefe7d4 Fix pytorch warnings. (#11314) 2025-12-13 18:45:23 -05:00
da2bfb5b0a Basic implementation of z image fun control union 2.0 (#11304)
The inpaint part is currently missing and will be implemented later.

I think they messed up this model pretty bad. They added some
control_noise_refiner blocks but don't actually use them. There is a typo
in their code so instead of doing control_noise_refiner -> control_layers
it runs the whole control_layers twice.

Unfortunately they trained with this typo so the model works but is kind
of slow and would probably perform a lot better if they corrected their
code and trained it again.
2025-12-13 01:39:11 -05:00
c5a47a1692 Fix bias dtype issue in mixed ops. (#11293) 2025-12-12 11:49:35 -05:00
908fd7d749 feat(api-nodes): new TextToVideoWithAudio and ImageToVideoWithAudio nodes (#11267) 2025-12-12 00:18:31 -08:00
5495589db3 Respect the dtype the op was initialized in for non quant mixed op. (#11282) 2025-12-11 23:32:27 -05:00
982876d59a WanMove support (#11247) 2025-12-11 22:29:34 -05:00
338d9ae3bb Make portable updater work with repos in unmerged state. (#11281) 2025-12-11 18:56:33 -05:00
eeb020b9b7 Better chroma radiance and other models vram estimation. (#11278) 2025-12-11 17:33:09 -05:00
ae65433a60 This only works on radiance. (#11277) 2025-12-11 17:15:00 -05:00
fdebe18296 Fix regular chroma radiance (#11276) 2025-12-11 17:09:35 -05:00
f8321eb57b Adjust memory usage factor. (#11257) 2025-12-11 01:30:31 -05:00
93948e3fc5 feat(api-nodes): enable Kling Omni O1 node (#11229) 2025-12-10 22:11:12 -08:00
e711aaf1a7 Lower VAE loading requirements:Create a new branch for GPU memory calculations in qwen-image vae (#11199) 2025-12-10 22:02:26 -05:00
57ddb7fd13 Fix: filter hidden files from /internal/files endpoint (#11191) 2025-12-10 21:49:49 -05:00
17c92a9f28 Tweak Z Image memory estimation. (#11254) 2025-12-10 19:59:48 -05:00
36357bbcc3 process the NodeV1 dict results correctly (#11237) 2025-12-10 11:55:09 -08:00
f668c2e3c9 bump comfyui-frontend-package to 1.34.8 (#11220) 2025-12-09 22:27:07 -05:00
fc657f471a ComfyUI version v0.4.0
From now on ComfyUI will do version numbers a bit differently, every stable
off the master branch will increment the minor version. Anytime a fix needs
to be backported onto a stable version the patch version will be
incremented.

Example: We release v0.6.0 off the master branch then a day later a bug is
discovered and we decide to backport the fix onto the v0.6.0 stable, this
will be done in a separate branch in the main repository and this new
stable will be tagged v0.6.1
2025-12-09 18:26:49 -05:00
791e30ff50 Fix nan issue when quantizing fp16 tensor. (#11213) 2025-12-09 17:03:21 -05:00
e2a800e7ef Fix for HunyuanVideo1.5 meanflow distil (#11212) 2025-12-09 16:59:16 -05:00
9d252f3b70 ops: delete dead code (#11204)
This became dead code in https://github.com/comfyanonymous/ComfyUI/pull/11069
2025-12-09 00:55:13 -05:00
b9fb542703 add chroma-radiance-x0 mode (#11197) 2025-12-08 23:33:29 -05:00
cabc4d351f bump comfyui-frontend-package to 1.33.13 (patch) (#11200) 2025-12-08 23:22:02 -05:00
e136b6dbb0 dequantization offload accounting (fixes Flux2 OOMs - incl TEs) (#11171)
* make setattr safe for non-existent attributes

Handle the case where the attribute doesn't exist by returning a static
sentinel (distinct from None). If the sentinel is passed in as the set
value, delete the attr.

* Account for dequantization and type-casts in offload costs

When measuring the cost of offload, identify weights that need a type
change or dequantization and add the size of the conversion result
to the offload cost.

This is mutually exclusive with lowvram patches, which already have
a large conservative estimate and won't overlap the dequant cost, so
don't double count.

* Set the compute type on CLIP MPs

So that the loader can know the size of weights for dequant accounting.
2025-12-08 23:21:31 -05:00
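A sketch of the sentinel-based save/restore pattern the first bullet describes; the helper names are illustrative, not the actual ComfyUI functions:

```python
_MISSING = object()  # static sentinel, deliberately distinct from None


def get_attr_safe(obj, name):
    return getattr(obj, name, _MISSING)


def set_attr_safe(obj, name, value):
    if value is _MISSING:
        # Restoring a value that never existed: remove the attribute again.
        if hasattr(obj, name):
            delattr(obj, name)
    else:
        setattr(obj, name, value)


# Temporarily override an attribute and restore the exact prior state,
# including "the attribute did not exist at all".
class Cfg:
    pass

cfg = Cfg()
saved = get_attr_safe(cfg, "compute_dtype")      # _MISSING
set_attr_safe(cfg, "compute_dtype", "fp16")
set_attr_safe(cfg, "compute_dtype", saved)       # deletes it again
assert not hasattr(cfg, "compute_dtype")
```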
d50f342c90 Fix potential issue. (#11201) 2025-12-08 23:20:04 -05:00
3b0368aa34 Fix regression. (#11194) 2025-12-08 17:38:36 -05:00
935493f6c1 chore: update workflow templates to v0.7.54 (#11192) 2025-12-08 15:18:53 -05:00
60ee574748 retune lowVramPatch VRAM accounting (#11173)
In the lowvram case, this now does its math in the model dtype, in the
post-dequantization domain. Account for that. The patching was also
put back on the compute stream, getting it off-peak, so relax the
MATH_FACTOR to only x2 and get out of the worst-case assumption of
everything peaking at once.
2025-12-08 15:18:06 -05:00
8e889c535d Support "transformer." LoRA prefix for Z-Image (#11135) 2025-12-08 15:17:26 -05:00
fd271dedfd [API Nodes] add support for seedance-1-0-pro-fast model (#10947)
* feat(api-nodes): add support for seedance-1-0-pro-fast model

* feat(api-nodes): add support for seedream-4.5 model
2025-12-08 01:33:46 -08:00
c3c6313fc7 Added "system_prompt" input to Gemini nodes (#11177) 2025-12-08 01:28:17 -08:00
85c4b4ae26 chore: replace imports of deprecated V1 classes (#11127) 2025-12-08 01:27:02 -08:00
058f084371 Update workflow templates to v0.7.51 (#11150)
* chore: update workflow templates to v0.7.50

* Update template to 0.7.51
2025-12-08 01:22:51 -08:00
ec7f65187d chore(comfy_api): replace absolute imports with relative (#11145) 2025-12-08 01:21:41 -08:00
56fa7dbe38 Properly load the newbie diffusion model. (#11172)
There is still one of the text encoders missing and I didn't actually test it.
2025-12-07 07:44:55 -05:00
329480da5a Fix qwen scaled fp8 not working with kandinsky. Make basic t2i wf work. (#11162) 2025-12-06 17:50:10 -08:00
4086acf3c2 Fix on-load VRAM OOM (#11144)
Slow down the CPU on model load so it doesn't run ahead. This fixes a
VRAM OOM on flux 2 load.

I went to try and debug this with the memory trace pickles, which need
--disable-cuda-malloc, which made the bug go away. So I tried this
synchronize and it worked.

This has some very complex interactions with cuda malloc async and
I don't have a solid theory on this one yet.

Still debugging but this gets us over the OOM for the moment.
2025-12-06 18:42:09 -05:00
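The shape of the workaround described above, as a sketch: a device synchronize inside the weight-upload loop keeps the CPU from queueing far ahead of the allocator. The loop and names are illustrative; the real fix may sync at a coarser granularity.

```python
import torch


def load_weights_to_gpu(state_dict: dict, device: str = "cuda") -> dict:
    out = {}
    for name, w in state_dict.items():
        out[name] = w.to(device, non_blocking=True)
        # Without this, cudaMallocAsync can see a long queue of pending
        # uploads and transiently overshoot VRAM; syncing stops the CPU
        # from running ahead.
        torch.cuda.synchronize(device)
    return out
```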
50ca97e776 Speed up lora compute and lower memory usage by doing it in fp16. (#11161) 2025-12-06 18:36:20 -05:00
7ac7d69d94 Fix EmptyAudio node input types (#11149) 2025-12-06 10:09:44 -08:00
76f18e955d marked all Pika API nodes a deprecated (#11146) 2025-12-06 03:28:08 -08:00
d7a0aef650 Set OCL_SET_SVM_SIZE on AMD. (#11139) 2025-12-06 00:15:21 -05:00
913f86b727 [V3] convert nodes_mask.py to V3 schema (#10669)
* convert nodes_mask.py to V3 schema

* set "Preview Mask" as display name for MaskPreview
2025-12-05 20:24:10 -08:00
117bf3f2bd convert nodes_freelunch.py to the V3 schema (#10904) 2025-12-05 20:22:02 -08:00
ae676ed105 Fix regression. (#11137) 2025-12-05 23:01:19 -05:00
fd109325db Kandinsky5 model support (#10988)
* Add Kandinsky5 model support

lite and pro T2V tested to work

* Update kandinsky5.py

* Fix fp8

* Fix fp8_scaled text encoder

* Add transformer_options for attention

* Code cleanup, optimizations, use fp32 for all layers originally at fp32

* ImageToVideo -node

* Fix I2V, add necessary latent post process nodes

* Support text to image model

* Support block replace patches (SLG mostly)

* Support official LoRAs

* Don't scale RoPE for lite model as that just doesn't work...

* Update supported_models.py

* Revert RoPE scaling to a simpler one

* Fix typo

* Handle latent dim difference for image model in the VAE instead

* Add node to use different prompts for clip_l and qwen25_7b

* Reduce peak VRAM usage a bit

* Further reduce peak VRAM consumption by chunking ffn

* Update chunking

* Update memory_usage_factor

* Code cleanup, don't force the fp32 layers as it has minimal effect

* Allow for stronger changes with first frames normalization

Default values are too weak for any meaningful changes; these should probably be exposed as advanced node options when that's available.

* Add image model's own chat template, remove unused image2video template

* Remove hard error in ReplaceVideoLatentFrames -node

* Update kandinsky5.py

* Update supported_models.py

* Fix typos in prompt template

They were now fixed in the original repository as well

* Update ReplaceVideoLatentFrames

Add tooltips
Make source optional
Better handle negative index

* Rename NormalizeVideoLatentFrames -node

For bit better clarity what it does

* Fix NormalizeVideoLatentStart node out on non-op
2025-12-05 22:20:22 -05:00
bed12674a1 docs: add ComfyUI-Manager documentation and update to v4.0.3b4 (#11133)
- Add manager setup instructions and command line options to README
- Document --enable-manager, --enable-manager-legacy-ui, and
  --disable-manager-ui flags
- Bump comfyui_manager version from 4.0.3b3 to 4.0.3b4
2025-12-05 15:45:38 -08:00
092ee8a500 Fix some custom nodes. (#11134) 2025-12-05 18:25:31 -05:00
79d17ba233 Context windows fixes and features (#10975)
* Apply cond slice fix

* Add FreeNoise

* Update context_windows.py

* Add option to retain condition by indexes for each window

This allows for example Wan/HunyuanVideo image to video to "work" by using the initial start frame for each window, otherwise windows beyond first will be pure T2V generations.

* Update context_windows.py

* Allow splitting multiple conds into different windows

* Add handling for audio_embed

* whitespace

* Allow freenoise to work on other dims, handle 4D batch timestep

Refactor Freenoise function. And fix batch handling as timesteps seem to be expanded to batch size now.

* Disable experimental options for now

So that  the Freenoise and bugfixes can be merged first

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com>
2025-12-05 12:42:46 -08:00
6fd463aec9 Fix regression when text encoder loaded directly on GPU. (#11129) 2025-12-05 15:33:16 -05:00
43071e3de3 Make old scaled fp8 format use the new mixed quant ops system. (#11000) 2025-12-05 14:35:42 -05:00
0ec05b1481 Remove line made unnecessary (and wrong) after transformer_options was added to NextDiT's _forward definition (#11118) 2025-12-05 14:05:38 -05:00
35fa091340 Forgot to put this in README. (#11112) 2025-12-04 22:52:09 -05:00
3c8456223c [API Nodes]: fixes and refactor (#11104)
* chore(api-nodes): applied ruff's pyupgrade (python3.10) to the api-nodes client folder

* chore(api-nodes): add validate_video_frame_count function from LTX PR

* chore(api-nodes): replace deprecated V1 imports

* fix(api-nodes): the types returned by the "poll_op" function are now correct.
2025-12-04 14:05:28 -08:00
9bc893c5bb sd: bump HY1.5 VAE estimate (#11107)
I'm able to push VRAM above the estimate on partial unload. Bump the
estimate. This is experimentally determined with a 720P and a 480P
datapoint, calibrating for 24GB total VRAM.
2025-12-04 09:50:36 -08:00
f4bdf5f830 sd: revise hy VAE VRAM (#11105)
This was recently collapsed down to rolling the VAE through the temporal
dimension. Clamp the time dimension.
2025-12-04 09:50:04 -08:00
6be85c7920 mp: use look-ahead actuals for stream offload VRAM calculation (#11096)
TIL that the WAN TE has a 2GB weight followed by 16MB as the next size
down. This means that team 8GB VRAM would fully offload the TE in async
offload mode, as it just multiplied this giant size by the num streams.

Do the more complex logic of summing up the upcoming to-load weight
sizes to avoid triple counting this massive weight.

Partial unload does the converse, recording the NS most recent
unloads as they go.
2025-12-03 23:28:44 -05:00
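A sketch of the look-ahead accounting described above: reserve the sizes of the weights that can actually be in flight, not the largest weight multiplied by the stream count. Names and numbers are illustrative.

```python
def stream_offload_reserve(upcoming_weight_sizes: list[int], num_streams: int) -> int:
    """VRAM to reserve for async offload: the sum of the next `num_streams`
    weights in load order, rather than (largest weight x num_streams)."""
    return sum(upcoming_weight_sizes[:num_streams])


# A 2GB weight followed by 16MB weights (the WAN TE case above):
sizes = [2_000_000_000, 16_000_000, 16_000_000, 16_000_000]
print(stream_offload_reserve(sizes, 3))   # ~2.03GB instead of 3 x 2GB = 6GB
```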
ea17add3c6 Fix case where text encoders were running on the CPU instead of GPU. (#11095) 2025-12-03 23:15:15 -05:00
ecdc8697d5 Qwen Image Lora training fix from #11090 (#11094) 2025-12-03 22:49:28 -05:00
dce518c2b4 convert nodes_audio.py to V3 schema (#10798) 2025-12-03 17:35:04 -08:00
440268d394 convert nodes_load_3d.py to V3 schema (#10990) 2025-12-03 13:52:31 -08:00
87c104bfc1 add support for "@image" reference format in Kling Omni API nodes (#11082) 2025-12-03 08:55:44 -08:00
19f2192d69 fix(V3-Schema): use empty list defaults for Schema.inputs/outputs/hidden to avoid None issues (#11083) 2025-12-03 08:37:35 -08:00
519c941165 Prs/lora reservations (reduce massive Lora reservations especially on Flux2) (#11069)
* mp: only count the offload cost of math once

This was previously bundling the combined weight storage and computation
cost

* ops: put all post async transfer compute on the main stream

Some models have massive weights that need either complex
dequantization or lora patching. Don't do these patchings on the offload
stream; instead do them on the main stream to synchronize the
potentially large VRAM spikes for these compute processes. This avoids
having to assume a worst case scenario of multiple offload streams
all spiking VRAM in parallel with whatever the main stream is doing.
2025-12-03 02:28:45 -05:00
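A rough sketch of the stream split described above, with illustrative names: the raw weight copy rides the offload stream, while the VRAM-spiking dequantization or lora patching is queued on the main compute stream so only one such spike runs at a time.

import torch

def fetch_then_patch(weight_cpu, offload_stream, patch_fn):
    with torch.cuda.stream(offload_stream):
        weight = weight_cpu.to("cuda", non_blocking=True)   # transfer only
    main = torch.cuda.current_stream()
    main.wait_stream(offload_stream)                         # weight is ready
    return patch_fn(weight)                                  # spike happens on the main stream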
861817d22d Fix issue with portable updater. (#11070)
This should fix the problem with the portable updater not working with portables created from a separate branch on the repo.

This does not affect any current portables, which were all created on the master branch.
2025-12-03 00:47:51 -05:00
c120eee5ba Add MatchType, DynamicCombo, and Autogrow support to V3 Schema (#10832)
* Added output_matchtypes to generated json for v3, initial backend support for MatchType, created nodes_logic.py and added SwitchNode

* Fixed providing list of allowed_types

* Add workaround in validation.py for V3 Combo outputs not working as Combo inputs

* Make match type receive_type pass validation

* Also add MatchType check to input_type in validation - will likely trigger when connecting to non-lazy stuff

* Make sure this PR only has MatchType stuff

* Initial work on DynamicCombo

* Add get_dynamic function, not yet filled out correctly

* Mark Switch node as Beta

* Make sure other unfinished dynamic types are not accidentally used

* Send DynamicCombo.Option inputs in the same format as normal v1 inputs

* add dynamic combo test node

* Support validation of inputs and outputs

* Add missing input params to DynamicCombo.Input

* Add get_all function to inputs for id validation purposes

* Fix imports for v3 returning everything when doing io/ui/IO/UI instead of what is in __all__ of _io.py and _ui.py

* Modifying behavior of get_dynamic in V3 + serialization so can be used in execution code

* Fix v3 schema validation code after changes

* Refactor hidden_values for v3 in execution.py to be more general v3_data, add helper functions for dynamic behavior, preparing for restructuring dynamic type into object (not finished yet)

* Add nesting of inputs on DynamicCombo during execution

* Work with latest frontend commits

* Fix cringe arrows

* frontend will no longer namespace dynamic inputs widgets so reflect that in code, refactor build_nested_inputs

* Prepare Autogrow support for the love of the game

* satisfy ruff

* Create test nodes for Autogrow to collab with frontend development

* Add nested combo to DCTestNode

* Remove array support from build_nested_inputs, properly handle missing expected values

* Make execution.validate_inputs properly validate required dynamic inputs, renamed dynamic_data to dynamic_paths for clarity

* MatchType does not need any DynamicInput/Output features on backend; will increase compatibility with  dynamic types

* Probably need this for ruff check

* Change MatchType to have template be the first and only required param; output id's do nothing right now, so no need

* Fix merge regression with LatentUpscaleModel type not being put in __all__ for _io.py, fix invalid type hint for validate_inputs

* Make Switch node inputs optional, disallow both inputs from being missing, and still work properly with lazy; when one input is missing, use the other no matter what the switch is set to

* Satisfy ruff

* Move MatchType code above the types that inherit from DynamicInput

* Add DynamicSlot type, awaiting frontend support

* Make curr_prefix creation happen in Autogrow, move curr_prefix in DynamicCombo to only be created if input exists in live_inputs

* I was confused, fixing accidentally redundant curr_prefix addition in Autogrow

* Make sure Autogrow inputs are force_input = True when WidgetInput, fix runtime validation by removing original input from expected inputs, fix min/max bounds, change test nodes slightly

* Remove unnecessary id usage in Autogrow test node outputs

* Commented out Switch node + test nodes

* Remove commented out code from Autogrow

* Make TemplatePrefix max more clear, allow max == 1

* Replace all dict[str] with dict[str, Any]

* Renamed add_to_dict_live_inputs to expand_schema_for_dynamic

* Fixed typo in DynamicSlot input code

* note about live_inputs not being present soon in get_v1_info (internal function anyway)

* For now, hide DynamicCombo and Autogrow from public interface

* Removed comment
2025-12-03 00:17:13 -05:00
73f5649196 Implement temporal rolling VAE (Major VRAM reductions in Hunyuan and Kandinsky) (#10995)
* hunyuan upsampler: rework imports

Remove the transitive import of VideoConv3d and Resnet and takes these
from actual implementation source.

* model: remove unused give_pre_end

According to git grep, this is not used now, and was not used in the
initial commit that introduced it (see below).

This semantic is difficult to implement temporal roll VAE for (and would
defeat the purpose). Rather than implement the complex if, just delete
the unused feature.

(venv) rattus@rattus-box2:~/ComfyUI$ git log --oneline
220afe33 (HEAD) Initial commit.
(venv) rattus@rattus-box2:~/ComfyUI$ git grep give_pre
comfy/ldm/modules/diffusionmodules/model.py:                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
comfy/ldm/modules/diffusionmodules/model.py:        self.give_pre_end = give_pre_end
comfy/ldm/modules/diffusionmodules/model.py:        if self.give_pre_end:

(venv) rattus@rattus-box2:~/ComfyUI$ git co origin/master
Previous HEAD position was 220afe33 Initial commit.
HEAD is now at 9d8a8179 Enable async offloading by default on Nvidia. (#10953)
(venv) rattus@rattus-box2:~/ComfyUI$ git grep give_pre
comfy/ldm/modules/diffusionmodules/model.py:                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
comfy/ldm/modules/diffusionmodules/model.py:        self.give_pre_end = give_pre_end
comfy/ldm/modules/diffusionmodules/model.py:        if self.give_pre_end:

* move refiner VAE temporal roller to core

Move the carrying conv op to the common VAE code and give it a better
name. Roll the carry implementation logic for Resnet into the base
class and scrap the Hunyuan specific subclass.

* model: Add temporal roll to main VAE decoder

If there are no attention layers, it's a standard resnet and VideoConv3d
is asked for; substitute in the temporal rolling VAE algorithm. This
reduces VAE VRAM usage by the temporal dimension (can be huge VRAM savings).

* model: Add temporal roll to main VAE encoder

If there are no attention layers, it's a standard resnet and VideoConv3d
is asked for; substitute in the temporal rolling VAE algorithm. This
reduces VAE VRAM usage by the temporal dimension (can be huge VRAM savings).
2025-12-02 22:49:29 -05:00
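A simplified sketch of the temporal rolling idea (assuming a causal, zero-padded temporal conv with kernel size > 1; this is not the actual ops code): process the video a couple of latent frames at a time and carry the convolution seam between chunks instead of materialising the whole padded tensor.

import torch
import torch.nn.functional as F

def rolled_causal_conv3d(x, weight, bias=None, chunk=2):
    # x: [B, C, T, H, W]; weight: [O, C, kt, kh, kw] with kt > 1
    kt = weight.shape[2]
    carry = x.new_zeros(x.shape[0], x.shape[1], kt - 1, x.shape[3], x.shape[4])
    outs = []
    for t in range(0, x.shape[2], chunk):
        piece = torch.cat([carry, x[:, :, t:t + chunk]], dim=2)
        outs.append(F.conv3d(piece, weight, bias,
                             padding=(0, weight.shape[3] // 2, weight.shape[4] // 2)))
        carry = piece[:, :, -(kt - 1):]   # FIFO seam for the next chunk
    return torch.cat(outs, dim=2)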
3f512f5659 Added PATCH method to CORS headers (#11066)
Added the PATCH HTTP method to the Access-Control-Allow-Methods header because there are now PATCH endpoints exposed in the API.

See 277237ccc1/api_server/routes/internal/internal_routes.py (L34) for an example of an API endpoint that uses the PATCH method.
2025-12-02 22:29:27 -05:00
b94d394a64 Support Z Image alibaba pai fun controlnets. (#11062)
These are not actual controlnets, so they go in the models/model_patches
folder; use the ModelPatchLoader + QwenImageDiffsynthControlnet nodes to
use them.
2025-12-02 21:38:31 -05:00
277237ccc1 attention: use flag based OOM fallback (#11038)
Exceptions ref all local variables for the lifetime of the exception
context. Just set a flag and branch on it afterwards, dumping the exception before
falling back.
2025-12-02 17:24:19 -05:00
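A minimal sketch of the flag pattern (names are illustrative): leaving the except block before retrying lets Python drop the exception context, which otherwise keeps every local tensor of the failed attention call alive.

import torch

def attention_with_fallback(q, k, v, fast_attention, safe_attention):
    oom = False
    try:
        return fast_attention(q, k, v)
    except torch.cuda.OutOfMemoryError:
        oom = True                    # record only a flag, no tensors
    if oom:
        # the exception (and the VRAM it pinned) is gone before the retry
        return safe_attention(q, k, v)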
daaceac769 Hack to make zimage work in fp16. (#11057) 2025-12-02 17:11:58 -05:00
33d6aec3b7 add check for the format arg type in VideoFromComponents.save_to function (#11046)
* add check for the format var type in VideoFromComponents.save_to function

* convert "format" to VideoContainer enum
2025-12-02 11:50:13 -08:00
44baa0b7f3 Fix CODEOWNERS formatting to have all on the same line, otherwise only last line applies (#11053) 2025-12-02 11:46:29 -08:00
a17cf1c387 Add @guill as a code owner (#11031) 2025-12-01 22:40:44 -05:00
b4a20acc54 feat: Support ComfyUI-Manager for pip version (#7555) 2025-12-01 22:32:52 -05:00
c55dc857d5 bump comfyui-frontend-package to 1.33.10 (#11028) 2025-12-01 20:56:38 -05:00
878db3a727 Implement the Ovis image model. (#11030) 2025-12-01 20:56:17 -05:00
30c259cac8 ComfyUI version v0.3.76 2025-12-01 20:25:35 -05:00
1cb7e22a95 [API Nodes] add Kling O1 model support (#11025)
* feat(api-nodes): add Kling O1 model support

* fix: increase max allowed duration to 10.05 seconds

* fix(VideoInput): respect "format" argument
2025-12-01 16:11:52 -08:00
2640acb31c Update qwen tokenizer to add qwen 3 tokens. (#11029)
Doesn't actually change anything for current workflows because none of the
current models have a template with the think tokens.
2025-12-01 17:13:48 -05:00
7dbd5dfe91 bump comfyui-frontend-package to 1.32.10 (#11018) 2025-12-01 13:27:17 -05:00
f8b981ae9a Next AMD portable will have pytorch with ROCm 7.1.1 (#11002) 2025-11-30 04:21:31 -05:00
4967f81778 update template to 0.7.25 (#10996)
* update template to 0.7.24

* Update template to 0.7.25
2025-11-29 18:07:26 -08:00
0a6746898d Make the ScaleRope node work on Z Image and Lumina. (#10994) 2025-11-29 18:00:55 -05:00
5151cff293 Add some missing z image lora layers. (#10980) 2025-11-28 23:55:00 -05:00
af96d9812d feat(security): add System User protection with __ prefix (#10966)
* feat(security): add System User protection with `__` prefix

Add protected namespace for custom nodes to store sensitive data
(API keys, licenses) that cannot be accessed via HTTP endpoints.

Key changes:
- New API: get_system_user_directory() for internal access
- New API: get_public_user_directory() with structural blocking
- 3-layer defense: header validation, path blocking, creation prevention
- 54 tests covering security, edge cases, and backward compatibility

System Users use `__` prefix (e.g., __system, __cache) following
Python's private member convention. They exist in user_directory/
but are completely blocked from /userdata HTTP endpoints.

* style: remove unused imports
2025-11-28 21:28:42 -05:00
52a32e2b32 Support some z image lora formats. (#10978) 2025-11-28 21:12:42 -05:00
b907085709 Support video tiny VAEs (#10884)
* Support video tiny VAEs

* lighttaew scaling fix

* Also support video taes in previews

Only first frame for now as live preview playback is currently only available through VHS custom nodes.

* Support Wan 2.1 lightVAE

* Relocate elif block and set Wan VAE dim directly without using pruning rate for lightvae
2025-11-28 19:40:19 -05:00
065a2fbbec Update driver link in AMD portable README (#10974) 2025-11-28 19:37:39 -05:00
0ff0457892 mm: wrap the raw stream in context manager (#10958)
The documentation for torch.foo.Stream being usable with `with:` suggests
support starts at version 2.7. Use the old API for backwards compatibility.
2025-11-28 16:38:12 -05:00
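A short sketch of the compatibility point above (assuming a CUDA device is available): the torch.cuda.stream(...) context manager works on older torch releases, while using a raw torch.cuda.Stream directly as a `with` target is the newer API the commit avoids.

import torch

s = torch.cuda.Stream()

with torch.cuda.stream(s):      # older, widely supported API
    pass

# with s:                       # newer API, avoided here for compatibility
#     pass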
6484ac89dc fix QuantizedTensor.is_contiguous (#10956) (#10959) 2025-11-28 16:33:07 -05:00
f55c98a89f Disable offload stream when torch compile. (#10961) 2025-11-28 16:16:46 -05:00
ca7808f240 fix(user_manager): fix typo in move_userdata dest validation (#10967)
Check `dest` instead of `source` when validating destination path
in move_userdata endpoint.
2025-11-28 12:43:17 -08:00
52e778fff3 feat(Kling-API-Nodes): add v2-5-turbo model to FirstLastFrame node (#10938) 2025-11-28 02:52:59 -08:00
9d8a817985 Enable async offloading by default on Nvidia. (#10953)
Add --disable-async-offload to disable it.

If this causes OOMs that go away when you --disable-async-offload please
report it.
2025-11-27 17:46:12 -05:00
b59750a86a Update template to 0.7.23 (#10949) 2025-11-27 17:12:56 -05:00
3f382a4f98 quant ops: Dequantize weight in-place (#10935)
In flux2 these weights are huge (200MB). As plain_tensor is a throw-away
deep copy, do this multiplication in-place to save VRAM.
2025-11-27 08:06:30 -08:00
f17251bec6 Account for the VRAM cost of weight offloading (#10733)
* mm: default to 0 for NUM_STREAMS

Don't count the compute stream as an offload stream. This makes async
offload accounting easier.

* mm: remove 128MB minimum

This is from a previous offloading system requirement. Remove it to
make behaviour of the loader and partial unloader consistent.

* mp: order the module list by offload expense

Calculate an approximate temporary VRAM cost to offload a
weight and primarily order the module load list by that. In the simple
case this is just the same as the module weight, but with Loras, a
weight with a lora consumes considerably more VRAM to do the Lora
application on-the-fly.

This will slightly prioritize lora weights, but is really for
proper VRAM offload accounting.

* mp: Account for the VRAM cost of weight offloading

When checking the VRAM headroom, assume that the weight needs to be
offloaded, and only load if there is space for both the load and the
offload cost times the number of streams.

As the weights are ordered from largest to smallest by offload cost
this is guaranteed to fit in VRAM (tm), as all weights that follow
will be smaller.

Make the partial unload aware of this system as well by saving the
budget for offload VRAM to the model state and accounting accordingly.
It's possible that partial unload increases the size of the largest
offloaded weights, and thus needs to unload a little bit more than
asked to accommodate the bigger temp buffers.

Honor the existing code's floor on model weight loading of 128MB by
having the patcher honor this separately without regard to offloading.
Otherwise when MM specifies its 128MB minimum, MP will see the biggest
weights, budget that 128MB to only the offload buffer, and load nothing,
which isn't the intent of these minimums. The same clamp applies in
case of partial offload of the currently loading model.
2025-11-27 01:03:03 -05:00
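A hypothetical sketch of the headroom rule described above: a weight is only loaded if there is room for the weight itself plus num_streams offload buffers sized by its offload cost; because weights are visited largest-offload-cost first, everything that follows fits within the reserved budget.

def can_load(weight_bytes, offload_cost_bytes, free_vram_bytes, num_streams):
    needed = weight_bytes + offload_cost_bytes * num_streams
    return free_vram_bytes >= needed

# A 200MB weight with a 300MB offload cost (e.g. a lora applied on the fly)
# and 2 streams needs 800MB of headroom before it is loaded.
print(can_load(200 * 2**20, 300 * 2**20, 1024 * 2**20, num_streams=2))  # True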
c38e7d6599 block info (#10841) 2025-11-26 20:28:44 -08:00
eaf68c9b5b Make lora training work on Z Image and remove some redundant nodes. (#10927) 2025-11-26 19:25:32 -05:00
cc6a8dcd1a Dataset Processing Nodes and Improved LoRA Trainer Nodes with multi resolution supports. (#10708)
* Create nodes_dataset.py

* Add encoded dataset caching mechanism

* make training node to work with our dataset system

* allow trainer node to get different resolution dataset

* move all dataset related implementation to nodes_dataset

* Rewrite dataset system with new io schema

* Rewrite training system with new io schema

* add ui pbar

* Add outputs' id/name

* Fix bad id/naming

* use single process instead of input list when no need

* fix wrong output_list flag

* use torch.load/save and fix bad behaviors
2025-11-26 19:18:08 -05:00
a2d60aad0f convert nodes_customer_sampler.py to V3 schema (#10206) 2025-11-26 14:55:31 -08:00
d8433c63fd chore(api-nodes): remove chat widgets from OpenAI/Gemini nodes (#10861) 2025-11-26 14:42:01 -08:00
dd41b74549 Add Z Image to readme. (#10924) 2025-11-26 15:36:38 -05:00
55f654db3d Fix the CSP offline feature. (#10923) 2025-11-26 15:16:40 -05:00
58c6ed541d Merge 3d animation node (#10025) 2025-11-26 14:58:27 -05:00
234c3dc85f Bump frontend to 1.32.9 (#10867) 2025-11-26 14:58:08 -05:00
8908ee2628 fix(gemini): use first 10 images as fileData (URLs) and remaining images as inline base64 (#10918) 2025-11-26 10:38:30 -08:00
1105e0d139 improve UX for batch uploads in upload_images_to_comfyapi (#10913) 2025-11-26 09:23:14 -08:00
8938aa3f30 add Veo3 First-Last-Frame node (#10878) 2025-11-26 09:14:02 -08:00
f16219e3aa Add cheap latent preview for flux 2. (#10907)
Thank you to the person who calculated them. You saved me a percent of my
time.
2025-11-26 04:00:43 -05:00
8402c8700a ComfyUI version v0.3.75 2025-11-26 02:41:13 -05:00
58b8574661 Fix Flux2 reference image mem estimation. (#10905) 2025-11-26 02:36:19 -05:00
90b3995ec8 ComfyUI v0.3.74 2025-11-26 00:34:15 -05:00
bdb10a583f Fix loras not working on mixed fp8. (#10899) 2025-11-26 00:07:58 -05:00
0e24dbb19f Adjustments to Z Image. (#10893) 2025-11-25 19:02:51 -05:00
e9aae31fa2 Z Image model. (#10892) 2025-11-25 18:41:45 -05:00
0c18842acb ComfyUI v0.3.73 2025-11-25 14:59:37 -05:00
d196a905bb Lower vram usage for flux 2 text encoder. (#10887) 2025-11-25 14:58:39 -05:00
18b79acba9 Update workflow templates to v0.7.20 (#10883) 2025-11-25 14:58:21 -05:00
dff996ca39 Fix crash. (#10885) 2025-11-25 14:30:24 -05:00
828b1b9953 ComfyUI version v0.3.72 2025-11-25 12:40:58 -05:00
af81cb962d Add Flux 2 support to README. (#10882) 2025-11-25 11:40:32 -05:00
5c7b08ca58 [API Nodes] add Flux.2 Pro node (#10880) 2025-11-25 11:09:07 -05:00
6b573ae0cb Flux 2 (#10879) 2025-11-25 10:50:19 -05:00
015a0599d0 I found a case where this is needed (#10875) 2025-11-25 03:23:19 -05:00
acfaa5c4a1 Don't try fp8 matrix mult in quantized ops if not supported by hardware. (#10874) 2025-11-25 02:55:49 -05:00
b6805429b9 Allow pinning quantized tensors. (#10873) 2025-11-25 02:48:20 -05:00
25022e0b09 Cleanup and fix issues with text encoder quants. (#10872) 2025-11-25 01:48:53 -05:00
22a2644e57 Bump transformers version in requirements.txt (#10869) 2025-11-24 19:45:54 -05:00
b2ef58e2b1 block info (#10844) 2025-11-24 10:40:09 -08:00
6a6d456c88 block info (#10842) 2025-11-24 10:38:38 -08:00
3d1fdaf9f4 block info (#10843) 2025-11-24 10:30:40 -08:00
1286fcfe40 add get_frame_count and get_frame_rate methods to VideoInput class (#10851) 2025-11-24 10:24:29 -08:00
3bd71554a2 fix(api-nodes): edge cases in responses for Gemini models (#10860) 2025-11-24 09:48:37 -08:00
f66183a541 [fix] Fixes non-async public API access (#10857)
It looks like the synchronous version of the public API broke due to an
addition of `from __future__ import annotations`. This change updates
the async-to-sync adapter to work with both types of type annotations.
2025-11-23 22:56:20 -08:00
cbd68e3d58 Add better error message for common error. (#10846) 2025-11-23 04:55:22 -05:00
d89c29f259 Add display names to Hunyuan latent video nodes. (#10837) 2025-11-22 22:51:53 -05:00
a9c35256bc Update requirements.txt (#10834) 2025-11-22 02:28:29 -08:00
532938b16b --disable-api-nodes now sets CSP header to force frontend offline. (#10829) 2025-11-21 17:51:55 -05:00
ecb683b057 update frontend to 1.30 (#10793) 2025-11-21 16:34:47 -05:00
c55fd74816 ComfyUI 0.3.71 2025-11-21 00:49:13 -05:00
3398123752 Fix wrong path. (#10821) 2025-11-20 23:39:37 -05:00
943b3b615d HunyuanVideo 1.5 (#10819)
* init

* update

* Update model.py

* Update model.py

* remove print

* Fix text encoding

* Prevent empty negative prompt

Really doesn't work otherwise

* fp16 works

* I2V

* Update model_base.py

* Update nodes_hunyuan.py

* Better latent rgb factors

* Use the correct sigclip output...

* Support HunyuanVideo1.5 SR model

* whitespaces...

* Proper latent channel count

* SR model fixes

This also still needs timestep scheduling based on the noise scale; it can already be used with two samplers.

* vae_refiner: roll the convolution through temporal

Work in progress.

Roll the convolution through time using 2-latent-frame chunks and a
FIFO queue for the convolution seams.

* Support HunyuanVideo15 latent resampler

* fix

* Some cleanup

Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>

* Proper hyvid15 I2V channels

Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>

* Fix TokenRefiner for fp16

Otherwise x.sum has infs. Just in case, only casting if the input is fp16; I don't know if it's necessary.

* Bugfix for the HunyuanVideo15 SR model

* vae_refiner: roll the convolution through temporal II

Roll the convolution through time using 2-latent-frame chunks and a
FIFO queue for the convolution seams.

Added support for encoder, lowered to 1 latent frame to save more
VRAM, made work for Hunyuan Image 3.0 (as code shared).

Fixed names, cleaned up code.

* Allow any number of input frames in VAE.

* Better VAE encode mem estimation.

* Lowvram fix.

* Fix hunyuan image 2.1 refiner.

* Fix mistake.

* Name changes.

* Rename.

* Whitespace.

* Fix.

* Fix.

---------

Co-authored-by: kijai <40791699+kijai@users.noreply.github.com>
Co-authored-by: Rattus <rattus128@gmail.com>
2025-11-20 22:44:43 -05:00
10e90a5757 bump comfyui-workflow-templates for nano banana 2 (#10818)
* bump templates

* bump templates
2025-11-20 18:20:52 -08:00
b75d349f25 fix(KlingLipSyncAudioToVideoNode): convert audio to mp3 format (#10811) 2025-11-20 16:33:54 -08:00
7b8389578e feat(api-nodes): add Nano Banana Pro (#10814)
* feat(api-nodes): add Nano Banana Pro

* frontend bump to 1.28.9
2025-11-20 16:17:47 -08:00
9e00ce5b76 Make Batch Images node add alpha channel when one of the inputs has it (#10816)
* When one Batch Image input has alpha and one does not, add empty alpha channel

* Use torch.nn.functional.pad
2025-11-20 17:42:46 -05:00
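A small sketch of the padding approach mentioned above, assuming ComfyUI's [batch, height, width, channels] IMAGE layout: give the RGB input an opaque alpha channel so it can be concatenated with an RGBA input.

import torch
import torch.nn.functional as F

rgb = torch.rand(1, 64, 64, 3)
rgba = torch.rand(1, 64, 64, 4)
rgb = F.pad(rgb, (0, 1), value=1.0)    # pad the channel dim: 3 -> 4, alpha = 1.0
batch = torch.cat([rgba, rgb], dim=0)
print(batch.shape)                      # torch.Size([2, 64, 64, 4])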
f5e66d5e47 Fix ImageBatch with different channel count. (#10815) 2025-11-20 15:08:03 -05:00
87b0359392 Update server templates handler to use new multi-package distribution (comfyui-workflow-templates versions >=0.3) (#10791)
* update templates for monorepo

* refactor
2025-11-19 22:36:56 -08:00
cb96d4d18c Disable workaround on newer cudnn. (#10807) 2025-11-19 23:56:23 -05:00
394348f5ca feat(api-nodes): add Topaz API nodes (#10755) 2025-11-19 17:44:04 -08:00
7601e89255 Fix workflow name. (#10806) 2025-11-19 20:17:15 -05:00
6a1d3a1ae1 convert hunyuan3d.py to V3 schema (#10664) 2025-11-19 14:49:01 -08:00
65ee24c978 change display name of PreviewAny node to "Preview as Text" (#10796) 2025-11-19 01:25:28 -08:00
17027f2a6a Add a way to disable the final norm in the llama based TE models. (#10794) 2025-11-18 22:36:03 -05:00
b5c8be8b1d ComfyUI 0.3.70 2025-11-18 19:37:20 -05:00
24fdb92edf feat(api-nodes): add new Gemini model (#10789) 2025-11-18 14:26:44 -08:00
d526974576 Fix hunyuan 3d 2.0 (#10792) 2025-11-18 16:46:19 -05:00
e1ab6bb394 EasyCache: Fix for mismatch in input/output channels with some models (#10788)
Slices the model input down to the output channels so the caching tracks only the noise channels; resolves a channel mismatch with models like WanVideo I2V

Also fix for slicing deprecation in pytorch 2.9
2025-11-18 07:00:21 -08:00
048f49adbd chore(api-nodes): adjusted PR template; set min python version for pylint to 3.10 (#10787) 2025-11-18 03:59:27 -08:00
47bfd5a33f Native block swap custom nodes considered harmful. (#10783) 2025-11-18 00:26:44 -05:00
fdf49a2861 Fix the portable download link for CUDA 12.6 (#10780) 2025-11-17 22:04:06 -05:00
f41e5f398d Update README with new portable download link (#10778) 2025-11-17 19:59:19 -05:00
27cbac865e Add release workflow for NVIDIA cu126 (#10777) 2025-11-17 19:04:04 -05:00
3d0003c24c ComfyUI version 0.3.69 2025-11-17 17:17:24 -05:00
7d6103325e Change ROCm nightly install command to 7.1 (#10764) 2025-11-16 03:01:14 -05:00
2d4a08b717 Revert "chore(api-nodes): mark OpenAIDalle2 and OpenAIDalle3 nodes as deprecated (#10757)" (#10759)
This reverts commit 9a02382568.
2025-11-15 12:37:34 -08:00
9a02382568 chore(api-nodes): mark OpenAIDalle2 and OpenAIDalle3 nodes as deprecated (#10757) 2025-11-15 11:18:49 -08:00
bd01d9f7fd Add left padding support to tokenizers. (#10753) 2025-11-15 06:54:40 -05:00
443056c401 Fix custom nodes import error. (#10747)
This should fix the import errors but will break if the custom nodes actually try to use the class.
2025-11-14 03:26:05 -05:00
f60923590c Use same code for chroma and flux blocks so that optimizations are shared. (#10746) 2025-11-14 01:28:05 -05:00
1ef328c007 Better instructions for the portable. (#10743) 2025-11-13 21:32:39 -05:00
94c298f962 flux: reduce VRAM usage (#10737)
Clean up a bunch of stacked tensors on Flux. This takes me from B=19 to B=22
for 1600x1600 on an RTX 5090.
2025-11-13 16:02:03 -08:00
2fde9597f4 feat: add create_time dict to prompt field in /history and /queue (#10741) 2025-11-13 15:11:52 -08:00
f91078b1ff add PR template for API-Nodes (#10736) 2025-11-13 10:05:26 -08:00
3b3ef9a77a Quantized Ops fixes (#10715)
* offload support, bug fixes, remove mixins

* add readme
2025-11-12 18:26:52 -05:00
8b0b93df51 Update Python 3.14 compatibility notes in README (#10730) 2025-11-12 17:04:41 -05:00
1c7eaeca10 qwen: reduce VRAM usage (#10725)
Clean up a bunch of stacked and no-longer-needed tensors on the QWEN
VRAM peak (currently FFN).

With this I go from OOMing at B=37x1328x1328 to being able to
successfully run B=47 (RTX 5090).
2025-11-12 16:20:53 -05:00
18e7d6dba5 mm/mp: always unload re-used but modified models (#10724)
The partial unloader path in the model re-use flow skips straight to the
actual unload without any check of the patching UUID. This means that
if you do an upscale flow with a model patch on an existing model, it
will not apply your patches.

Fix by delaying the partial_unload until after the uuid checks. This
is done by making partial_unload a mode of partial_load where extra_mem
is negative.
2025-11-12 16:19:53 -05:00
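A hypothetical sketch of the refactor described above (not the actual ModelPatcher code): partial_unload becomes a thin wrapper over partial_load with a negative memory delta, so the patching-UUID check naturally runs before anything is unloaded.

class LoadedModelSketch:
    def partial_load(self, extra_mem):
        # 1. re-apply patches if the patching UUID changed
        # 2. then load (extra_mem > 0) or unload (extra_mem < 0) weights to fit
        ...

    def partial_unload(self, mem_to_free):
        return self.partial_load(-mem_to_free)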
e1d85e7577 Update README.md for Intel Arc GPU installation, remove IPEX (#10729)
IPEX is no longer needed for Intel Arc GPUs. Removing the instructions to set up IPEX.
2025-11-12 15:21:05 -05:00
1199411747 Don't pin tensor if not a torch.nn.parameter.Parameter (#10718) 2025-11-11 19:33:30 -05:00
5ebcab3c7d Update CI workflow to remove dead macOS runner. (#10704)
* Update CI workflow to remove dead macOS runner.

* revert

* revert
2025-11-10 15:35:29 -05:00
c350009236 ops: Put weight cast on the offload stream (#10697)
This needs to be on the offload stream. The issue reproduced as a black screen
with low resolution images on a slow bus when using FP8.
2025-11-09 22:52:11 -05:00
dea899f221 Unload weights if vram usage goes up between runs. (#10690) 2025-11-09 18:51:33 -05:00
e632e5de28 Add logging for model unloading. (#10692) 2025-11-09 18:06:39 -05:00
2abd2b5c20 Make ScaleROPE node work on Flux. (#10686) 2025-11-08 15:52:02 -05:00
a1a70362ca Only unpin tensor if it was pinned by ComfyUI (#10677) 2025-11-07 11:15:05 -05:00
cf97b033ee mm: guard against double pin and unpin explicitly (#10672)
As commented, if you let CUDA be the one to detect double pinning/unpinning
it actually creates an async GPU error.
2025-11-06 21:20:48 -05:00
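A hypothetical sketch of the guard idea (the actual pinning backend call is abstracted away as do_pin/do_unpin): track what was pinned by us so a double pin, or an unpin of memory we never pinned, is rejected in Python instead of surfacing later as an async GPU error.

_pinned_ptrs = set()

def guarded_pin(tensor, do_pin):
    ptr = tensor.data_ptr()
    if ptr in _pinned_ptrs:
        return False              # already pinned by us, skip
    do_pin(tensor)
    _pinned_ptrs.add(ptr)
    return True

def guarded_unpin(tensor, do_unpin):
    ptr = tensor.data_ptr()
    if ptr not in _pinned_ptrs:
        return False              # not pinned by ComfyUI, leave it alone
    do_unpin(tensor)
    _pinned_ptrs.discard(ptr)
    return True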
eb1c42f649 Tell users they need to upload their logs in bug reports. (#10671) 2025-11-06 20:24:28 -05:00
e05c907126 Clarify release cycle. (#10667) 2025-11-06 04:11:30 -05:00
09dc24c8a9 Pinned mem also seems to work on AMD. (#10658) 2025-11-05 19:11:15 -05:00
1d69245981 Enable pinned memory by default on Nvidia. (#10656)
Removed the --fast pinned_memory flag.

You can use --disable-pinned-memory to disable it. Please report if it
causes any issues.
2025-11-05 18:08:13 -05:00
97f198e421 Fix qwen controlnet regression. (#10657) 2025-11-05 18:07:35 -05:00
bda0eb2448 feat(API-nodes): move Rodin3D nodes to new client; removed old api client.py (#10645) 2025-11-05 02:16:00 -08:00
c4a6b389de Lower ltxv mem usage to what it was before previous pr. (#10643)
Bring back qwen behavior to what it was before previous pr.
2025-11-04 22:47:35 -05:00
4cd881866b Use single apply_rope function across models (#10547) 2025-11-04 20:10:11 -05:00
265adad858 ComfyUI version v0.3.68 2025-11-04 19:42:23 -05:00
7f3e4d486c Limit amount of pinned memory on windows to prevent issues. (#10638) 2025-11-04 17:37:50 -05:00
a389ee01bb caching: Handle None outputs tuple case (#10637) 2025-11-04 14:14:10 -08:00
9c71a66790 chore: update workflow templates to v0.2.11 (#10634) 2025-11-04 10:51:53 -08:00
af4b7b5edb More fp8 torch.compile regressions fixed. (#10625) 2025-11-03 22:14:20 -05:00
0f4ef3afa0 This seems to slow things down slightly on Linux. (#10624) 2025-11-03 21:47:14 -05:00
6b88478f9f Bring back fp8 torch compile performance to what it should be. (#10622) 2025-11-03 19:22:10 -05:00
e199c8cc67 Fixes (#10621) 2025-11-03 17:58:24 -05:00
0652cb8e2d Speed up torch.compile (#10620) 2025-11-03 17:37:12 -05:00
958a17199a People should update their pytorch versions. (#10618) 2025-11-03 17:08:30 -05:00
e974e554ca chore: update embedded docs to v0.3.1 (#10614) 2025-11-03 10:59:44 -08:00
4e2110c794 feat(Pika-API-nodes): use new API client (#10608) 2025-11-03 00:29:08 -08:00
e617cddf24 convert nodes_openai.py to V3 schema (#10604) 2025-11-03 00:28:13 -08:00
1f3f7a2823 convert nodes_hypernetwork.py to V3 schema (#10583) 2025-11-03 00:21:47 -08:00
88df172790 fix(caching): treat bytes as hashable (#10567) 2025-11-03 00:16:40 -08:00
6d6a18b0b7 fix(api-nodes-cloud): stop using sub-folder and absolute path for output of Rodin3D nodes (#10556) 2025-11-03 00:04:56 -08:00
97ff9fae7e Clarify help text for --fast argument (#10609)
Updated help text for the --fast argument to clarify potential risks.
2025-11-02 13:14:04 -05:00
135fa49ec2 Small speed improvements to --async-offload (#10593)
* ops: dont take an offload stream if you dont need one

* ops: prioritize mem transfer

The async offload stream's reason for existence is to transfer from
RAM to GPU. The post-processing compute steps are a bonus on the side
stream, but if the compute stream is running a long kernel, it can
stall the side stream as it waits to type-cast the bias before
transferring the weight. So do a pure transfer of the weight straight up,
then do everything bias-related, then go back to fix the weight type and do
the weight patches.
2025-11-01 18:48:53 -04:00
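A rough sketch of the reordering described above, with illustrative names: ship the weight bytes to the GPU untouched first, then handle the small bias, and only afterwards fix the weight's dtype and apply patches, so a long compute kernel cannot delay the big RAM-to-VRAM transfer.

import torch

def load_weight_prioritized(weight_cpu, bias_cpu, offload_stream, dtype):
    with torch.cuda.stream(offload_stream):
        weight = weight_cpu.to("cuda", non_blocking=True)            # 1. pure transfer
        bias = None
        if bias_cpu is not None:
            bias = bias_cpu.to("cuda", dtype, non_blocking=True)     # 2. everything bias
        weight = weight.to(dtype)                                    # 3. fix weight type
        # 4. weight patches (loras, etc.) would follow here
    return weight, bias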
44869ff786 Fix issue with pinned memory. (#10597) 2025-11-01 17:25:59 -04:00
20182a393f convert StabilityAI to use new API client (#10582) 2025-11-01 12:14:06 -07:00
5f109fe6a0 added 12s-20s as available output durations for the LTXV API nodes (#10570) 2025-11-01 12:13:39 -07:00
c58c13b2ba Fix torch compile regression on fp8 ops. (#10580) 2025-11-01 00:25:17 -04:00
7f374e42c8 ScaleROPE now works on Lumina models. (#10578) 2025-10-31 15:41:40 -04:00
27d1bd8829 Fix rope scaling. (#10560) 2025-10-30 22:51:58 -04:00
614cf9805e Add a ScaleROPE node. Currently only works on WAN models. (#10559) 2025-10-30 22:11:38 -04:00
513b0c46fb Add RAM Pressure cache mode (#10454)
* execution: Roll the UI cache into the outputs

Currently the UI cache is parallel to the output cache with
expectations of being a content superset of the output cache.
At the same time the UI and output cache are maintained completely
separately, making it awkward to free the output cache content without
changing the behaviour of the UI cache.

There are two actual users (getters) of the UI cache. The first is
the case of a direct content hit on the output cache when executing a
node. This case is very naturally handled by merging the UI and outputs
cache.

The second case is the history JSON generation at the end of the prompt.
This currently works by asking the cache for all_node_ids and then
pulling the cache contents for those nodes. all_node_ids is the nodes
of the dynamic prompt.

So fold the UI cache into the output cache. The current UI cache setter
now writes to a prompt-scope dict. When the output cache is set, just
get this value from the dict and tuple up with the outputs.

When generating the history, simply iterate prompt-scope dict.

This prepares support for more complex caching strategies (like RAM
pressure caching) where less than 1 workflow will be cached and it
will be desirable to keep the UI cache and output cache in sync.

* sd: Implement RAM getter for VAE

* model_patcher: Implement RAM getter for ModelPatcher

* sd: Implement RAM getter for CLIP

* Implement RAM Pressure cache

Implement a cache sensitive to RAM pressure. When RAM headroom drops
down below a certain threshold, evict RAM-expensive nodes from the
cache.

Models and tensors are measured directly for RAM usage. An OOM score
is then computed based on the RAM usage of the node.

Note that due to indirection through shared objects (like a model
patcher), multiple nodes can account the same RAM as their individual
usage. The intent is that this will free chains of nodes, particularly
model loaders and associated loras, as they all score similarly and are
sorted close to each other.

Has a bias towards unloading model nodes mid flow while being able
to keep results like text encodings and VAE.

* execution: Convert the cache entry to NamedTuple

As commented in review.

Convert this to a named tuple and abstract away the tuple type
completely from graph.py.
2025-10-30 17:39:02 -04:00
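A hypothetical sketch of the eviction loop (the scoring in the real cache is more involved, biased toward unloading model nodes while keeping text encodings and VAE results): when available RAM drops below a threshold, drop the most RAM-expensive cached node outputs first.

import psutil

def evict_until_headroom(cache, min_free_bytes):
    # cache: dict of node_id -> (ram_bytes, outputs), with ram_bytes measured
    # from the tensors / models the entry holds
    for node_id in sorted(cache, key=lambda k: cache[k][0], reverse=True):
        if psutil.virtual_memory().available >= min_free_bytes:
            break
        del cache[node_id]        # most expensive entries go first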
dfac94695b fix img2img operation in Dall2 node (#10552) 2025-10-30 10:22:35 -07:00
163b629c70 use new API client in Pixverse and Ideogram nodes (#10543) 2025-10-29 23:49:03 -07:00
998bf60beb Add units/info for the numbers displayed on 'load completely' and 'load partially' log messages (#10538) 2025-10-29 19:37:06 -04:00
906c089957 Fix small performance regression with fp8 fast and scaled fp8. (#10537) 2025-10-29 19:29:01 -04:00
25de7b1bfa Try to fix slow load issue on low ram hardware with pinned mem. (#10536) 2025-10-29 17:20:27 -04:00
ab7ab5be23 Fix Race condition in --async-offload that can cause corruption (#10501)
* mm: factor out the current stream getter

Make this a reusable function.

* ops: sync the offload stream with the consumption of w&b

This sync is necessary as pytorch will queue cuda async frees on the
same stream that created the tensor. In the case of async offload, this
will be on the offload stream.

Weights and biases can go out of scope in python, which then
triggers the pytorch garbage collector to queue the free operation on
the offload stream, possibly before the compute stream has used the
weight. This causes a use-after-free on weight data, leading to total
corruption of some workflows.

So sync the offload stream with the compute stream after the weight
has been used so the free has to wait for the weight to be used.

cast_bias_weight is extended in a backwards compatible way, with
the new behaviour opt-in via a defaulted parameter. This handles
custom node packs calling cast_bias_weight by disabling
async-offload for them (as they do not handle the race).

The pattern is now:

cast_bias_weight(... , offloadable=True) #This might be offloaded
thing(weight, bias, ...)
uncast_bias_weight(...)

* controlnet: adopt new cast_bias_weight synchronization scheme

This is necessary for safe async weight offloading.

* mm: sync the last stream in the queue, not the next

Currently this peeks ahead to sync the next stream in the queue of
streams with the compute stream. This doesn't allow a lot of
parallelization, as the end result is you can only get one weight load
ahead regardless of how many streams you have.

Rotate the loop logic here to synchronize the end of the queue before
returning the next stream. This allows weights to be loaded ahead of the
compute stream's position.
2025-10-29 17:17:46 -04:00
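A minimal sketch of the synchronization the uncast step performs, assuming the offload stream object is at hand: pytorch queues the eventual free of an offload-stream tensor on that same stream, so after the compute stream has consumed the weight, the offload stream is made to wait for it; the free can then never run before the use.

import torch

def sync_offload_after_use(offload_stream):
    compute_stream = torch.cuda.current_stream()
    offload_stream.wait_stream(compute_stream)   # queued frees now follow the use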
ec4fc2a09a Fix case of weights not being unpinned. (#10533) 2025-10-29 15:48:06 -04:00
1a58087ac2 Reduce memory usage for fp8 scaled op. (#10531) 2025-10-29 15:43:51 -04:00
6c14f3afac use new API client in Luma and Minimax nodes (#10528) 2025-10-29 11:14:56 -07:00
e525673f72 Fix issue. (#10527) 2025-10-29 00:37:00 -04:00
3fa7a5c04a Speed up offloading using pinned memory. (#10526)
To enable this feature use: --fast pinned_memory
2025-10-29 00:21:01 -04:00
210f7a1ba5 convert nodes_recraft.py to V3 schema (#10507) 2025-10-28 14:38:05 -07:00
d202c2ba74 execution: Allow a subgraph nodes to execute multiple times (#10499)
In the case of --cache-none, lazy and subgraph execution can cause
anything to be run multiple times per workflow. If that rerun node is
itself a subgraph generator, this will crash for two reasons.

pending_subgraph_results[] does not clean up entries after their use.
So when a pending_subgraph_result is consumed, remove it from the list
so that if the corresponding node is fully re-executed it misses the
lookup and falls through to execute the node as it should.

Secondly, there is an explicit enforcement against dups in the
addition of subgraph nodes as ephemerals to the dynprompt. Remove this
enforcement as the use case is now valid.
2025-10-28 16:22:08 -04:00
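A minimal sketch of the first fix, with illustrative names: consume the pending subgraph result exactly once, so a fully re-executed node misses this lookup and falls through to normal execution instead of reusing a stale entry.

def take_pending_or_execute(pending_subgraph_results, node_id, execute_node):
    pending = pending_subgraph_results.pop(node_id, None)   # consume exactly once
    if pending is not None:
        return pending
    return execute_node(node_id)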
8817f8fc14 Mixed Precision Quantization System (#10498)
* Implement mixed precision operations with a registry design and metadata for the quant spec in the checkpoint.

* Updated design using Tensor Subclasses

* Fix FP8 MM

* An actually functional POC

* Remove CK reference and ensure correct compute dtype

* Update unit tests

* ruff lint

* Implement mixed precision operations with a registry design and metadata for the quant spec in the checkpoint.

* Updated design using Tensor Subclasses

* Fix FP8 MM

* An actually functional POC

* Remove CK reference and ensure correct compute dtype

* Update unit tests

* ruff lint

* Fix missing keys

* Rename quant dtype parameter

* Rename quant dtype parameter

* Fix unittests for CPU build
2025-10-28 16:20:53 -04:00
22e40d2ace Tell users to update their nvidia drivers if portable doesn't start. (#10518) 2025-10-28 15:08:08 -04:00
3bea4efc6b Tell users to update nvidia drivers if problem with portable. (#10510) 2025-10-28 04:45:45 -04:00
8cf2ba4ba6 Remove comfy api key from queue api. (#10502) 2025-10-28 03:23:52 -04:00
b61a40cbc9 Bump stable portable to cu130 python 3.13.9 (#10508) 2025-10-28 03:21:45 -04:00
f2bb3230b7 ComfyUI version v0.3.67 2025-10-28 03:03:59 -04:00
614b8d3345 frontend bump to 1.28.8 (#10506) 2025-10-28 03:01:13 -04:00
6abc30aae9 Update template to 0.2.4 (#10505) 2025-10-28 01:56:30 -04:00
55bad30375 feat(api-nodes): add LTXV API nodes (#10496) 2025-10-27 22:25:29 -07:00
c305deed56 Update template to 0.2.3 (#10503) 2025-10-27 22:24:16 -07:00
601ee1775a Add a bat to run comfyui portable without api nodes. (#10504) 2025-10-27 23:54:00 -04:00
c170fd2db5 Bump portable deps workflow to torch cu130 python 3.13.9 (#10493) 2025-10-26 20:23:01 -04:00
9d529e5308 fix(api-nodes): random issues on Windows by capturing general OSError for retries (#10486) 2025-10-25 23:51:06 -07:00
f6bbc1ac84 Fix mistake. (#10484) 2025-10-25 23:07:29 -04:00
098a352f13 Add warning for torch-directml usage (#10482)
Added a warning message about the state of torch-directml.
2025-10-25 20:05:22 -04:00
e86b79ab9e convert Gemini API nodes to V3 schema (#10476) 2025-10-25 14:35:30 -07:00
426cde37f1 Remove useless function (#10472) 2025-10-24 19:56:51 -04:00
dd5af0c587 convert Tripo API nodes to V3 schema (#10469) 2025-10-24 15:48:34 -07:00
388b306a2b feat(api-nodes): network client v2: async ops, cancellation, downloads, refactor (#10390)
* feat(api-nodes): implement new API client for V3 nodes

* feat(api-nodes): implement new API client for V3 nodes

* feat(api-nodes): implement new API client for V3 nodes

* converted WAN nodes to use new client; polishing

* fix(auth): do not leak authentification for the absolute urls

* convert BFL API nodes to use new API client; remove deprecated BFL nodes

* converted Google Veo nodes

* fix(Veo3.1 model): take into account "generate_audio" parameter
2025-10-23 22:37:16 -07:00
24188b3141 Update template to 0.2.2 (#10461)
Fix template typo issue
2025-10-24 01:36:30 -04:00
1bcda6df98 WIP way to support multi multi dimensional latents. (#10456) 2025-10-23 21:21:14 -04:00
a1864c01f2 Small readme improvement. (#10442) 2025-10-22 17:26:22 -04:00
4739d7717f execution: fold in dependency aware caching / Fix --cache-none with loops/lazy etc (Resubmit) (#10440)
* execution: fold in dependency aware caching

This makes --cache-none compatible with lazy and expanded
subgraphs.

Currently the --cache-none option is powered by the
DependencyAwareCache. The cache attempts to maintain a parallel
copy of the execution list data structure, however it is only
set up once at the start of execution and does not get meaningful
updates to the execution list.

This causes multiple problems when --cache-none is used with lazy
and expanded subgraphs as the DAC does not accurately update its
copy of the execution data structure.

DAC has an attempt to handle subgraphs ensure_subcache however
this does not accurately connect to nodes outside the subgraph.
The current semantics of DAC are to free a node ASAP after the
dependent nodes are executed.

This means that if a subgraph refs such a node it will be re-queued
and re-executed by the execution_list, but DAC won't see it in
its to-free lists anymore and will leak memory.

Rather than try and cover all the cases where the execution list
changes from inside the cache, move the whole problem to the
executor which maintains an always up-to-date copy of the wanted
data-structure.

The executor now has a fast-moving run-local cache of its own.
Each _to node has its own mini cache, and the cache is unconditionally
primed at the time of add_strong_link.

add_strong_link is called for all of static workflows, lazy links
and expanded subgraphs, so it is the singular source of truth for
output dependencies.

In the case of a cache-hit, the executor cache will hold the non-none
value (it will respect updates if they happen somehow as well).

In the case of a cache-miss, the executor caches a None and will
wait for a notification to update the value when the node completes.

When a node completes execution, it simply releases its mini-cache
and in turn its strong refs on its direct ancestor outputs, allowing
for ASAP freeing (same as the DependencyAwareCache but a little more
automatic).

This now allows for re-implementation of --cache-none with no cache
at all. The dependency aware cache was also observing the dependency
semantics for the objects and UI cache, which is not accurate (this
entire logic was always outputs specific).

This also prepares for more complex caching strategies (such as RAM
pressure based caching), where a cache can implement any freeing
strategy completely independently of the DependencyAwareness
requirement.

* main: re-implement --cache-none as no cache at all

The execution list now tracks the dependency aware caching more
correctly than the DependencyAwareCache.

Change it to a cache that does nothing.

* test_execution: add --cache-none to the test suite

--cache-none is now expected to work universally. Run it through the
full unit test suite. Propagate the server parameterization for whether
or not the server is capable of caching, so that the minority of tests
that specifically check for cache hits can if/else. Hard assert NOT
caching in the else branch to give some coverage that --cache-none
actually does not cache.
2025-10-22 15:49:05 -04:00
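A hypothetical sketch of the executor-local cache described above (the real structure in execution.py is more involved): every consuming node keeps a mini cache of its ancestors' outputs, primed at add_strong_link, filled when a missing node completes, and dropped when the consumer finishes.

class RunLocalCache:
    def __init__(self):
        self.per_node = {}      # to_node_id -> {from_node_id: output or None}

    def add_strong_link(self, from_node, to_node, known_outputs):
        # Prime unconditionally; a miss is stored as None and filled later.
        self.per_node.setdefault(to_node, {})[from_node] = known_outputs.get(from_node)

    def notify_completed(self, node_id, output):
        for mini in self.per_node.values():
            if node_id in mini and mini[node_id] is None:
                mini[node_id] = output

    def node_finished(self, node_id):
        # Release the mini cache and, with it, the strong refs on ancestor outputs.
        self.per_node.pop(node_id, None)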
f13cff0be6 Add custom node published subgraphs endpoint (#10438)
* Add get_subgraphs_dir to ComfyExtension and PUBLISHED_SUBGRAPH_DIRS to nodes.py

* Created initial endpoints, although the returned paths are a bit off currently

* Fix path and actually return real data

* Sanitize returned /api/global_subgraphs entries

* Remove leftover function from early prototyping

* Remove added whitespace

* Add None check for sanitize_entry
2025-10-21 23:16:16 -04:00
9cdc64998f Only disable cudnn on newer AMD GPUs. (#10437) 2025-10-21 19:15:23 -04:00
560b1bdfca ComfyUI version v0.3.66 2025-10-21 01:12:32 -04:00
b7992f871a Revert "execution: fold in dependency aware caching / Fix --cache-none with l…" (#10422)
This reverts commit b1467da480.
2025-10-20 19:03:06 -04:00
2c2aa409b0 Log message for cudnn disable on AMD. (#10418) 2025-10-20 15:43:24 -04:00
a4787ac83b Update template to 0.2.1 (#10413)
* Update template to 0.1.97

* Update template to 0.2.1
2025-10-20 15:28:36 -04:00
b5c59b763c Deprecation warning on unused files (#10387)
* only warn for unused files

* include internal extensions
2025-10-19 13:05:46 -07:00
b4f30bd408 Pytorch is stupid. (#10398) 2025-10-19 01:25:35 -04:00
dad076aee6 Speed up chroma radiance. (#10395) 2025-10-18 23:19:52 -04:00
0cf33953a7 Fix batch size above 1 giving bad output in chroma radiance. (#10394) 2025-10-18 23:15:34 -04:00
5b80addafd Turn off cuda malloc by default when --fast autotune is turned on. (#10393) 2025-10-18 22:35:46 -04:00
9da397ea2f Disable torch compiler for cast_bias_weight function (#10384)
* Disable torch compiler for cast_bias_weight function

* Fix torch compile.
2025-10-17 20:03:28 -04:00
92d97380bd Update Python 3.14 installation instructions (#10385)
Removed mention of installing pytorch nightly for Python 3.14.
2025-10-17 18:22:59 -04:00
99ce2a1f66 convert nodes_controlnet.py to V3 schema (#10202) 2025-10-17 14:13:05 -07:00
b1467da480 execution: fold in dependency aware caching / Fix --cache-none with loops/lazy etc (#10368)
* execution: fold in dependency aware caching

This makes --cache-none compatible with lazy and expanded
subgraphs.

Currently the --cache-none option is powered by the
DependencyAwareCache. The cache attempts to maintain a parallel
copy of the execution list data structure, however it is only
set up once at the start of execution and does not get meaningful
updates to the execution list.

This causes multiple problems when --cache-none is used with lazy
and expanded subgraphs as the DAC does not accurately update its
copy of the execution data structure.

DAC has an attempt to handle subgraphs ensure_subcache however
this does not accurately connect to nodes outside the subgraph.
The current semantics of DAC are to free a node ASAP after the
dependent nodes are executed.

This means that if a subgraph refs such a node it will be re-queued
and re-executed by the execution_list, but DAC won't see it in
its to-free lists anymore and will leak memory.

Rather than try and cover all the cases where the execution list
changes from inside the cache, move the whole problem to the
executor which maintains an always up-to-date copy of the wanted
data-structure.

The executor now has a fast-moving run-local cache of its own.
Each _to node has its own mini cache, and the cache is unconditionally
primed at the time of add_strong_link.

add_strong_link is called for all of static workflows, lazy links
and expanded subgraphs, so it is the singular source of truth for
output dependencies.

In the case of a cache-hit, the executor cache will hold the non-none
value (it will respect updates if they happen somehow as well).

In the case of a cache-miss, the executor caches a None and will
wait for a notification to update the value when the node completes.

When a node completes execution, it simply releases its mini-cache
and in turn its strong refs on its direct ancestor outputs, allowing
for ASAP freeing (same as the DependencyAwareCache but a little more
automatic).

This now allows for re-implementation of --cache-none with no cache
at all. The dependency aware cache was also observing the dependency
semantics for the objects and UI cache, which is not accurate (this
entire logic was always outputs specific).

This also prepares for more complex caching strategies (such as RAM
pressure based caching), where a cache can implement any freeing
strategy completely independently of the DependencyAwareness
requirement.

* main: re-implement --cache-none as no cache at all

The execution list now tracks the dependency aware caching more
correctly than the DependencyAwareCache.

Change it to a cache that does nothing.

* test_execution: add --cache-none to the test suite

--cache-none is now expected to work universally. Run it through the
full unit test suite. Propagate the server parameterization for whether
or not the server is capable of caching, so that the minority of tests
that specifically check for cache hits can if/else. Hard assert NOT
caching in the else branch to give some coverage that --cache-none
actually does not cache.
2025-10-17 13:55:15 -07:00
d8d60b5609 Do batch_slice in EasyCache's apply_cache_diff (#10376) 2025-10-17 00:39:37 -04:00
b1293d50ef workaround also works on cudnn 91200 (#10375) 2025-10-16 19:59:56 -04:00
19b466160c Workaround for nvidia issue where VAE uses 3x more memory on torch 2.9 (#10373) 2025-10-16 18:16:03 -04:00
bc0ad9bb49 fix(api-nodes): remove "veo2" model from Veo3 node (#10372) 2025-10-16 10:12:50 -07:00
4054b4bf38 feat: deprecated API alert (#10366) 2025-10-16 01:13:31 -07:00
55ac7d333c Bump frontend to 1.28.7 (#10364) 2025-10-15 20:30:39 -07:00
afa8a24fe1 refactor: Replace manual patches merging with merge_nested_dicts (#10360) 2025-10-15 17:16:09 -07:00
493b81e48f Fix order of inputs nested merge_nested_dicts (#10362) 2025-10-15 16:47:26 -07:00
6b035bfce2 Latest pytorch stable is cu130 (#10361) 2025-10-15 18:48:12 -04:00
74b7f0b04b feat(api-nodes): add Veo3.1 model (#10357) 2025-10-15 15:41:45 -07:00
f72c6616b2 Add TemporalScoreRescaling node (#10351)
* Add TemporalScoreRescaling node

* Mention image generation in tsr_k's tooltip
2025-10-15 18:12:25 -04:00
1c10b33f9b gfx942 doesn't support fp8 operations. (#10348) 2025-10-15 00:21:11 -04:00
ddfce1af4f Bump frontend to 1.28.6 (#10345) 2025-10-14 21:08:23 -04:00
7a883849ea api-nodes: fixed dynamic pricing format; import comfy_io directly (#10336) 2025-10-13 23:55:56 -07:00
84867067ea Python 3.14 instructions. (#10337) 2025-10-14 02:09:12 -04:00
3374e900d0 Faster workflow cancelling. (#10301) 2025-10-13 23:43:53 -04:00
51696e3fdc ComfyUI version 0.3.65 2025-10-13 23:39:55 -04:00
dfff7e5332 Better memory estimation for the SD/Flux VAE on AMD. (#10334) 2025-10-13 22:37:19 -04:00
e4ea393666 Fix loading old stable diffusion ckpt files on newer numpy. (#10333) 2025-10-13 22:18:58 -04:00
c8674bc6e9 Enable RDNA4 pytorch attention on ROCm 7.0 and up. (#10332) 2025-10-13 21:19:03 -04:00
3dfdcf66b6 convert nodes_hunyuan.py to V3 schema (#10136) 2025-10-13 12:36:26 -07:00
95ca2e56c8 WAN2.2: Fix cache VRAM leak on error (#10308)
Same change pattern as 7e8dd275c2
applied to WAN2.2

If this suffers an exception (such as a VRAM OOM) it will leave the
encode() and decode() methods, which skips the cleanup of the WAN
feature cache. The comfy node cache then ultimately keeps a reference
to this object, which is in turn reffing large tensors from the failed
execution.

The feature cache is currently set up as a class variable on the
encoder/decoder; however, the encode and decode functions always clear
it on both entry and exit of normal execution.

It's likely the design intent is that this is usable as a streaming
encoder where the input comes in batches; however, the functions as
they are today don't support that.

So simplify by bringing the cache back to a local variable, so that if
it does VRAM OOM the cache itself is properly garbage collected when the
encode()/decode() functions disappear from the stack.
2025-10-13 15:23:11 -04:00
27ffd12c45 add indent=4 kwarg to json.dumps() (#10307) 2025-10-13 12:14:52 -07:00
e693e4db6a Always set diffusion model to eval() mode. (#10331) 2025-10-13 14:57:27 -04:00
d68ece7301 Update the extra_model_paths.yaml.example (#10319) 2025-10-12 23:54:41 -04:00
894837de9a update extra models paths example (#10316) 2025-10-12 23:35:33 -04:00
fdc92863b6 Update node docs to 0.3.0 (#10318) 2025-10-12 23:32:02 -04:00
a125cd84b0 Improve AMD performance. (#10302)
I honestly have no idea why this improves things, but it does.
2025-10-12 00:28:01 -04:00
84e9ce32c6 Implement the mmaudio VAE. (#10300) 2025-10-11 22:57:23 -04:00
f43b8ab2a2 Update template to 0.1.95 (#10294) 2025-10-11 10:27:22 -07:00
14d642acd6 feat(api-nodes): add price extractor feature; small fixes to Kling & Pika nodes (#10284) 2025-10-10 16:21:40 -07:00
aa895db7e8 feat(GeminiImage-ApiNode): add aspect_ratio and release version of model (#10255) 2025-10-10 16:17:20 -07:00
cdfc25a160 Fix save audio nodes saving mono audio as stereo. (#10289) 2025-10-10 17:33:51 -04:00
81e4dac107 convert nodes_upscale_model.py to V3 schema (#10149) 2025-10-09 16:08:40 -07:00
90853fb9cd convert nodes_flux to V3 schema (#10122) 2025-10-09 16:07:17 -07:00
f1dd6e50f8 Fix bug with applying loras on fp8 scaled without fp8 ops. (#10279) 2025-10-09 19:02:40 -04:00
fc0fbf141c convert nodes_sd3.py and nodes_slg.py to V3 schema (#10162) 2025-10-09 15:18:23 -07:00
f3d5d328a3 fix(v3,api-nodes): V3 schema typing; corrected Pika API nodes (#10265) 2025-10-09 15:15:03 -07:00
139addd53c More surgical fix for #10267 (#10276) 2025-10-09 16:37:35 -04:00
cbee7d3390 convert nodes_latent.py to V3 schema (#10160) 2025-10-08 23:14:00 -07:00
6732014a0a convert nodes_compositing.py to V3 schema (#10174) 2025-10-08 23:13:15 -07:00
989f715d92 convert nodes_lora_extract.py to V3 schema (#10182) 2025-10-08 23:11:45 -07:00
2ba8d7cce8 convert nodes_model_downscale.py to V3 schema (#10199) 2025-10-08 23:10:23 -07:00
51fb505ffa feat(api-nodes, pylint): use lazy formatting in logging functions (#10248) 2025-10-08 23:06:56 -07:00
72c2071972 Mvly/node update (#10042)
* updated V2V node to allow for control image input
exposing steps in v2v
fixing guidance_scale as input parameter

TODO: allow for motion_intensity as input param.

* refactor: comment out unsupported resolution and adjust default values in video nodes

* set control_after_generate

* adding new defaults

* fixes

* changed control_after_generate back to True

* changed control_after_generate back to False

---------

Co-authored-by: thorsten <thorsten@tripod-digital.co.nz>
2025-10-08 20:30:41 -04:00
6e59934089 Refactor model sampling sigmas code. (#10250) 2025-10-08 17:49:02 -04:00
3e0eb8d33f feat(V3-io): allow Enum classes for Combo options (#10237) 2025-10-08 00:14:04 -07:00
637221995f ComfyUI version 0.3.64 2025-10-08 00:53:43 -04:00
51697d50dc update template to 0.1.94 (#10253) 2025-10-07 19:48:51 -07:00
19f595b788 Bump frontend to 1.27.10 (#10252) 2025-10-07 17:54:00 -07:00
8a15568f10 Temp fix for LTXV custom nodes. (#10251) 2025-10-07 19:55:23 -04:00
9e984c48bc feat(api-nodes): add Sora2 API node (#10249) 2025-10-07 14:11:37 -07:00
fc34c3d112 fix(ReCraft-API-node): allow custom multipart parser to return FormData (#10244) 2025-10-07 13:15:32 -07:00
8aea746212 Implement gemma 3 as a text encoder. (#10241)
Not useful yet.
2025-10-06 22:08:08 -04:00
8c19910427 convert nodes_kling.py to V3 schema (#10236) 2025-10-06 16:26:52 -07:00
e77e0a8f8f convert nodes_pika.py to V3 schema (#10216) 2025-10-06 16:20:26 -07:00
a49007a7b0 fix(api-nodes): allow negative_prompt PixVerse to be multiline (#10196) 2025-10-06 16:13:43 -07:00
6ae3515801 fix(api-nodes): enable more pylint rules (#10213) 2025-10-06 16:05:57 -07:00
6bd3f8eb9f ComfyUI version 0.3.63 2025-10-06 14:49:04 -04:00
7326e46dee Update template to 0.1.93 (#10235)
* Update template to 0.1.92

* Update template to 0.1.93
2025-10-06 10:57:00 -07:00
195e0b0639 Remove useless code. (#10223) 2025-10-05 15:41:19 -04:00
187f43696d fix(api-nodes): disable "std" mode for Kling2.5-turbo (#10212) 2025-10-04 23:34:18 -07:00
caf07331ff Remove soundfile dependency. No more torchaudio load or save. (#10210) 2025-10-04 22:05:05 -04:00
b1fa1922df convert nodes_stable3d.py to V3 schema (#10204) 2025-10-04 12:33:48 -07:00
2ed74f7ac7 convert nodes_rodin.py to V3 schema (#10195) 2025-10-04 12:29:09 -07:00
22f99fb97e fix(api-nodes): enable 2 more pylint rules, removed non needed code (#10192) 2025-10-04 12:22:57 -07:00
bbd683098e Add instructions to install nightly AMD pytorch for windows. (#10190)
* Add instructions to install nightly AMD pytorch for windows.

* Update README.md
2025-10-03 23:37:43 -04:00
08726b64fe Update amd nightly command in readme. (#10189) 2025-10-03 18:22:43 -04:00
93d859cfaa Fix type annotation syntax in MotionEncoder_tc __init__ (#10186)
## Summary
Fixed incorrect type hint syntax in `MotionEncoder_tc.__init__()` parameter list.

## Changes
- Line 647: Changed `num_heads=int` to `num_heads: int` 
- This corrects the parameter annotation from a default value assignment to proper type hint syntax

## Details
The parameter was using assignment syntax (`=`) instead of type annotation syntax (`:`), which would incorrectly set the default value to the `int` class itself rather than annotating the expected type.
2025-10-03 14:32:19 -07:00
4614ee09ca convert nodes_edit_model.py to V3 schema (#10147) 2025-10-03 13:24:42 -07:00
5c8e986e27 convert nodes_tomesd.py to V3 schema (#10180) 2025-10-03 11:50:38 -07:00
8c26d7bbe6 convert nodes_pixverse.py to V3 schema (#10177) 2025-10-03 11:48:21 -07:00
d7aa414141 convert nodes_eps.py to V3 schema (#10172) 2025-10-03 11:45:02 -07:00
3e68bc342c convert nodes_torch_compile.py to V3 schema (#10173) 2025-10-03 11:43:54 -07:00
c2c5a7d5f8 fix(api-nodes): bad indentation in Recraft API node function (#10175) 2025-10-03 11:41:06 -07:00
8a293372ec fix(api-nodes): reimport of base64 in Gemini node (#10181) 2025-10-03 11:40:27 -07:00
ed3ca78e08 feat(api-nodes): add kling-2-5-turbo to txt2video and img2video nodes (#10155) 2025-10-03 11:26:34 -07:00
4ffea0e864 feat(linter, api-nodes): add pylint for comfy_api_nodes folder (#10157) 2025-10-02 19:14:28 -04:00
1395bce9f7 update example_node to use V3 schema (#9723) 2025-10-02 15:20:29 -07:00
e9364ee279 Turn on TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL by default. (#10168) 2025-10-02 17:57:15 -04:00
f6e3e9a456 fix(api-nodes): made logging path to be smaller (#10156) 2025-10-02 14:50:31 -07:00
8f4ee9984c convert nodes_morphology.py to V3 schema (#10159) 2025-10-02 13:53:00 -07:00
0e9d1724be Add a .bat to the AMD portable to disable smart memory. (#10153) 2025-10-02 00:33:05 -04:00
4965c0e2ac WAN: Fix cache VRAM leak on error (#10141)
If this suffers an exception (such as a VRAM OOM) it will leave the
encode() and decode() methods, which skips the cleanup of the WAN
feature cache. The comfy node cache then ultimately keeps a reference
to this object, which in turn holds references to large tensors from
the failed execution.

The feature cache is currently set up as a class variable on the
encoder/decoder; however, the encode and decode functions always clear
it on both entry and exit of normal execution.

The likely design intent is that this is usable as a streaming encoder
where the input comes in batches, but the functions as they are
today don't support that.

So simplify by making the cache a local variable again, so that if
a VRAM OOM does occur the cache is properly garbage collected when the
encode()/decode() functions disappear from the stack.
2025-10-01 18:42:16 -04:00
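To illustrate the point in this commit: a cache stored as an attribute on a long-lived object keeps its tensors alive if an exception escapes, while a plain local is collected as soon as the frame unwinds. A minimal, hypothetical sketch (not the actual WAN encoder/decoder code):
```python
import torch

class ToyDecoder:
    def decode_leaky(self, frames):
        # Cache stored on the object: if an OOM escapes mid-loop, the cache
        # (and the large tensors it references) stays alive for as long as
        # something, e.g. a node cache, still references this object.
        self.feat_cache = []
        for f in frames:
            self.feat_cache.append(f * 2)   # stand-in for conv feature maps
        out = torch.stack(self.feat_cache)
        self.feat_cache = []                # cleanup never runs on an OOM above
        return out

    def decode_safe(self, frames):
        # Cache as a local variable: when an exception unwinds this frame,
        # the cache becomes unreachable and its tensors can be freed.
        feat_cache = []
        for f in frames:
            feat_cache.append(f * 2)
        return torch.stack(feat_cache)
```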
911331c06c sd: fix VAE tiled fallback VRAM leak (#10139)
When the VAE catches this VRAM OOM, it launches the fallback logic
straight from the exception context.

Python, however, keeps references to the entire call stack that caused
the exception, including any local variables, for the sake of exception
reporting and debugging. In the case of tensors, this can hold on to
references to GBs of VRAM and prevent the allocator from freeing them.

So drop the except context completely before going back to the VAE
via the tiler, by getting out of the except block with nothing but
a flag.

This greatly increases the reliability of the tiler fallback,
especially on low-VRAM cards: with the bug, if the leak held on to
more than the headroom needed for a single tile, the tiler fallback
would OOM and fail the flow.
2025-10-01 18:40:28 -04:00
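A minimal sketch of the flag pattern described above (decode_full and decode_tiled are placeholders, not the real VAE methods):
```python
import torch

def vae_decode_with_fallback(samples, decode_full, decode_tiled):
    use_tiled = False
    try:
        return decode_full(samples)
    except torch.cuda.OutOfMemoryError:
        # Calling the tiler from inside this block would keep the failed
        # call stack (and its tensors) alive via the active exception.
        # Record a flag and leave the except block first.
        use_tiled = True
    if use_tiled:
        return decode_tiled(samples)
```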
bb32d4ec31 feat: Add Epsilon Scaling node for exposure bias correction (#10132) 2025-10-01 17:59:07 -04:00
a6f83a4a1a Support the new hunyuan vae. (#10150) 2025-10-01 17:19:13 -04:00
e4f99b479a convert nodes_ip2p.pt to V3 schema (#10097) 2025-10-01 12:20:30 -07:00
d9c0a4053d convert nodes_lt.py to V3 schema (#10084) 2025-10-01 12:19:56 -07:00
11bab7be76 convert nodes_pag.py to V3 schema (#10080) 2025-10-01 12:18:49 -07:00
3af1881455 convert nodes_optimalsteps.py to V3 schema (#10074) 2025-10-01 12:18:04 -07:00
e0210ce0a7 convert nodes_differential_diffusion.py to V3 schema (#10056) 2025-10-01 12:17:33 -07:00
7eb7160db4 convert nodes_gits.py to V3 schema (#9949) 2025-10-01 12:16:59 -07:00
638097829d convert nodes_audio_encoder.py to V3 schema (#10123) 2025-09-30 23:00:22 -07:00
c4a8cf60ab Bump frontend to 1.27.7 (#10133) 2025-09-30 22:12:32 -07:00
bab8ba20bf ComfyUI version 0.3.62. 2025-09-30 15:12:07 -04:00
b682a73c55 enable Seedance Pro model in the FirstLastFrame node (#10120) 2025-09-30 10:43:41 -07:00
631b9ae861 fix(Rodin3D-Gen2): missing "task_uuid" parameter (#10128) 2025-09-30 10:21:47 -07:00
f48d7230de Add new portable links to readme. (#10112) 2025-09-30 12:17:49 -04:00
6e079abc3a Workflow permission fix. (#10110) 2025-09-29 23:11:37 -04:00
977a4ed8c5 ComfyUI version 0.3.61 2025-09-29 23:04:42 -04:00
414a178fb6 Add basic readme for AMD portable. (#10109) 2025-09-29 23:03:02 -04:00
447884b657 Make stable release workflow callable. (#10108) 2025-09-29 20:37:51 -04:00
bed4b49d08 Add action to do the full stable release. (#10107) 2025-09-29 20:31:15 -04:00
342cf644ce Add a way to have different names for stable nvidia portables. (#10106) 2025-09-29 20:05:44 -04:00
3758848423 Different base files for nvidia and amd portables. (#10105) 2025-09-29 19:54:37 -04:00
0db6aabed3 Different base files for different release. (#10104) 2025-09-29 19:54:05 -04:00
1673ace19b Make the final release test optional in the stable release action. (#10103) 2025-09-29 19:08:42 -04:00
7f38e4c538 Add action to create cached deps with manually specified torch. (#10102) 2025-09-29 17:27:52 -04:00
8accf50908 convert nodes_mahiro.py to V3 schema (#10070) 2025-09-29 12:35:51 -07:00
ed0f4a609b dont cache new locale entry points (#10101) 2025-09-29 12:16:02 -07:00
041b8824f5 convert nodes_perpneg.py to V3 schema (#10081) 2025-09-29 12:05:28 -07:00
b1111c2062 convert nodes_mochi.py to V3 schema (#10069) 2025-09-29 12:03:35 -07:00
05a258efd8 add WanImageToImageApi node (#10094) 2025-09-29 12:01:04 -07:00
c8276f8c6b Update template to 0.1.91 (#10096) 2025-09-29 11:59:42 -07:00
6ec1cfe101 [Rodin3d api nodes] Updated the name of the save file path (changed from timestamp to UUID). (#10011)
* Update savepath name from time to uuid

* delete lib
2025-09-29 11:59:12 -07:00
b60dc31627 Update command to install latest nighly pytorch. (#10085) 2025-09-28 13:41:32 -04:00
555f902fc1 Fix stable workflow creating multiple draft releases. (#10067) 2025-09-27 22:43:25 -04:00
1364548c72 feat: ComfyUI can be run on the specified Ascend NPU (#9663)
* feature: Set the Ascend NPU to use a single one

* Enable the `--cuda-device` parameter to support both CUDA and Ascend NPUs simultaneously.

* Make the code just set the ASCENT_RT_VISIBLE_DEVICES environment variable without any other edits to master branch

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
2025-09-27 22:36:02 -04:00
2dadb34860 convert nodes_hypertile.py to V3 schema (#10061) 2025-09-27 19:16:22 -07:00
1cf86f5ae5 convert nodes_lumina2.py to V3 schema (#10058) 2025-09-27 19:12:51 -07:00
a1127b232d convert nodes_lotus.py to V3 schema (#10057) 2025-09-27 19:11:36 -07:00
896f2e653c Fix typo in release workflow. (#10066) 2025-09-27 21:30:35 -04:00
40ae495ddc Improvements to the stable release workflow. (#10065) 2025-09-27 20:28:49 -04:00
653ceab414 Reduce Peak WAN inference VRAM usage - part II (#10062)
* flux: math: Use _addcmul to avoid expensive VRAM intermediate

The rope process can be the VRAM peak, and allocating this intermediate
for the addition result before the original is released can OOM.
Use addcmul_ for it instead.

* wan: Delete the self attention before cross attention

This saves VRAM when the cross attention and FFN are in play as the
VRAM peak.
2025-09-27 18:14:16 -04:00
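A toy illustration of the addcmul_ point (not the actual rope code): the fused in-place op avoids materializing the `a * b` intermediate that `x + a * b` allocates.
```python
import torch

a = torch.randn(1024, 1024)
b = torch.randn(1024, 1024)
x = torch.randn(1024, 1024)

y = x + a * b        # allocates a full-size intermediate for a * b
x.addcmul_(a, b)     # fused multiply-add in place, no extra intermediate

torch.testing.assert_close(x, y)
```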
160698eb41 convert nodes_qwen.py to V3 schema (#10049) 2025-09-27 12:25:35 -07:00
7eca95657c convert nodes_photomaker.py to V3 schema (#10017) 2025-09-27 02:36:43 -07:00
ad5aef2d0c convert nodes_pixart.py to V3 schema (#10019) 2025-09-27 02:34:32 -07:00
bcfd80dd79 convert nodes_luma.py to V3 schema (#10030) 2025-09-27 02:28:11 -07:00
6b4b671ce7 convert nodes_bfl.py to V3 schema (#10033) 2025-09-27 02:27:01 -07:00
a9cf1cd249 convert nodes_hidream.py to V3 schema (#9946) 2025-09-26 23:13:05 -07:00
255572188f Add workflow templates version tracking to system_stats (#9089)
Adds installed and required workflow templates version information to the
/system_stats endpoint, allowing the frontend to detect and notify users
when their templates package is outdated.

- Add get_installed_templates_version() and get_required_templates_version()
  methods to FrontendManager
- Include templates version info in system_stats response
- Add comprehensive unit tests for the new functionality
2025-09-26 21:29:13 -07:00
0572029fee Update template to 0.1.88 (#10046) 2025-09-26 21:18:16 -07:00
365 changed files with 45235 additions and 24194 deletions

View File

@ -53,6 +53,16 @@ try:
repo.stash(ident)
except KeyError:
print("nothing to stash") # noqa: T201
except:
print("Could not stash, cleaning index and trying again.") # noqa: T201
repo.state_cleanup()
repo.index.read_tree(repo.head.peel().tree)
repo.index.write()
try:
repo.stash(ident)
except KeyError:
print("nothing to stash.") # noqa: T201
backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S'))
print("creating backup branch: {}".format(backup_branch_name)) # noqa: T201
try:
@ -66,8 +76,10 @@ if branch is None:
try:
ref = repo.lookup_reference('refs/remotes/origin/master')
except:
print("pulling.") # noqa: T201
pull(repo)
print("fetching.") # noqa: T201
for remote in repo.remotes:
if remote.name == "origin":
remote.fetch()
ref = repo.lookup_reference('refs/remotes/origin/master')
repo.checkout(ref)
branch = repo.lookup_branch('master')
@ -149,3 +161,4 @@ try:
shutil.copy(stable_update_script, stable_update_script_to)
except:
pass

View File

@ -0,0 +1,28 @@
As of the time of writing this you need this driver for best results:
https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
HOW TO RUN:
If you have an AMD gpu:
run_amd_gpu.bat
If you have memory issues you can try disabling the smart memory management by running comfyui with:
run_amd_gpu_disable_smart_memory.bat
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors
RECOMMENDED WAY TO UPDATE:
To update the ComfyUI code: update\update_comfyui.bat
TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI:
In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.

View File

@ -1,2 +1,2 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
pause

View File

@ -0,0 +1,3 @@
..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes
echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
pause

View File

@ -0,0 +1,3 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
pause

View File

@ -0,0 +1,3 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. If you get a c10.dll error you need to install vc redist that you can find: https://aka.ms/vc14/vc_redist.x64.exe
pause

View File

@ -8,13 +8,15 @@ body:
Before submitting a **Bug Report**, please ensure the following:
- **1:** You are running the latest version of ComfyUI.
- **2:** You have looked at the existing bug reports and made sure this isn't already reported.
- **2:** You have your ComfyUI logs and relevant workflow on hand and will post them in this bug report.
- **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing
`--disable-all-custom-nodes` command line argument.
`--disable-all-custom-nodes` command line argument. If you have custom nodes, try updating them to the latest version.
- **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact
steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.
If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
## Very Important
Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored.
- type: checkboxes
id: custom-nodes-test
attributes:

View File

@ -0,0 +1,21 @@
<!-- API_NODE_PR_CHECKLIST: do not remove -->
## API Node PR Checklist
### Scope
- [ ] **Is API Node Change**
### Pricing & Billing
- [ ] **Need pricing update**
- [ ] **No pricing update**
If **Need pricing update**:
- [ ] Metronome rate cards updated
- [ ] Autobilling tests updated and passing
### QA
- [ ] **QA done**
- [ ] **QA not required**
### Comms
- [ ] Informed **Kosinkadink**

58
.github/workflows/api-node-template.yml vendored Normal file
View File

@ -0,0 +1,58 @@
name: Append API Node PR template
on:
pull_request_target:
types: [opened, reopened, synchronize, ready_for_review]
paths:
- 'comfy_api_nodes/**' # only run if these files changed
permissions:
contents: read
pull-requests: write
jobs:
inject:
runs-on: ubuntu-latest
steps:
- name: Ensure template exists and append to PR body
uses: actions/github-script@v7
with:
script: |
const { owner, repo } = context.repo;
const number = context.payload.pull_request.number;
const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md';
const marker = '<!-- API_NODE_PR_CHECKLIST: do not remove -->';
const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number });
let templateText;
try {
const res = await github.rest.repos.getContent({
owner,
repo,
path: templatePath,
ref: pr.base.ref
});
const buf = Buffer.from(res.data.content, res.data.encoding || 'base64');
templateText = buf.toString('utf8');
} catch (e) {
core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`);
return;
}
// Enforce the presence of the marker inside the template (for idempotence)
if (!templateText.includes(marker)) {
core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`);
return;
}
// If the PR already contains the marker, do not append again.
const body = pr.body || '';
if (body.includes(marker)) {
core.info('Template already present in PR body; nothing to inject.');
return;
}
const newBody = (body ? body + '\n\n' : '') + templateText + '\n';
await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody });
core.notice('API Node template appended to PR description.');

View File

@ -0,0 +1,78 @@
name: "Release Stable All Portable Versions"
on:
workflow_dispatch:
inputs:
git_tag:
description: 'Git tag'
required: true
type: string
jobs:
release_nvidia_default:
permissions:
contents: "write"
packages: "write"
pull-requests: "read"
name: "Release NVIDIA Default (cu130)"
uses: ./.github/workflows/stable-release.yml
with:
git_tag: ${{ inputs.git_tag }}
cache_tag: "cu130"
python_minor: "13"
python_patch: "9"
rel_name: "nvidia"
rel_extra_name: ""
test_release: true
secrets: inherit
release_nvidia_cu128:
permissions:
contents: "write"
packages: "write"
pull-requests: "read"
name: "Release NVIDIA cu128"
uses: ./.github/workflows/stable-release.yml
with:
git_tag: ${{ inputs.git_tag }}
cache_tag: "cu128"
python_minor: "12"
python_patch: "10"
rel_name: "nvidia"
rel_extra_name: "_cu128"
test_release: true
secrets: inherit
release_nvidia_cu126:
permissions:
contents: "write"
packages: "write"
pull-requests: "read"
name: "Release NVIDIA cu126"
uses: ./.github/workflows/stable-release.yml
with:
git_tag: ${{ inputs.git_tag }}
cache_tag: "cu126"
python_minor: "12"
python_patch: "10"
rel_name: "nvidia"
rel_extra_name: "_cu126"
test_release: true
secrets: inherit
release_amd_rocm:
permissions:
contents: "write"
packages: "write"
pull-requests: "read"
name: "Release AMD ROCm 7.1.1"
uses: ./.github/workflows/stable-release.yml
with:
git_tag: ${{ inputs.git_tag }}
cache_tag: "rocm711"
python_minor: "12"
python_patch: "10"
rel_name: "amd"
rel_extra_name: ""
test_release: false
secrets: inherit

View File

@ -21,3 +21,28 @@ jobs:
- name: Run Ruff
run: ruff check .
pylint:
name: Run Pylint
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
- name: Install Pylint
run: pip install pylint
- name: Run Pylint
run: pylint comfy_api_nodes

View File

@ -2,17 +2,17 @@
name: "Release Stable Version"
on:
workflow_dispatch:
workflow_call:
inputs:
git_tag:
description: 'Git tag'
required: true
type: string
cu:
description: 'CUDA version'
cache_tag:
description: 'Cached dependencies tag'
required: true
type: string
default: "129"
default: "cu129"
python_minor:
description: 'Python minor version'
required: true
@ -23,7 +23,57 @@ on:
required: true
type: string
default: "6"
rel_name:
description: 'Release name'
required: true
type: string
default: "nvidia"
rel_extra_name:
description: 'Release extra name'
required: false
type: string
default: ""
test_release:
description: 'Test Release'
required: true
type: boolean
default: true
workflow_dispatch:
inputs:
git_tag:
description: 'Git tag'
required: true
type: string
cache_tag:
description: 'Cached dependencies tag'
required: true
type: string
default: "cu129"
python_minor:
description: 'Python minor version'
required: true
type: string
default: "13"
python_patch:
description: 'Python patch version'
required: true
type: string
default: "6"
rel_name:
description: 'Release name'
required: true
type: string
default: "nvidia"
rel_extra_name:
description: 'Release extra name'
required: false
type: string
default: ""
test_release:
description: 'Test Release'
required: true
type: boolean
default: true
jobs:
package_comfy_windows:
@ -42,15 +92,15 @@ jobs:
id: cache
with:
path: |
cu${{ inputs.cu }}_python_deps.tar
${{ inputs.cache_tag }}_python_deps.tar
update_comfyui_and_python_dependencies.bat
key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }}
- shell: bash
run: |
mv cu${{ inputs.cu }}_python_deps.tar ../
mv ${{ inputs.cache_tag }}_python_deps.tar ../
mv update_comfyui_and_python_dependencies.bat ../
cd ..
tar xf cu${{ inputs.cu }}_python_deps.tar
tar xf ${{ inputs.cache_tag }}_python_deps.tar
pwd
ls
@ -65,12 +115,19 @@ jobs:
echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
./python.exe get-pip.py
./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/*
grep comfy ../ComfyUI/requirements.txt > ./requirements_comfyui.txt
./python.exe -s -m pip install -r requirements_comfyui.txt
rm requirements_comfyui.txt
sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
rm ./Lib/site-packages/torch/lib/libprotoc.lib
rm ./Lib/site-packages/torch/lib/libprotobuf.lib
if test -f ./Lib/site-packages/torch/lib/dnnl.lib; then
rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
rm ./Lib/site-packages/torch/lib/libprotoc.lib
rm ./Lib/site-packages/torch/lib/libprotobuf.lib
fi
cd ..
@ -85,14 +142,18 @@ jobs:
mkdir update
cp -r ComfyUI/.ci/update_windows/* ./update/
cp -r ComfyUI/.ci/windows_base_files/* ./
cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
cp ../update_comfyui_and_python_dependencies.bat ./update/
cd ..
"C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z
mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z
- shell: bash
if: ${{ inputs.test_release }}
run: |
cd ..
cd ComfyUI_windows_portable
python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu
@ -101,10 +162,9 @@ jobs:
ls
- name: Upload binaries to release
uses: svenstaro/upload-release-action@v2
uses: softprops/action-gh-release@v2
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ComfyUI_windows_portable_nvidia.7z
tag: ${{ inputs.git_tag }}
overwrite: true
files: ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z
tag_name: ${{ inputs.git_tag }}
draft: true
overwrite_files: true

View File

@ -1,173 +0,0 @@
name: Asset System Tests
on:
push:
paths:
- 'app/**'
- 'tests-assets/**'
- '.github/workflows/test-assets.yml'
- 'requirements.txt'
pull_request:
branches: [master]
workflow_dispatch:
permissions:
contents: read
env:
PIP_DISABLE_PIP_VERSION_CHECK: '1'
PYTHONUNBUFFERED: '1'
jobs:
sqlite:
name: SQLite (${{ matrix.sqlite_mode }}) • Python ${{ matrix.python }}
runs-on: ubuntu-latest
timeout-minutes: 40
strategy:
fail-fast: false
matrix:
python: ['3.9', '3.12']
sqlite_mode: ['memory', 'file']
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install dependencies
run: |
python -m pip install -U pip wheel
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install pytest pytest-aiohttp pytest-asyncio
- name: Set deterministic test base dir
id: basedir
shell: bash
run: |
BASE="$RUNNER_TEMP/comfyui-assets-tests-${{ matrix.python }}-${{ matrix.sqlite_mode }}-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ASSETS_TEST_BASE_DIR=$BASE" >> "$GITHUB_ENV"
echo "ASSETS_TEST_LOGS=$BASE/logs" >> "$GITHUB_ENV"
mkdir -p "$BASE/logs"
echo "ASSETS_TEST_BASE_DIR=$BASE"
- name: Set DB URL for SQLite
id: setdb
shell: bash
run: |
if [ "${{ matrix.sqlite_mode }}" = "memory" ]; then
echo "ASSETS_TEST_DB_URL=sqlite+aiosqlite:///:memory:" >> "$GITHUB_ENV"
else
DBFILE="$RUNNER_TEMP/assets-tests.sqlite"
mkdir -p "$(dirname "$DBFILE")"
echo "ASSETS_TEST_DB_URL=sqlite+aiosqlite:///$DBFILE" >> "$GITHUB_ENV"
fi
- name: Run tests
run: python -m pytest tests-assets
- name: Show ComfyUI logs
if: always()
shell: bash
run: |
echo "==== ASSETS_TEST_BASE_DIR: $ASSETS_TEST_BASE_DIR ===="
echo "==== ASSETS_TEST_LOGS: $ASSETS_TEST_LOGS ===="
ls -la "$ASSETS_TEST_LOGS" || true
for f in "$ASSETS_TEST_LOGS"/stdout.log "$ASSETS_TEST_LOGS"/stderr.log; do
if [ -f "$f" ]; then
echo "----- BEGIN $f -----"
sed -n '1,400p' "$f"
echo "----- END $f -----"
fi
done
- name: Upload ComfyUI logs
if: always()
uses: actions/upload-artifact@v4
with:
name: asset-logs-sqlite-${{ matrix.sqlite_mode }}-py${{ matrix.python }}
path: ${{ env.ASSETS_TEST_LOGS }}/*.log
if-no-files-found: warn
postgres:
name: PostgreSQL ${{ matrix.pgsql }} • Python ${{ matrix.python }}
runs-on: ubuntu-latest
timeout-minutes: 40
strategy:
fail-fast: false
matrix:
python: ['3.9', '3.12']
pgsql: ['16', '18']
services:
postgres:
image: postgres:${{ matrix.pgsql }}
env:
POSTGRES_DB: assets
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5432:5432
options: >-
--health-cmd "pg_isready -U postgres -d assets"
--health-interval 10s
--health-timeout 5s
--health-retries 12
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install dependencies
run: |
python -m pip install -U pip wheel
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install pytest pytest-aiohttp pytest-asyncio
pip install greenlet psycopg
- name: Set deterministic test base dir
id: basedir
shell: bash
run: |
BASE="$RUNNER_TEMP/comfyui-assets-tests-${{ matrix.python }}-${{ matrix.sqlite_mode }}-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ASSETS_TEST_BASE_DIR=$BASE" >> "$GITHUB_ENV"
echo "ASSETS_TEST_LOGS=$BASE/logs" >> "$GITHUB_ENV"
mkdir -p "$BASE/logs"
echo "ASSETS_TEST_BASE_DIR=$BASE"
- name: Set DB URL for PostgreSQL
shell: bash
run: |
echo "ASSETS_TEST_DB_URL=postgresql+psycopg://postgres:postgres@localhost:5432/assets" >> "$GITHUB_ENV"
- name: Run tests
run: python -m pytest tests-assets
- name: Show ComfyUI logs
if: always()
shell: bash
run: |
echo "==== ASSETS_TEST_BASE_DIR: $ASSETS_TEST_BASE_DIR ===="
echo "==== ASSETS_TEST_LOGS: $ASSETS_TEST_LOGS ===="
ls -la "$ASSETS_TEST_LOGS" || true
for f in "$ASSETS_TEST_LOGS"/stdout.log "$ASSETS_TEST_LOGS"/stderr.log; do
if [ -f "$f" ]; then
echo "----- BEGIN $f -----"
sed -n '1,400p' "$f"
echo "----- END $f -----"
fi
done
- name: Upload ComfyUI logs
if: always()
uses: actions/upload-artifact@v4
with:
name: asset-logs-pgsql-${{ matrix.pgsql }}-py${{ matrix.python }}
path: ${{ env.ASSETS_TEST_LOGS }}/*.log
if-no-files-found: warn

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}

View File

@ -5,6 +5,7 @@ on:
push:
branches:
- master
- release/**
paths-ignore:
- 'app/**'
- 'input/**'
@ -21,14 +22,15 @@ jobs:
fail-fast: false
matrix:
# os: [macos, linux, windows]
os: [macos, linux]
python_version: ["3.9", "3.10", "3.11", "3.12"]
# os: [macos, linux]
os: [linux]
python_version: ["3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["stable"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
# - os: macos
# runner_label: [self-hosted, macOS]
# flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
@ -73,14 +75,15 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [macos, linux]
# os: [macos, linux]
os: [linux]
python_version: ["3.11"]
cuda_version: ["12.1"]
torch_version: ["nightly"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
# - os: macos
# runner_label: [self-hosted, macOS]
# flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""

View File

@ -2,9 +2,9 @@ name: Execution Tests
on:
push:
branches: [ main, master ]
branches: [ main, master, release/** ]
pull_request:
branches: [ main, master ]
branches: [ main, master, release/** ]
jobs:
test:

View File

@ -2,9 +2,9 @@ name: Test server launches without errors
on:
push:
branches: [ main, master ]
branches: [ main, master, release/** ]
pull_request:
branches: [ main, master ]
branches: [ main, master, release/** ]
jobs:
test:
@ -13,7 +13,7 @@ jobs:
- name: Checkout ComfyUI
uses: actions/checkout@v4
with:
repository: "comfyanonymous/ComfyUI"
repository: "Comfy-Org/ComfyUI"
path: "ComfyUI"
- uses: actions/setup-python@v4
with:
@ -32,7 +32,9 @@ jobs:
working-directory: ComfyUI
- name: Check for unhandled exceptions in server log
run: |
if grep -qE "Exception|Error" console_output.log; then
grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': True, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" console_output.log | grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': False, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" > console_output_filtered.log
cat console_output_filtered.log
if grep -qE "Exception|Error" console_output_filtered.log; then
echo "Unhandled exception/error found in server log."
exit 1
fi

View File

@ -2,9 +2,9 @@ name: Unit Tests
on:
push:
branches: [ main, master ]
branches: [ main, master, release/** ]
pull_request:
branches: [ main, master ]
branches: [ main, master, release/** ]
jobs:
test:

View File

@ -0,0 +1,59 @@
name: "CI: Update CI Container"
on:
release:
types: [published]
workflow_dispatch:
inputs:
version:
description: 'ComfyUI version (e.g., v0.7.0)'
required: true
type: string
jobs:
update-ci-container:
runs-on: ubuntu-latest
# Skip pre-releases unless manually triggered
if: github.event_name == 'workflow_dispatch' || !github.event.release.prerelease
steps:
- name: Get version
id: version
run: |
if [ "${{ github.event_name }}" = "release" ]; then
VERSION="${{ github.event.release.tag_name }}"
else
VERSION="${{ inputs.version }}"
fi
echo "version=$VERSION" >> $GITHUB_OUTPUT
- name: Checkout comfyui-ci-container
uses: actions/checkout@v4
with:
repository: comfy-org/comfyui-ci-container
token: ${{ secrets.CI_CONTAINER_PAT }}
- name: Check current version
id: current
run: |
CURRENT=$(grep -oP 'ARG COMFYUI_VERSION=\K.*' Dockerfile || echo "unknown")
echo "current_version=$CURRENT" >> $GITHUB_OUTPUT
- name: Update Dockerfile
run: |
VERSION="${{ steps.version.outputs.version }}"
sed -i "s/^ARG COMFYUI_VERSION=.*/ARG COMFYUI_VERSION=${VERSION}/" Dockerfile
- name: Create Pull Request
id: create-pr
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.CI_CONTAINER_PAT }}
branch: automation/comfyui-${{ steps.version.outputs.version }}
title: "chore: bump ComfyUI to ${{ steps.version.outputs.version }}"
body: |
Updates ComfyUI version from `${{ steps.current.outputs.current_version }}` to `${{ steps.version.outputs.version }}`
**Triggered by:** ${{ github.event_name == 'release' && format('[Release {0}]({1})', github.event.release.tag_name, github.event.release.html_url) || 'Manual workflow dispatch' }}
labels: automation
commit-message: "chore: bump ComfyUI to ${{ steps.version.outputs.version }}"

View File

@ -6,6 +6,7 @@ on:
- "pyproject.toml"
branches:
- master
- release/**
jobs:
update-version:

View File

@ -17,7 +17,7 @@ on:
description: 'cuda version'
required: true
type: string
default: "129"
default: "130"
python_minor:
description: 'python minor version'
@ -29,7 +29,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "6"
default: "9"
# push:
# branches:
# - master
@ -56,7 +56,8 @@ jobs:
..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
pause" > update_comfyui_and_python_dependencies.bat
python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
grep -v comfyui requirements.txt > requirements_nocomfyui.txt
python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir
python -m pip install --no-cache-dir ./temp_wheel_dir/*
echo installed basic
ls -lah temp_wheel_dir

View File

@ -0,0 +1,64 @@
name: "Windows Release dependencies Manual"
on:
workflow_dispatch:
inputs:
torch_dependencies:
description: 'torch dependencies'
required: false
type: string
default: "torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128"
cache_tag:
description: 'Cached dependencies tag'
required: true
type: string
default: "cu128"
python_minor:
description: 'python minor version'
required: true
type: string
default: "12"
python_patch:
description: 'python patch version'
required: true
type: string
default: "10"
jobs:
build_dependencies:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }}
- shell: bash
run: |
echo "@echo off
call update_comfyui.bat nopause
echo -
echo This will try to update pytorch and all python dependencies.
echo -
echo If you just want to update normally, close this and run update_comfyui.bat instead.
echo -
pause
..\python_embeded\python.exe -s -m pip install --upgrade ${{ inputs.torch_dependencies }} -r ../ComfyUI/requirements.txt pygit2
pause" > update_comfyui_and_python_dependencies.bat
grep -v comfyui requirements.txt > requirements_nocomfyui.txt
python -m pip wheel --no-cache-dir ${{ inputs.torch_dependencies }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir
python -m pip install --no-cache-dir ./temp_wheel_dir/*
echo installed basic
ls -lah temp_wheel_dir
mv temp_wheel_dir ${{ inputs.cache_tag }}_python_deps
tar cf ${{ inputs.cache_tag }}_python_deps.tar ${{ inputs.cache_tag }}_python_deps
- uses: actions/cache/save@v4
with:
path: |
${{ inputs.cache_tag }}_python_deps.tar
update_comfyui_and_python_dependencies.bat
key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }}

View File

@ -68,7 +68,7 @@ jobs:
mkdir update
cp -r ComfyUI/.ci/update_windows/* ./update/
cp -r ComfyUI/.ci/windows_base_files/* ./
cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./
cp -r ComfyUI/.ci/windows_nightly_base_files/* ./
echo "call update_comfyui.bat nopause

View File

@ -81,7 +81,7 @@ jobs:
mkdir update
cp -r ComfyUI/.ci/update_windows/* ./update/
cp -r ComfyUI/.ci/windows_base_files/* ./
cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./
cp ../update_comfyui_and_python_dependencies.bat ./update/
cd ..

1
.gitignore vendored
View File

@ -21,6 +21,7 @@ venv/
*.log
web_custom_versions/
.DS_Store
*:Zone.Identifier
openapi.yaml
filtered-openapi.yaml
uv.lock

View File

@ -1,3 +1,2 @@
# Admins
* @comfyanonymous
* @kosinkadink
* @comfyanonymous @kosinkadink @guill

139
PLAN.md Normal file
View File

@ -0,0 +1,139 @@
# Plan: Align Local Asset/Tag Endpoints with Cloud
## Endpoint Comparison
| Endpoint | Cloud (openapi.yaml) | Local (routes.py) |
|----------|---------------------|-------------------|
| `GET /api/assets` | ✅ + `include_public` param | ✅ |
| `POST /api/assets` | ✅ multipart + JSON URL upload | ✅ multipart only |
| `GET /api/assets/{id}` | ✅ | ✅ |
| `PUT /api/assets/{id}` | ✅ (`name`, `mime_type`, `preview_id`, `user_metadata`) | ✅ (`name`, `tags`, `user_metadata`) |
| `DELETE /api/assets/{id}` | ✅ | ✅ |
| `GET /api/assets/{id}/content` | ❌ | ✅ |
| `POST /api/assets/{id}/tags` | ✅ | ✅ |
| `DELETE /api/assets/{id}/tags` | ✅ | ✅ |
| `PUT /api/assets/{id}/preview` | ❌ | ✅ |
| `POST /api/assets/from-hash` | ✅ | ✅ |
| `HEAD /api/assets/hash/{hash}` | ✅ | ✅ |
| `GET /api/assets/remote-metadata` | ✅ | ❌ |
| `POST /api/assets/download` | ✅ (background download) | ❌ |
| `GET /api/assets/tags/refine` | ✅ (tag histogram) | ❌ |
| `GET /api/tags` | ✅ + `include_public` param | ✅ |
| `POST /api/assets/scan/seed` | ❌ | ✅ (local only) |
---
## Phase 1: Add Missing Cloud Endpoints to Local
### 1.1 `GET /api/assets/remote-metadata` *(deferred)*
Fetch metadata from remote URLs (CivitAI, HuggingFace) without downloading the file.
**Status:** Not supported yet. Add stub/placeholder that returns 501 Not Implemented.
**Parameters:**
- `url` (required): Download URL to retrieve metadata from
**Returns:** Asset metadata (name, size, hash if available, etc.)
### 1.2 `POST /api/assets/download` *(deferred)*
Initiate background download job for large files from HuggingFace or CivitAI.
**Status:** Not supported yet. Add stub/placeholder that returns 501 Not Implemented.
**Request body:**
- `source_url` (required): URL to download from
- `tags`: Optional tags for the asset
- `user_metadata`: Optional metadata
- `preview_id`: Optional preview asset ID
**Returns:**
- 200 if file already exists (returns asset immediately)
- 202 with `task_id` for background download tracking via `GET /api/tasks/{task_id}`
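For the two deferred endpoints above, a minimal 501 stub might look like the sketch below (handler names are hypothetical; the error envelope follows the `_error_response` shape used in routes.py):
```python
from aiohttp import web

ROUTES = web.RouteTableDef()

@ROUTES.get("/api/assets/remote-metadata")
async def get_remote_metadata(request: web.Request) -> web.Response:
    # Deferred: remote metadata lookup is not implemented locally yet.
    return web.json_response(
        {"error": {"code": "not_implemented",
                   "message": "remote-metadata is not supported yet",
                   "details": {}}},
        status=501,
    )

@ROUTES.post("/api/assets/download")
async def start_download(request: web.Request) -> web.Response:
    # Deferred: background downloads are not implemented locally yet.
    return web.json_response(
        {"error": {"code": "not_implemented",
                   "message": "background download is not supported yet",
                   "details": {}}},
        status=501,
    )
```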
### 1.3 `GET /api/assets/tags/refine`
Get tag histogram for filtered assets (useful for search refinement UI).
**Parameters:**
- `include_tags`: Filter assets with ALL these tags
- `exclude_tags`: Exclude assets with ANY of these tags
- `name_contains`: Filter by name substring
- `metadata_filter`: JSON filter for metadata fields
- `limit`: Max tags to return (default 100)
- `include_public`: Include public/shared assets
**Returns:** List of tags with counts for matching assets
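As a rough in-memory illustration of the histogram this endpoint returns (the real implementation would run as a database query):
```python
from collections import Counter

def refine_tags(assets, include_tags=(), exclude_tags=(), limit=100):
    # Keep assets that carry ALL include_tags and NONE of exclude_tags,
    # then count how often each tag appears among the survivors.
    matching = [a for a in assets
                if set(include_tags) <= set(a["tags"])
                and not (set(exclude_tags) & set(a["tags"]))]
    counts = Counter(tag for a in matching for tag in a["tags"])
    return counts.most_common(limit)

assets = [{"tags": ["models", "checkpoints"]}, {"tags": ["models", "loras"]}]
print(refine_tags(assets, include_tags=["models"]))
# [('models', 2), ('checkpoints', 1), ('loras', 1)]
```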
---
## Phase 2: Update Existing Endpoints for Parity
### 2.1 `GET /api/assets`
- Add `include_public` query parameter (boolean, default true)
### 2.2 `POST /api/assets`
- Add JSON body upload path for URL-based uploads:
```json
{
"url": "https://...",
"name": "model.safetensors",
"tags": ["models", "checkpoints"],
"user_metadata": {},
"preview_id": "uuid"
}
```
- Keep existing multipart upload support
### 2.3 `PUT /api/assets/{id}`
- Add `mime_type` field support
- Add `preview_id` field support
- Remove direct `tags` field (recommend using dedicated `POST/DELETE /api/assets/{id}/tags` endpoints instead)
### 2.4 `GET /api/tags`
- Add `include_public` query parameter (boolean, default true)
---
## Phase 3: Local-Only Endpoints
These endpoints exist locally but not in cloud.
### 3.1 `GET /api/assets/{id}/content`
Download asset file content. Cloud uses signed URLs instead. **Keep for local.**
### 3.2 `PUT /api/assets/{id}/preview`
**Remove this endpoint.** Merge functionality into `PUT /api/assets/{id}` by adding `preview_id` field support (aligns with cloud).
### 3.3 `POST /api/assets/scan/seed`
Filesystem seeding/scanning for local asset discovery. Not applicable to cloud. **Keep as local-only.**
---
## Phase 4: Testing
Add tests for all new and modified endpoints to ensure functionality matches cloud behavior.
### 4.1 New Endpoint Tests
- `GET /api/assets/remote-metadata`: Test with valid/invalid URLs, various sources (CivitAI, HuggingFace)
- `POST /api/assets/download`: Test background download initiation, existing file detection, task tracking
- `GET /api/assets/tags/refine`: Test histogram generation with various filter combinations
### 4.2 Updated Endpoint Tests
- `GET /api/assets`: Test `include_public` param filtering
- `POST /api/assets`: Test JSON URL upload path alongside existing multipart tests
- `PUT /api/assets/{id}`: Test `mime_type` and `preview_id` field updates
- `GET /api/tags`: Test `include_public` param filtering
### 4.3 Removed Endpoint Tests
- Remove tests for `PUT /api/assets/{id}/preview`
- Add tests for `preview_id` in `PUT /api/assets/{id}` to cover the merged functionality
---
## Implementation Order
1. Phase 2.1, 2.4: Add `include_public` params (low effort, high compatibility)
2. Phase 2.3: Update PUT endpoint fields + remove preview endpoint
3. Phase 2.2: Add JSON URL upload to POST
4. Phase 1.3: Add tags/refine endpoint
5. Phase 1.1, 1.2: Add stub endpoints returning 501 (deferred implementation)
6. Phase 4: Add tests for each phase as implemented

168
QUANTIZATION.md Normal file
View File

@ -0,0 +1,168 @@
# The Comfy guide to Quantization
## How does quantization work?
Quantization aims to map a high-precision value x_f to a lower-precision format with minimal loss in accuracy. These smaller formats reduce the model's memory footprint and increase throughput by using specialized hardware.
When simply converting a value from FP16 to FP8 using round-to-nearest, we might hit two issues:
- The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values
- The original values are concentrated in a small range (e.g. -1, 1), leaving many FP8 bits "unused"
By using a scaling factor, we aim to map these values into the quantized dtype's range, making use of the full spectrum. One of the easiest and most common approaches is per-tensor absolute-maximum scaling.
```
absmax = max(abs(tensor))
scale = absmax / max_dynamic_range_low_precision
# Quantization
tensor_q = (tensor / scale).to(low_precision_dtype)
# De-Quantization
tensor_dq = tensor_q.to(fp16) * scale
tensor_dq ~ tensor
```
Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes.
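A runnable version of the absmax scheme above, assuming a PyTorch build with `float8_e4m3fn` support:
```python
import torch

def absmax_quantize_fp8(t: torch.Tensor):
    # Per-tensor absolute-maximum scaling into float8_e4m3fn (range ~±448).
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = t.abs().max() / fp8_max
    return (t / scale).to(torch.float8_e4m3fn), scale

def dequantize(t_q: torch.Tensor, scale: torch.Tensor, dtype=torch.float16):
    return t_q.to(dtype) * scale

x = torch.randn(4, 4, dtype=torch.float16) * 3
x_q, scale = absmax_quantize_fp8(x)
x_dq = dequantize(x_q, scale)
print((x - x_dq).abs().max())  # small quantization error
```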
## Quantization in Comfy
```
QuantizedTensor (torch.Tensor subclass)
↓ __torch_dispatch__
Two-Level Registry (generic + layout handlers)
MixedPrecisionOps + Metadata Detection
```
### Representation
To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor: the `QuantizedTensor` class found in `comfy/quant_ops.py`.
A `Layout` class defines how a specific quantization format behaves:
- Required parameters
- Quantize method
- De-Quantize method
```python
from comfy.quant_ops import QuantizedLayout
class MyLayout(QuantizedLayout):
@classmethod
def quantize(cls, tensor, **kwargs):
# Convert to quantized format
qdata = ...
params = {'scale': ..., 'orig_dtype': tensor.dtype}
return qdata, params
@staticmethod
def dequantize(qdata, scale, orig_dtype, **kwargs):
return qdata.to(orig_dtype) * scale
```
To run operations on these QuantizedTensors, we use two registry systems to define the supported operations.
The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`).
The second registry is **layout-specific** and allows implementing fast paths such as nn.Linear.
```python
from comfy.quant_ops import register_layout_op
@register_layout_op(torch.ops.aten.linear.default, MyLayout)
def my_linear(func, args, kwargs):
# Extract tensors, call optimized kernel
...
```
When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation.
For any unsupported operation, QuantizedTensor will fallback to call `dequantize` and dispatch using the high-precision implementation.
### Mixed Precision
The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how.
**Architecture:**
```python
class MixedPrecisionOps(disable_weight_init):
_layer_quant_config = {} # Maps layer names to quantization configs
_compute_dtype = torch.bfloat16 # Default compute / dequantize precision
```
**Key mechanism:**
The custom `Linear._load_from_state_dict()` method inspects each layer during model loading:
- If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype`
- If the layer name **is** in `_layer_quant_config`:
- Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`)
- Load associated quantization parameters (scales, block_size, etc.)
**Why it's needed:**
Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality.
The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode.
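A toy sketch of the per-layer branch described above (names and config shape are hypothetical; the real logic lives in `MixedPrecisionOps.Linear._load_from_state_dict`):
```python
import torch

def load_weight(layer_name: str, weight: torch.Tensor,
                layer_quant_config: dict, compute_dtype=torch.bfloat16):
    cfg = layer_quant_config.get(layer_name)
    if cfg is None:
        # Layer not listed: keep a regular tensor in the compute dtype.
        return weight.to(compute_dtype)
    # Layer listed: quantize with the configured format (plain absmax FP8
    # here as a stand-in for the layout named in the config).
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = weight.abs().max() / fp8_max
    return (weight / scale).to(torch.float8_e4m3fn), scale

w = torch.randn(8, 8)
config = {"model.layers.0.mlp.up_proj": {"format": "float8_e4m3fn"}}
print(load_weight("model.layers.0.mlp.up_proj", w, config)[0].dtype)  # float8_e4m3fn
print(load_weight("final_layer.linear", w, config).dtype)             # bfloat16
```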
## Checkpoint Format
Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme.
The quantized checkpoint will contain the same layers as the original checkpoint but:
- The weights are stored as quantized values, sometimes using a different storage datatype (e.g. a uint8 container for FP8).
- For each quantized weight, a number of additional scaling parameters are stored alongside it, depending on the recipe.
- We store the `_quantization_metadata` JSON in the metadata of the final safetensors file, describing which layers are quantized and which layout was used.
### Scaling Parameters details
We define 4 possible scaling parameters that should cover most recipes in the near future:
- **weight_scale**: quantization scalers for the weights
- **weight_scale_2**: global scalers in the context of double scaling
- **pre_quant_scale**: scalers used for smoothing salient weights
- **input_scale**: quantization scalers for the activations
| Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale |
|--------|---------------|--------------|----------------|-----------------|-------------|
| float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) |
You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS).
### Quantization Metadata
The metadata stored alongside the checkpoint contains:
- **format_version**: String to define a version of the standard
- **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`.
Example:
```json
{
"_quantization_metadata": {
"format_version": "1.0",
"layers": {
"model.layers.0.mlp.up_proj": "float8_e4m3fn",
"model.layers.0.mlp.down_proj": "float8_e4m3fn",
"model.layers.1.mlp.up_proj": "float8_e4m3fn"
}
}
}
```
## Creating Quantized Checkpoints
To create compatible checkpoints, use any quantization tool, provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`.
### Weight Quantization
Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter.
### Calibration (for Activation Quantization)
Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we use **post-training calibration (PTQ)**:
1. **Collect statistics**: Run inference on N representative samples
2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer
3. **Compute scales**: Derive `input_scale` from collected statistics
4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
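A hedged sketch of the amax-collection step using forward pre-hooks (module and variable names are illustrative; the scale formula follows the absmax scheme described earlier):
```python
import torch

def collect_input_scales(model: torch.nn.Module, calib_batches, layer_names):
    # Steps 1-2: track the absolute maximum of the inputs to each quantized layer.
    amax = {name: torch.tensor(0.0) for name in layer_names}

    def make_hook(name):
        def hook(_module, inputs):
            amax[name] = torch.maximum(amax[name], inputs[0].detach().abs().max().cpu())
        return hook

    modules = dict(model.named_modules())
    handles = [modules[n].register_forward_pre_hook(make_hook(n)) for n in layer_names]
    with torch.no_grad():
        for batch in calib_batches:      # representative samples
            model(batch)
    for h in handles:
        h.remove()

    # Step 3: derive input_scale from the collected statistics (absmax scheme).
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    return {name: a / fp8_max for name, a in amax.items()}
```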

View File

@ -67,6 +67,8 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
- [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
- [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
- [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
- [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
- [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
- Image Editing Models
- [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
- [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@ -79,6 +81,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
- [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
- [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
- [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
- [Hunyuan Video 1.5](https://docs.comfy.org/tutorials/video/hunyuan/hunyuan-video-1-5)
- Audio Models
- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
@ -105,17 +108,21 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Works fully offline: core will never download anything unless you want to.
- Optional API nodes to use paid models from external providers through the online [Comfy API](https://docs.comfy.org/tutorials/api-nodes/overview).
- Optional API nodes to use paid models from external providers through the online [Comfy API](https://docs.comfy.org/tutorials/api-nodes/overview); disable with `--disable-api-nodes`
- [Config file](extra_model_paths.yaml.example) to set the search paths for models.
Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)
## Release Process
ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
- Releases a new stable version (e.g., v0.7.0)
- Releases a new stable version (e.g., v0.7.0) roughly every week.
- Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
- Minor versions will be used for releases off the master branch.
- Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
- Commits outside of the stable release tags may be very unstable and break many custom nodes.
- Serves as the foundation for the desktop release
2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
@ -172,10 +179,20 @@ There is a portable standalone build for Windows that should work for running on
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
Simply download, extract with [7-Zip](https://7-zip.org) (or with Windows Explorer on recent Windows versions) and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints, but many of the larger models have multiple files. Make sure to follow the instructions to know which subfolder of ComfyUI\models\ to put them in.
If you have trouble extracting it, right click the file -> properties -> unblock
The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.
#### Alternative Downloads:
[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z).
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
#### How do I share models between another UI and ComfyUI?
See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
@ -191,7 +208,13 @@ comfy install
## Manual Install (Windows, Linux)
Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12
Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies.
Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12
torch 2.4 and above is supported but some features and optimizations might only work on newer versions. We generally recommend using the latest major version of pytorch with the latest cuda version unless it is less than 2 weeks old.
### Instructions:
Git clone this repo.
@ -200,18 +223,36 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
Put your VAE in: models/vae
### AMD GPUs (Linux only)
### AMD GPUs (Linux)
AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4```
This is the command to install the nightly with ROCm 6.4 which might have some performance improvements:
This is the command to install the nightly with ROCm 7.1 which might have some performance improvements:
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4```
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1```
### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only.
These have less hardware support than the builds above but they work on windows. You also need to install the pytorch version specific to your hardware.
RDNA 3 (RX 7000 series):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/```
RDNA 3.5 (Strix halo/Ryzen AI Max+ 365):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx1151/```
RDNA 4 (RX 9000 series):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all/```
### Intel GPUs (Windows and Linux)
(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
1. To install PyTorch xpu, use the following command:
@ -221,19 +262,15 @@ This is the command to install the Pytorch xpu nightly which might have some per
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```
(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.
1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
### NVIDIA
Nvidia users should install stable pytorch using this command:
```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129```
```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130```
This is the command to install pytorch nightly instead which might have performance improvements.
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129```
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130```
#### Troubleshooting
@ -264,12 +301,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
> **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux).
#### DirectML (AMD Cards on Windows)
This is very badly supported and is not recommended. There are some unofficial builds of pytorch ROCm on windows that exist that will give you a much better experience than this. This readme will be updated once official pytorch ROCm builds for windows come out.
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
#### Ascend NPUs
For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
@ -294,6 +325,32 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step
1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
2. Launch ComfyUI by running `python main.py`
## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
### Setup
1. Install the manager dependencies:
```bash
pip install -r manager_requirements.txt
```
2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
```bash
python main.py --enable-manager
```
### Command Line Options
| Flag | Description |
|------|-------------|
| `--enable-manager` | Enable ComfyUI-Manager |
| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
# Running
```python main.py```

View File

@ -3,7 +3,7 @@
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = app/alembic_db
script_location = alembic_db
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time

View File

@ -2,12 +2,13 @@ from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
from app.assets.database.models import Base
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
from app.database.models import Base
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,

View File

@ -1,13 +1,12 @@
"""initial assets schema
"""
Initial assets schema
Revision ID: 0001_assets
Revises:
Create Date: 2025-08-20 00:00:00
Revises: None
Create Date: 2025-12-10 00:00:00
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
revision = "0001_assets"
down_revision = None
@ -94,7 +93,7 @@ def upgrade() -> None:
sa.Column("val_str", sa.String(length=2048), nullable=True),
sa.Column("val_num", sa.Numeric(38, 10), nullable=True),
sa.Column("val_bool", sa.Boolean(), nullable=True),
sa.Column("val_json", sa.JSON().with_variant(postgresql.JSONB(), 'postgresql'), nullable=True),
sa.Column("val_json", sa.JSON(), nullable=True),
sa.PrimaryKeyConstraint("asset_info_id", "key", "ordinal", name="pk_asset_info_meta"),
)
op.create_index("ix_asset_info_meta_key", "asset_info_meta", ["key"])

View File

@ -58,8 +58,13 @@ class InternalRoutes:
return web.json_response({"error": "Invalid directory type"}, status=400)
directory = get_directory_by_type(directory_type)
def is_visible_file(entry: os.DirEntry) -> bool:
"""Filter out hidden files (e.g., .DS_Store on macOS)."""
return entry.is_file() and not entry.name.startswith('.')
sorted_files = sorted(
(entry for entry in os.scandir(directory) if entry.is_file()),
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
key=lambda entry: -entry.stat().st_mtime
)
return web.json_response([entry.name for entry in sorted_files], status=200)

View File

@ -1,4 +0,0 @@
from .api.routes import register_assets_system
from .scanner import sync_seed_assets
__all__ = ["sync_seed_assets", "register_assets_system"]

View File

@ -1,26 +1,38 @@
import contextlib
import logging
import os
import urllib.parse
import uuid
from typing import Optional
import urllib.parse
import os
import contextlib
from aiohttp import web
from pydantic import ValidationError
import app.assets.manager as manager
import app.assets.scanner as scanner
from app import user_manager
from app.assets.api import schemas_in
from app.assets.helpers import get_query_dict
import folder_paths
from ... import user_manager
from .. import manager, scanner
from . import schemas_in, schemas_out
ROUTES = web.RouteTableDef()
USER_MANAGER: Optional[user_manager.UserManager] = None
LOGGER = logging.getLogger(__name__)
USER_MANAGER: user_manager.UserManager | None = None
# UUID regex (canonical hyphenated form, case-insensitive)
UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
def register_assets_system(app: web.Application, user_manager_instance: user_manager.UserManager) -> None:
global USER_MANAGER
USER_MANAGER = user_manager_instance
app.add_routes(ROUTES)
def _error_response(status: int, code: str, message: str, details: dict | None = None) -> web.Response:
return web.json_response({"error": {"code": code, "message": message, "details": details or {}}}, status=status)
def _validation_error_response(code: str, ve: ValidationError) -> web.Response:
return _error_response(400, code, "Validation failed.", {"errors": ve.json()})
@ROUTES.head("/api/assets/hash/{hash}")
async def head_asset_by_hash(request: web.Request) -> web.Response:
@ -30,29 +42,22 @@ async def head_asset_by_hash(request: web.Request) -> web.Response:
algo, digest = hash_str.split(":", 1)
if algo != "blake3" or not digest or any(c for c in digest if c not in "0123456789abcdef"):
return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:<hex>'")
exists = await manager.asset_exists(asset_hash=hash_str)
exists = manager.asset_exists(asset_hash=hash_str)
return web.Response(status=200 if exists else 404)
@ROUTES.get("/api/assets")
async def list_assets(request: web.Request) -> web.Response:
qp = request.rel_url.query
query_dict = {}
if "include_tags" in qp:
query_dict["include_tags"] = qp.getall("include_tags")
if "exclude_tags" in qp:
query_dict["exclude_tags"] = qp.getall("exclude_tags")
for k in ("name_contains", "metadata_filter", "limit", "offset", "sort", "order"):
v = qp.get(k)
if v is not None:
query_dict[k] = v
"""
GET request to list assets.
"""
query_dict = get_query_dict(request)
try:
q = schemas_in.ListAssetsQuery.model_validate(query_dict)
except ValidationError as ve:
return _validation_error_response("INVALID_QUERY", ve)
payload = await manager.list_assets(
payload = manager.list_assets(
include_tags=q.include_tags,
exclude_tags=q.exclude_tags,
name_contains=q.name_contains,
@ -66,14 +71,38 @@ async def list_assets(request: web.Request) -> web.Response:
return web.json_response(payload.model_dump(mode="json"))
@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}")
async def get_asset(request: web.Request) -> web.Response:
"""
GET request to get an asset's info as JSON.
"""
asset_info_id = str(uuid.UUID(request.match_info["id"]))
try:
result = manager.get_asset(
asset_info_id=asset_info_id,
owner_id=USER_MANAGER.get_request_user_id(request),
)
except ValueError as e:
return _error_response(404, "ASSET_NOT_FOUND", str(e), {"id": asset_info_id})
except Exception:
logging.exception(
"get_asset failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
)
return _error_response(500, "INTERNAL", "Unexpected server error.")
return web.json_response(result.model_dump(mode="json"), status=200)
@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}/content")
async def download_asset_content(request: web.Request) -> web.Response:
# question: do we need disposition? could we just stick with one of these?
disposition = request.query.get("disposition", "attachment").lower().strip()
if disposition not in {"inline", "attachment"}:
disposition = "attachment"
try:
abs_path, content_type, filename = await manager.resolve_asset_content_for_download(
abs_path, content_type, filename = manager.resolve_asset_content_for_download(
asset_info_id=str(uuid.UUID(request.match_info["id"])),
owner_id=USER_MANAGER.get_request_user_id(request),
)
@ -103,7 +132,7 @@ async def create_asset_from_hash(request: web.Request) -> web.Response:
except Exception:
return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
result = await manager.create_asset_from_hash(
result = manager.create_asset_from_hash(
hash_str=body.hash,
name=body.name,
tags=body.tags,
@ -125,15 +154,15 @@ async def upload_asset(request: web.Request) -> web.Response:
reader = await request.multipart()
file_present = False
file_client_name: Optional[str] = None
file_client_name: str | None = None
tags_raw: list[str] = []
provided_name: Optional[str] = None
user_metadata_raw: Optional[str] = None
provided_hash: Optional[str] = None
provided_hash_exists: Optional[bool] = None
provided_name: str | None = None
user_metadata_raw: str | None = None
provided_hash: str | None = None
provided_hash_exists: bool | None = None
file_written = 0
tmp_path: Optional[str] = None
tmp_path: str | None = None
while True:
field = await reader.next()
if field is None:
@ -155,7 +184,7 @@ async def upload_asset(request: web.Request) -> web.Response:
return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:<hex>'")
provided_hash = f"{algo}:{digest}"
try:
provided_hash_exists = await manager.asset_exists(asset_hash=provided_hash)
provided_hash_exists = manager.asset_exists(asset_hash=provided_hash)
except Exception:
provided_hash_exists = None # do not fail the whole request here
@ -242,7 +271,7 @@ async def upload_asset(request: web.Request) -> web.Response:
# Fast path: if a valid provided hash exists, create AssetInfo without writing anything
if spec.hash and provided_hash_exists is True:
try:
result = await manager.create_asset_from_hash(
result = manager.create_asset_from_hash(
hash_str=spec.hash,
name=spec.name or (spec.hash.split(":", 1)[1]),
tags=spec.tags,
@ -250,7 +279,7 @@ async def upload_asset(request: web.Request) -> web.Response:
owner_id=owner_id,
)
except Exception:
LOGGER.exception("create_asset_from_hash failed for hash=%s, owner_id=%s", spec.hash, owner_id)
logging.exception("create_asset_from_hash failed for hash=%s, owner_id=%s", spec.hash, owner_id)
return _error_response(500, "INTERNAL", "Unexpected server error.")
if result is None:
@ -270,7 +299,7 @@ async def upload_asset(request: web.Request) -> web.Response:
return _error_response(404, "ASSET_NOT_FOUND", "Provided hash not found and no file uploaded.")
try:
created = await manager.upload_asset_from_temp_path(
created = manager.upload_asset_from_temp_path(
spec,
temp_path=tmp_path,
client_filename=file_client_name,
@ -293,30 +322,10 @@ async def upload_asset(request: web.Request) -> web.Response:
except Exception:
if tmp_path and os.path.exists(tmp_path):
os.remove(tmp_path)
LOGGER.exception("upload_asset_from_temp_path failed for tmp_path=%s, owner_id=%s", tmp_path, owner_id)
logging.exception("upload_asset_from_temp_path failed for tmp_path=%s, owner_id=%s", tmp_path, owner_id)
return _error_response(500, "INTERNAL", "Unexpected server error.")
@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}")
async def get_asset(request: web.Request) -> web.Response:
asset_info_id = str(uuid.UUID(request.match_info["id"]))
try:
result = await manager.get_asset(
asset_info_id=asset_info_id,
owner_id=USER_MANAGER.get_request_user_id(request),
)
except ValueError as ve:
return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
except Exception:
LOGGER.exception(
"get_asset failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
)
return _error_response(500, "INTERNAL", "Unexpected server error.")
return web.json_response(result.model_dump(mode="json"), status=200)
@ROUTES.put(f"/api/assets/{{id:{UUID_RE}}}")
async def update_asset(request: web.Request) -> web.Response:
asset_info_id = str(uuid.UUID(request.match_info["id"]))
@ -328,7 +337,7 @@ async def update_asset(request: web.Request) -> web.Response:
return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
try:
result = await manager.update_asset(
result = manager.update_asset(
asset_info_id=asset_info_id,
name=body.name,
tags=body.tags,
@ -338,7 +347,7 @@ async def update_asset(request: web.Request) -> web.Response:
except (ValueError, PermissionError) as ve:
return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
except Exception:
LOGGER.exception(
logging.exception(
"update_asset failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
@ -358,7 +367,7 @@ async def set_asset_preview(request: web.Request) -> web.Response:
return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
try:
result = await manager.set_asset_preview(
result = manager.set_asset_preview(
asset_info_id=asset_info_id,
preview_asset_id=body.preview_id,
owner_id=USER_MANAGER.get_request_user_id(request),
@ -366,7 +375,7 @@ async def set_asset_preview(request: web.Request) -> web.Response:
except (PermissionError, ValueError) as ve:
return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
except Exception:
LOGGER.exception(
logging.exception(
"set_asset_preview failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
@ -382,13 +391,13 @@ async def delete_asset(request: web.Request) -> web.Response:
delete_content = True if delete_content is None else delete_content.lower() not in {"0", "false", "no"}
try:
deleted = await manager.delete_asset_reference(
deleted = manager.delete_asset_reference(
asset_info_id=asset_info_id,
owner_id=USER_MANAGER.get_request_user_id(request),
delete_content_if_orphan=delete_content,
)
except Exception:
LOGGER.exception(
logging.exception(
"delete_asset_reference failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
@ -402,17 +411,20 @@ async def delete_asset(request: web.Request) -> web.Response:
@ROUTES.get("/api/tags")
async def get_tags(request: web.Request) -> web.Response:
"""
GET request to list all tags based on query parameters.
"""
query_map = dict(request.rel_url.query)
try:
query = schemas_in.TagsListQuery.model_validate(query_map)
except ValidationError as ve:
except ValidationError as e:
return web.json_response(
{"error": {"code": "INVALID_QUERY", "message": "Invalid query parameters", "details": ve.errors()}},
{"error": {"code": "INVALID_QUERY", "message": "Invalid query parameters", "details": e.errors()}},
status=400,
)
result = await manager.list_tags(
result = manager.list_tags(
prefix=query.prefix,
limit=query.limit,
offset=query.offset,
@ -422,7 +434,6 @@ async def get_tags(request: web.Request) -> web.Response:
)
return web.json_response(result.model_dump(mode="json"))
@ROUTES.post(f"/api/assets/{{id:{UUID_RE}}}/tags")
async def add_asset_tags(request: web.Request) -> web.Response:
asset_info_id = str(uuid.UUID(request.match_info["id"]))
@ -435,7 +446,7 @@ async def add_asset_tags(request: web.Request) -> web.Response:
return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
try:
result = await manager.add_tags_to_asset(
result = manager.add_tags_to_asset(
asset_info_id=asset_info_id,
tags=data.tags,
origin="manual",
@ -444,7 +455,7 @@ async def add_asset_tags(request: web.Request) -> web.Response:
except (ValueError, PermissionError) as ve:
return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
except Exception:
LOGGER.exception(
logging.exception(
"add_tags_to_asset failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
@ -466,7 +477,7 @@ async def delete_asset_tags(request: web.Request) -> web.Response:
return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.")
try:
result = await manager.remove_tags_from_asset(
result = manager.remove_tags_from_asset(
asset_info_id=asset_info_id,
tags=data.tags,
owner_id=USER_MANAGER.get_request_user_id(request),
@ -474,7 +485,7 @@ async def delete_asset_tags(request: web.Request) -> web.Response:
except ValueError as ve:
return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id})
except Exception:
LOGGER.exception(
logging.exception(
"remove_tags_from_asset failed for asset_info_id=%s, owner_id=%s",
asset_info_id,
USER_MANAGER.get_request_user_id(request),
@ -497,48 +508,8 @@ async def seed_assets(request: web.Request) -> web.Response:
return _validation_error_response("INVALID_BODY", ve)
try:
await scanner.sync_seed_assets(body.roots)
scanner.seed_assets(body.roots)
except Exception:
LOGGER.exception("sync_seed_assets failed for roots=%s", body.roots)
logging.exception("seed_assets failed for roots=%s", body.roots)
return _error_response(500, "INTERNAL", "Unexpected server error.")
return web.json_response({"synced": True, "roots": body.roots}, status=200)
@ROUTES.post("/api/assets/scan/schedule")
async def schedule_asset_scan(request: web.Request) -> web.Response:
try:
payload = await request.json()
except Exception:
payload = {}
try:
body = schemas_in.ScheduleAssetScanBody.model_validate(payload)
except ValidationError as ve:
return _validation_error_response("INVALID_BODY", ve)
states = await scanner.schedule_scans(body.roots)
return web.json_response(states.model_dump(mode="json"), status=202)
@ROUTES.get("/api/assets/scan")
async def get_asset_scan_status(request: web.Request) -> web.Response:
root = request.query.get("root", "").strip().lower()
states = scanner.current_statuses()
if root in {"models", "input", "output"}:
states = [s for s in states.scans if s.root == root] # type: ignore
states = schemas_out.AssetScanStatusResponse(scans=states)
return web.json_response(states.model_dump(mode="json"), status=200)
def register_assets_system(app: web.Application, user_manager_instance: user_manager.UserManager) -> None:
global USER_MANAGER
USER_MANAGER = user_manager_instance
app.add_routes(ROUTES)
def _error_response(status: int, code: str, message: str, details: Optional[dict] = None) -> web.Response:
return web.json_response({"error": {"code": code, "message": message, "details": details or {}}}, status=status)
def _validation_error_response(code: str, ve: ValidationError) -> web.Response:
return _error_response(400, code, "Validation failed.", {"errors": ve.json()})

View File

@ -1,6 +1,6 @@
import json
import uuid
from typing import Any, Literal, Optional
from typing import Any, Literal
from pydantic import (
BaseModel,
@ -11,14 +11,15 @@ from pydantic import (
model_validator,
)
from app.assets.helpers import RootType
class ListAssetsQuery(BaseModel):
include_tags: list[str] = Field(default_factory=list)
exclude_tags: list[str] = Field(default_factory=list)
name_contains: Optional[str] = None
name_contains: str | None = None
# Accept either a JSON string (query param) or a dict
metadata_filter: Optional[dict[str, Any]] = None
metadata_filter: dict[str, Any] | None = None
limit: conint(ge=1, le=500) = 20
offset: conint(ge=0) = 0
@ -59,9 +60,9 @@ class ListAssetsQuery(BaseModel):
class UpdateAssetBody(BaseModel):
name: Optional[str] = None
tags: Optional[list[str]] = None
user_metadata: Optional[dict[str, Any]] = None
name: str | None = None
tags: list[str] | None = None
user_metadata: dict[str, Any] | None = None
@model_validator(mode="after")
def _at_least_one(self):
@ -116,7 +117,7 @@ class CreateFromHashBody(BaseModel):
class TagsListQuery(BaseModel):
model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)
prefix: Optional[str] = Field(None, min_length=1, max_length=256)
prefix: str | None = Field(None, min_length=1, max_length=256)
limit: int = Field(100, ge=1, le=1000)
offset: int = Field(0, ge=0, le=10_000_000)
order: Literal["count_desc", "name_asc"] = "count_desc"
@ -124,7 +125,7 @@ class TagsListQuery(BaseModel):
@field_validator("prefix")
@classmethod
def normalize_prefix(cls, v: Optional[str]) -> Optional[str]:
def normalize_prefix(cls, v: str | None) -> str | None:
if v is None:
return v
v = v.strip()
@ -158,14 +159,6 @@ class TagsRemove(TagsAdd):
pass
RootType = Literal["models", "input", "output"]
ALLOWED_ROOTS: tuple[RootType, ...] = ("models", "input", "output")
class ScheduleAssetScanBody(BaseModel):
roots: list[RootType] = Field(..., min_length=1)
class UploadAssetSpec(BaseModel):
"""Upload Asset operation.
- tags: ordered; first is root ('models'|'input'|'output');
@ -180,9 +173,9 @@ class UploadAssetSpec(BaseModel):
model_config = ConfigDict(extra="ignore", str_strip_whitespace=True)
tags: list[str] = Field(..., min_length=1)
name: Optional[str] = Field(default=None, max_length=512, description="Display Name")
name: str | None = Field(default=None, max_length=512, description="Display Name")
user_metadata: dict[str, Any] = Field(default_factory=dict)
hash: Optional[str] = Field(default=None)
hash: str | None = Field(default=None)
@field_validator("hash", mode="before")
@classmethod
@ -280,7 +273,7 @@ class UploadAssetSpec(BaseModel):
class SetPreviewBody(BaseModel):
"""Set or clear the preview for an AssetInfo. Provide an Asset.id or null."""
preview_id: Optional[str] = None
preview_id: str | None = None
@field_validator("preview_id", mode="before")
@classmethod
@ -295,3 +288,7 @@ class SetPreviewBody(BaseModel):
except Exception:
raise ValueError("preview_id must be a UUID")
return s
class ScheduleAssetScanBody(BaseModel):
roots: list[RootType] = Field(..., min_length=1)

View File

@ -1,5 +1,5 @@
from datetime import datetime
from typing import Any, Literal, Optional
from typing import Any
from pydantic import BaseModel, ConfigDict, Field, field_serializer
@ -7,19 +7,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_serializer
class AssetSummary(BaseModel):
id: str
name: str
asset_hash: Optional[str]
size: Optional[int] = None
mime_type: Optional[str] = None
asset_hash: str | None = None
size: int | None = None
mime_type: str | None = None
tags: list[str] = Field(default_factory=list)
preview_url: Optional[str] = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
last_access_time: Optional[datetime] = None
preview_url: str | None = None
created_at: datetime | None = None
updated_at: datetime | None = None
last_access_time: datetime | None = None
model_config = ConfigDict(from_attributes=True)
@field_serializer("created_at", "updated_at", "last_access_time")
def _ser_dt(self, v: Optional[datetime], _info):
def _ser_dt(self, v: datetime | None, _info):
return v.isoformat() if v else None
@ -32,34 +32,34 @@ class AssetsList(BaseModel):
class AssetUpdated(BaseModel):
id: str
name: str
asset_hash: Optional[str]
asset_hash: str | None = None
tags: list[str] = Field(default_factory=list)
user_metadata: dict[str, Any] = Field(default_factory=dict)
updated_at: Optional[datetime] = None
updated_at: datetime | None = None
model_config = ConfigDict(from_attributes=True)
@field_serializer("updated_at")
def _ser_updated(self, v: Optional[datetime], _info):
def _ser_updated(self, v: datetime | None, _info):
return v.isoformat() if v else None
class AssetDetail(BaseModel):
id: str
name: str
asset_hash: Optional[str]
size: Optional[int] = None
mime_type: Optional[str] = None
asset_hash: str | None = None
size: int | None = None
mime_type: str | None = None
tags: list[str] = Field(default_factory=list)
user_metadata: dict[str, Any] = Field(default_factory=dict)
preview_id: Optional[str] = None
created_at: Optional[datetime] = None
last_access_time: Optional[datetime] = None
preview_id: str | None = None
created_at: datetime | None = None
last_access_time: datetime | None = None
model_config = ConfigDict(from_attributes=True)
@field_serializer("created_at", "last_access_time")
def _ser_dt(self, v: Optional[datetime], _info):
def _ser_dt(self, v: datetime | None, _info):
return v.isoformat() if v else None
@ -91,25 +91,3 @@ class TagsRemove(BaseModel):
removed: list[str] = Field(default_factory=list)
not_present: list[str] = Field(default_factory=list)
total_tags: list[str] = Field(default_factory=list)
class AssetScanError(BaseModel):
path: str
message: str
at: Optional[str] = Field(None, description="ISO timestamp")
class AssetScanStatus(BaseModel):
scan_id: str
root: Literal["models", "input", "output"]
status: Literal["scheduled", "running", "completed", "failed", "cancelled"]
scheduled_at: Optional[str] = None
started_at: Optional[str] = None
finished_at: Optional[str] = None
discovered: int = 0
processed: int = 0
file_errors: list[AssetScanError] = Field(default_factory=list)
class AssetScanStatusResponse(BaseModel):
scans: list[AssetScanStatus] = Field(default_factory=list)

View File

@ -1,22 +1,34 @@
import os
import uuid
from typing import Iterable, Sequence
import sqlalchemy
from typing import Iterable
from sqlalchemy.orm import Session
from sqlalchemy.dialects import sqlite
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql as d_pg
from sqlalchemy.dialects import sqlite as d_sqlite
from sqlalchemy.ext.asyncio import AsyncSession
from ..models import Asset, AssetCacheState, AssetInfo, AssetInfoMeta, AssetInfoTag
from ..timeutil import utcnow
from app.assets.helpers import utcnow
from app.assets.database.models import Asset, AssetCacheState, AssetInfo, AssetInfoTag, AssetInfoMeta
MAX_BIND_PARAMS = 800
def _chunk_rows(rows: list[dict], cols_per_row: int, max_bind_params: int) -> Iterable[list[dict]]:
if not rows:
return []
rows_per_stmt = max(1, max_bind_params // max(1, cols_per_row))
for i in range(0, len(rows), rows_per_stmt):
yield rows[i:i + rows_per_stmt]
async def seed_from_paths_batch(
session: AsyncSession,
def _iter_chunks(seq, n: int):
for i in range(0, len(seq), n):
yield seq[i:i + n]
def _rows_per_stmt(cols: int) -> int:
return max(1, MAX_BIND_PARAMS // max(1, cols))
def seed_from_paths_batch(
session: Session,
*,
specs: Sequence[dict],
specs: list[dict],
owner_id: str = "",
) -> dict:
"""Each spec is a dict with keys:
@ -31,10 +43,6 @@ async def seed_from_paths_batch(
return {"inserted_infos": 0, "won_states": 0, "lost_states": 0}
now = utcnow()
dialect = session.bind.dialect.name
if dialect not in ("sqlite", "postgresql"):
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
asset_rows: list[dict] = []
state_rows: list[dict] = []
path_to_asset: dict[str, str] = {}
@ -79,55 +87,57 @@ async def seed_from_paths_batch(
}
# insert all seed Assets (hash=NULL)
ins_asset = d_sqlite.insert(Asset) if dialect == "sqlite" else d_pg.insert(Asset)
ins_asset = sqlite.insert(Asset)
for chunk in _iter_chunks(asset_rows, _rows_per_stmt(5)):
await session.execute(ins_asset, chunk)
session.execute(ins_asset, chunk)
# try to claim AssetCacheState (file_path)
winners_by_path: set[str] = set()
if dialect == "sqlite":
ins_state = (
d_sqlite.insert(AssetCacheState)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
.returning(AssetCacheState.file_path)
)
else:
ins_state = (
d_pg.insert(AssetCacheState)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
.returning(AssetCacheState.file_path)
)
# Insert with ON CONFLICT DO NOTHING, then query to find which paths were actually inserted
ins_state = (
sqlite.insert(AssetCacheState)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
)
for chunk in _iter_chunks(state_rows, _rows_per_stmt(3)):
winners_by_path.update((await session.execute(ins_state, chunk)).scalars().all())
session.execute(ins_state, chunk)
# Query to find which of our paths won (were actually inserted)
winners_by_path: set[str] = set()
for chunk in _iter_chunks(path_list, MAX_BIND_PARAMS):
result = session.execute(
sqlalchemy.select(AssetCacheState.file_path)
.where(AssetCacheState.file_path.in_(chunk))
.where(AssetCacheState.asset_id.in_([path_to_asset[p] for p in chunk]))
)
winners_by_path.update(result.scalars().all())
all_paths_set = set(path_list)
losers_by_path = all_paths_set - winners_by_path
lost_assets = [path_to_asset[p] for p in losers_by_path]
if lost_assets: # losers get their Asset removed
for id_chunk in _iter_chunks(lost_assets, MAX_BIND_PARAMS):
await session.execute(sa.delete(Asset).where(Asset.id.in_(id_chunk)))
session.execute(sqlalchemy.delete(Asset).where(Asset.id.in_(id_chunk)))
if not winners_by_path:
return {"inserted_infos": 0, "won_states": 0, "lost_states": len(losers_by_path)}
# insert AssetInfo only for winners
# Insert with ON CONFLICT DO NOTHING, then query to find which were actually inserted
winner_info_rows = [asset_to_info[path_to_asset[p]] for p in winners_by_path]
if dialect == "sqlite":
ins_info = (
d_sqlite.insert(AssetInfo)
.on_conflict_do_nothing(index_elements=[AssetInfo.asset_id, AssetInfo.owner_id, AssetInfo.name])
.returning(AssetInfo.id)
)
else:
ins_info = (
d_pg.insert(AssetInfo)
.on_conflict_do_nothing(index_elements=[AssetInfo.asset_id, AssetInfo.owner_id, AssetInfo.name])
.returning(AssetInfo.id)
)
inserted_info_ids: set[str] = set()
ins_info = (
sqlite.insert(AssetInfo)
.on_conflict_do_nothing(index_elements=[AssetInfo.asset_id, AssetInfo.owner_id, AssetInfo.name])
)
for chunk in _iter_chunks(winner_info_rows, _rows_per_stmt(9)):
inserted_info_ids.update((await session.execute(ins_info, chunk)).scalars().all())
session.execute(ins_info, chunk)
# Query to find which info rows were actually inserted (by matching our generated IDs)
all_info_ids = [row["id"] for row in winner_info_rows]
inserted_info_ids: set[str] = set()
for chunk in _iter_chunks(all_info_ids, MAX_BIND_PARAMS):
result = session.execute(
sqlalchemy.select(AssetInfo.id).where(AssetInfo.id.in_(chunk))
)
inserted_info_ids.update(result.scalars().all())
# build and insert tag + meta rows for the AssetInfo
tag_rows: list[dict] = []
@ -157,7 +167,7 @@ async def seed_from_paths_batch(
}
)
await bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=meta_rows, max_bind_params=MAX_BIND_PARAMS)
bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=meta_rows, max_bind_params=MAX_BIND_PARAMS)
return {
"inserted_infos": len(inserted_info_ids),
"won_states": len(winners_by_path),
@ -165,8 +175,8 @@ async def seed_from_paths_batch(
}
async def bulk_insert_tags_and_meta(
session: AsyncSession,
def bulk_insert_tags_and_meta(
session: Session,
*,
tag_rows: list[dict],
meta_rows: list[dict],
@ -176,55 +186,19 @@ async def bulk_insert_tags_and_meta(
- tag_rows keys: asset_info_id, tag_name, origin, added_at
- meta_rows keys: asset_info_id, key, ordinal, val_str, val_num, val_bool, val_json
"""
dialect = session.bind.dialect.name
if tag_rows:
if dialect == "sqlite":
ins_links = (
d_sqlite.insert(AssetInfoTag)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
elif dialect == "postgresql":
ins_links = (
d_pg.insert(AssetInfoTag)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
ins_links = (
sqlite.insert(AssetInfoTag)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
for chunk in _chunk_rows(tag_rows, cols_per_row=4, max_bind_params=max_bind_params):
await session.execute(ins_links, chunk)
session.execute(ins_links, chunk)
if meta_rows:
if dialect == "sqlite":
ins_meta = (
d_sqlite.insert(AssetInfoMeta)
.on_conflict_do_nothing(
index_elements=[AssetInfoMeta.asset_info_id, AssetInfoMeta.key, AssetInfoMeta.ordinal]
)
ins_meta = (
sqlite.insert(AssetInfoMeta)
.on_conflict_do_nothing(
index_elements=[AssetInfoMeta.asset_info_id, AssetInfoMeta.key, AssetInfoMeta.ordinal]
)
elif dialect == "postgresql":
ins_meta = (
d_pg.insert(AssetInfoMeta)
.on_conflict_do_nothing(
index_elements=[AssetInfoMeta.asset_info_id, AssetInfoMeta.key, AssetInfoMeta.ordinal]
)
)
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
)
for chunk in _chunk_rows(meta_rows, cols_per_row=7, max_bind_params=max_bind_params):
await session.execute(ins_meta, chunk)
def _chunk_rows(rows: list[dict], cols_per_row: int, max_bind_params: int) -> Iterable[list[dict]]:
if not rows:
return []
rows_per_stmt = max(1, max_bind_params // max(1, cols_per_row))
for i in range(0, len(rows), rows_per_stmt):
yield rows[i:i + rows_per_stmt]
def _iter_chunks(seq, n: int):
for i in range(0, len(seq), n):
yield seq[i:i + n]
def _rows_per_stmt(cols: int) -> int:
return max(1, MAX_BIND_PARAMS // max(1, cols))
session.execute(ins_meta, chunk)
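As a worked example of the bind-parameter budgeting above (a sketch; `MAX_BIND_PARAMS = 800` and the 5-column Asset row come from the code, the sample rows are made up):

```python
# Sketch: how many rows fit into one INSERT given an 800-bind-parameter budget.
MAX_BIND_PARAMS = 800

def rows_per_stmt(cols: int) -> int:
    # mirrors _rows_per_stmt above
    return max(1, MAX_BIND_PARAMS // max(1, cols))

def iter_chunks(seq, n: int):
    # mirrors _iter_chunks above
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

# 1000 seed Asset rows at 5 columns each -> at most 160 rows per statement -> 7 statements.
asset_rows = [{"id": i, "hash": None, "size_bytes": 0, "mime_type": None, "created_at": None}
              for i in range(1000)]
chunks = list(iter_chunks(asset_rows, rows_per_stmt(5)))
print(len(chunks), len(chunks[0]))  # 7 160
```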

View File

@ -1,25 +0,0 @@
from .bulk_ops import seed_from_paths_batch
from .escape_like import escape_like_prefix
from .fast_check import fast_asset_file_check
from .filters import apply_metadata_filter, apply_tag_filters
from .ownership import visible_owner_clause
from .projection import is_scalar, project_kv
from .tags import (
add_missing_tag_for_asset_id,
ensure_tags_exist,
remove_missing_tag_for_asset_id,
)
__all__ = [
"apply_tag_filters",
"apply_metadata_filter",
"escape_like_prefix",
"fast_asset_file_check",
"is_scalar",
"project_kv",
"ensure_tags_exist",
"add_missing_tag_for_asset_id",
"remove_missing_tag_for_asset_id",
"seed_from_paths_batch",
"visible_owner_clause",
]

View File

@ -1,7 +0,0 @@
def escape_like_prefix(s: str, escape: str = "!") -> tuple[str, str]:
"""Escapes %, _ and the escape char itself in a LIKE prefix.
Returns (escaped_prefix, escape_char). Caller should append '%' and pass escape=escape_char to .like().
"""
s = s.replace(escape, escape + escape) # escape the escape char first
s = s.replace("%", escape + "%").replace("_", escape + "_") # escape LIKE wildcards
return s, escape
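A minimal usage sketch for this helper, assuming it stays importable from `app.assets.helpers` as the later imports in this diff suggest; the column and prefix are illustrative:

```python
import sqlalchemy as sa
from app.assets.database.models import AssetInfo   # model import path used elsewhere in this diff
from app.assets.helpers import escape_like_prefix  # assumed post-move location of the helper

def filter_name_prefix(stmt: sa.sql.Select, raw_prefix: str) -> sa.sql.Select:
    # "50%_off" -> ("50!%!_off", "!"), so % and _ in the user input match literally
    escaped, esc = escape_like_prefix(raw_prefix)
    return stmt.where(AssetInfo.name.like(escaped + "%", escape=esc))

# Example: select AssetInfo rows whose name starts with the literal prefix "wan2.1_"
stmt = filter_name_prefix(sa.select(AssetInfo), "wan2.1_")
```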

View File

@ -1,19 +0,0 @@
import os
from typing import Optional
def fast_asset_file_check(
*,
mtime_db: Optional[int],
size_db: Optional[int],
stat_result: os.stat_result,
) -> bool:
if mtime_db is None:
return False
actual_mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))
if int(mtime_db) != int(actual_mtime_ns):
return False
sz = int(size_db or 0)
if sz > 0:
return int(stat_result.st_size) == sz
return True
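A small self-contained sketch of how this check would be called; the function body is copied from above so the snippet runs on its own, and the stat'd path is just a stand-in for a cached asset file:

```python
import os
from typing import Optional

def fast_asset_file_check(*, mtime_db: Optional[int], size_db: Optional[int],
                          stat_result: os.stat_result) -> bool:
    # copied from the helper above: stored mtime_ns must match, and size too when recorded
    if mtime_db is None:
        return False
    actual_mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))
    if int(mtime_db) != int(actual_mtime_ns):
        return False
    sz = int(size_db or 0)
    if sz > 0:
        return int(stat_result.st_size) == sz
    return True

st = os.stat(__file__)  # stand-in for stat'ing a cached asset path
print(fast_asset_file_check(mtime_db=st.st_mtime_ns, size_db=st.st_size, stat_result=st))      # True
print(fast_asset_file_check(mtime_db=st.st_mtime_ns - 1, size_db=st.st_size, stat_result=st))  # False: mtime drifted
```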

View File

@ -1,87 +0,0 @@
from typing import Optional, Sequence
import sqlalchemy as sa
from sqlalchemy import exists
from ..._helpers import normalize_tags
from ..models import AssetInfo, AssetInfoMeta, AssetInfoTag
def apply_tag_filters(
stmt: sa.sql.Select,
include_tags: Optional[Sequence[str]],
exclude_tags: Optional[Sequence[str]],
) -> sa.sql.Select:
"""include_tags: every tag must be present; exclude_tags: none may be present."""
include_tags = normalize_tags(include_tags)
exclude_tags = normalize_tags(exclude_tags)
if include_tags:
for tag_name in include_tags:
stmt = stmt.where(
exists().where(
(AssetInfoTag.asset_info_id == AssetInfo.id)
& (AssetInfoTag.tag_name == tag_name)
)
)
if exclude_tags:
stmt = stmt.where(
~exists().where(
(AssetInfoTag.asset_info_id == AssetInfo.id)
& (AssetInfoTag.tag_name.in_(exclude_tags))
)
)
return stmt
def apply_metadata_filter(
stmt: sa.sql.Select,
metadata_filter: Optional[dict],
) -> sa.sql.Select:
"""Apply filters using asset_info_meta projection table."""
if not metadata_filter:
return stmt
def _exists_for_pred(key: str, *preds) -> sa.sql.ClauseElement:
return sa.exists().where(
AssetInfoMeta.asset_info_id == AssetInfo.id,
AssetInfoMeta.key == key,
*preds,
)
def _exists_clause_for_value(key: str, value) -> sa.sql.ClauseElement:
if value is None:
no_row_for_key = sa.not_(
sa.exists().where(
AssetInfoMeta.asset_info_id == AssetInfo.id,
AssetInfoMeta.key == key,
)
)
null_row = _exists_for_pred(
key,
AssetInfoMeta.val_json.is_(None),
AssetInfoMeta.val_str.is_(None),
AssetInfoMeta.val_num.is_(None),
AssetInfoMeta.val_bool.is_(None),
)
return sa.or_(no_row_for_key, null_row)
if isinstance(value, bool):
return _exists_for_pred(key, AssetInfoMeta.val_bool == bool(value))
if isinstance(value, (int, float)):
from decimal import Decimal
num = value if isinstance(value, Decimal) else Decimal(str(value))
return _exists_for_pred(key, AssetInfoMeta.val_num == num)
if isinstance(value, str):
return _exists_for_pred(key, AssetInfoMeta.val_str == value)
return _exists_for_pred(key, AssetInfoMeta.val_json == value)
for k, v in metadata_filter.items():
if isinstance(v, list):
ors = [_exists_clause_for_value(k, elem) for elem in v]
if ors:
stmt = stmt.where(sa.or_(*ors))
else:
stmt = stmt.where(_exists_clause_for_value(k, v))
return stmt

View File

@ -1,12 +0,0 @@
import sqlalchemy as sa
from ..models import AssetInfo
def visible_owner_clause(owner_id: str) -> sa.sql.ClauseElement:
"""Build owner visibility predicate for reads. Owner-less rows are visible to everyone."""
owner_id = (owner_id or "").strip()
if owner_id == "":
return AssetInfo.owner_id == ""
return AssetInfo.owner_id.in_(["", owner_id])

View File

@ -1,64 +0,0 @@
from decimal import Decimal
def is_scalar(v):
if v is None:
return True
if isinstance(v, bool):
return True
if isinstance(v, (int, float, Decimal, str)):
return True
return False
def project_kv(key: str, value):
"""
Turn a metadata key/value into typed projection rows.
Returns list[dict] with keys:
key, ordinal, and one of val_str / val_num / val_bool / val_json (others None)
"""
rows: list[dict] = []
def _null_row(ordinal: int) -> dict:
return {
"key": key, "ordinal": ordinal,
"val_str": None, "val_num": None, "val_bool": None, "val_json": None
}
if value is None:
rows.append(_null_row(0))
return rows
if is_scalar(value):
if isinstance(value, bool):
rows.append({"key": key, "ordinal": 0, "val_bool": bool(value)})
elif isinstance(value, (int, float, Decimal)):
num = value if isinstance(value, Decimal) else Decimal(str(value))
rows.append({"key": key, "ordinal": 0, "val_num": num})
elif isinstance(value, str):
rows.append({"key": key, "ordinal": 0, "val_str": value})
else:
rows.append({"key": key, "ordinal": 0, "val_json": value})
return rows
if isinstance(value, list):
if all(is_scalar(x) for x in value):
for i, x in enumerate(value):
if x is None:
rows.append(_null_row(i))
elif isinstance(x, bool):
rows.append({"key": key, "ordinal": i, "val_bool": bool(x)})
elif isinstance(x, (int, float, Decimal)):
num = x if isinstance(x, Decimal) else Decimal(str(x))
rows.append({"key": key, "ordinal": i, "val_num": num})
elif isinstance(x, str):
rows.append({"key": key, "ordinal": i, "val_str": x})
else:
rows.append({"key": key, "ordinal": i, "val_json": x})
return rows
for i, x in enumerate(value):
rows.append({"key": key, "ordinal": i, "val_json": x})
return rows
rows.append({"key": key, "ordinal": 0, "val_json": value})
return rows
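To make the projection shape concrete, a sketch (not from the source) of the rows this helper yields for two hypothetical metadata values, following the branches above:

```python
from decimal import Decimal

# Expected projection rows (illustrative key names), following the branches above:
expected_scalar = [{"key": "steps", "ordinal": 0, "val_num": Decimal("30")}]  # project_kv("steps", 30)
expected_list = [  # project_kv("labels", ["large", 512, None])
    {"key": "labels", "ordinal": 0, "val_str": "large"},
    {"key": "labels", "ordinal": 1, "val_num": Decimal("512")},
    {"key": "labels", "ordinal": 2, "val_str": None, "val_num": None, "val_bool": None, "val_json": None},
]
# Any dict value, or a list containing non-scalars, falls back to val_json rows instead.
```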

View File

@ -1,90 +0,0 @@
from typing import Iterable
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql as d_pg
from sqlalchemy.dialects import sqlite as d_sqlite
from sqlalchemy.ext.asyncio import AsyncSession
from ..._helpers import normalize_tags
from ..models import AssetInfo, AssetInfoTag, Tag
from ..timeutil import utcnow
async def ensure_tags_exist(session: AsyncSession, names: Iterable[str], tag_type: str = "user") -> None:
wanted = normalize_tags(list(names))
if not wanted:
return
rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
dialect = session.bind.dialect.name
if dialect == "sqlite":
ins = (
d_sqlite.insert(Tag)
.values(rows)
.on_conflict_do_nothing(index_elements=[Tag.name])
)
elif dialect == "postgresql":
ins = (
d_pg.insert(Tag)
.values(rows)
.on_conflict_do_nothing(index_elements=[Tag.name])
)
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
await session.execute(ins)
async def add_missing_tag_for_asset_id(
session: AsyncSession,
*,
asset_id: str,
origin: str = "automatic",
) -> None:
select_rows = (
sa.select(
AssetInfo.id.label("asset_info_id"),
sa.literal("missing").label("tag_name"),
sa.literal(origin).label("origin"),
sa.literal(utcnow()).label("added_at"),
)
.where(AssetInfo.asset_id == asset_id)
.where(
sa.not_(
sa.exists().where((AssetInfoTag.asset_info_id == AssetInfo.id) & (AssetInfoTag.tag_name == "missing"))
)
)
)
dialect = session.bind.dialect.name
if dialect == "sqlite":
ins = (
d_sqlite.insert(AssetInfoTag)
.from_select(
["asset_info_id", "tag_name", "origin", "added_at"],
select_rows,
)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
elif dialect == "postgresql":
ins = (
d_pg.insert(AssetInfoTag)
.from_select(
["asset_info_id", "tag_name", "origin", "added_at"],
select_rows,
)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
await session.execute(ins)
async def remove_missing_tag_for_asset_id(
session: AsyncSession,
*,
asset_id: str,
) -> None:
await session.execute(
sa.delete(AssetInfoTag).where(
AssetInfoTag.asset_info_id.in_(sa.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)),
AssetInfoTag.tag_name == "missing",
)
)

View File

@ -1,7 +1,9 @@
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any, Optional
from typing import Any
from sqlalchemy import (
JSON,
BigInteger,
@ -16,44 +18,24 @@ from sqlalchemy import (
Text,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import DeclarativeBase, Mapped, foreign, mapped_column, relationship
from sqlalchemy.orm import Mapped, foreign, mapped_column, relationship
from .timeutil import utcnow
JSONB_V = JSON(none_as_null=True).with_variant(JSONB(none_as_null=True), 'postgresql')
class Base(DeclarativeBase):
pass
def to_dict(obj: Any, include_none: bool = False) -> dict[str, Any]:
fields = obj.__table__.columns.keys()
out: dict[str, Any] = {}
for field in fields:
val = getattr(obj, field)
if val is None and not include_none:
continue
if isinstance(val, datetime):
out[field] = val.isoformat()
else:
out[field] = val
return out
from app.assets.helpers import utcnow
from app.database.models import to_dict, Base
class Asset(Base):
__tablename__ = "assets"
id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
hash: Mapped[Optional[str]] = mapped_column(String(256), nullable=True)
hash: Mapped[str | None] = mapped_column(String(256), nullable=True)
size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
mime_type: Mapped[Optional[str]] = mapped_column(String(255))
mime_type: Mapped[str | None] = mapped_column(String(255))
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=False), nullable=False, default=utcnow
)
infos: Mapped[list["AssetInfo"]] = relationship(
infos: Mapped[list[AssetInfo]] = relationship(
"AssetInfo",
back_populates="asset",
primaryjoin=lambda: Asset.id == foreign(AssetInfo.asset_id),
@ -62,7 +44,7 @@ class Asset(Base):
passive_deletes=True,
)
preview_of: Mapped[list["AssetInfo"]] = relationship(
preview_of: Mapped[list[AssetInfo]] = relationship(
"AssetInfo",
back_populates="preview_asset",
primaryjoin=lambda: Asset.id == foreign(AssetInfo.preview_id),
@ -70,7 +52,7 @@ class Asset(Base):
viewonly=True,
)
cache_states: Mapped[list["AssetCacheState"]] = relationship(
cache_states: Mapped[list[AssetCacheState]] = relationship(
back_populates="asset",
cascade="all, delete-orphan",
passive_deletes=True,
@ -95,10 +77,10 @@ class AssetCacheState(Base):
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
asset_id: Mapped[str] = mapped_column(String(36), ForeignKey("assets.id", ondelete="CASCADE"), nullable=False)
file_path: Mapped[str] = mapped_column(Text, nullable=False)
mtime_ns: Mapped[Optional[int]] = mapped_column(BigInteger, nullable=True)
mtime_ns: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
needs_verify: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
asset: Mapped["Asset"] = relationship(back_populates="cache_states")
asset: Mapped[Asset] = relationship(back_populates="cache_states")
__table_args__ = (
Index("ix_asset_cache_state_file_path", "file_path"),
@ -121,8 +103,8 @@ class AssetInfo(Base):
owner_id: Mapped[str] = mapped_column(String(128), nullable=False, default="")
name: Mapped[str] = mapped_column(String(512), nullable=False)
asset_id: Mapped[str] = mapped_column(String(36), ForeignKey("assets.id", ondelete="RESTRICT"), nullable=False)
preview_id: Mapped[Optional[str]] = mapped_column(String(36), ForeignKey("assets.id", ondelete="SET NULL"))
user_metadata: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON(none_as_null=True))
preview_id: Mapped[str | None] = mapped_column(String(36), ForeignKey("assets.id", ondelete="SET NULL"))
user_metadata: Mapped[dict[str, Any] | None] = mapped_column(JSON(none_as_null=True))
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
last_access_time: Mapped[datetime] = mapped_column(DateTime(timezone=False), nullable=False, default=utcnow)
@ -133,26 +115,26 @@ class AssetInfo(Base):
foreign_keys=[asset_id],
lazy="selectin",
)
preview_asset: Mapped[Optional[Asset]] = relationship(
preview_asset: Mapped[Asset | None] = relationship(
"Asset",
back_populates="preview_of",
foreign_keys=[preview_id],
)
metadata_entries: Mapped[list["AssetInfoMeta"]] = relationship(
metadata_entries: Mapped[list[AssetInfoMeta]] = relationship(
back_populates="asset_info",
cascade="all,delete-orphan",
passive_deletes=True,
)
tag_links: Mapped[list["AssetInfoTag"]] = relationship(
tag_links: Mapped[list[AssetInfoTag]] = relationship(
back_populates="asset_info",
cascade="all,delete-orphan",
passive_deletes=True,
overlaps="tags,asset_infos",
)
tags: Mapped[list["Tag"]] = relationship(
tags: Mapped[list[Tag]] = relationship(
secondary="asset_info_tags",
back_populates="asset_infos",
lazy="selectin",
@ -188,12 +170,12 @@ class AssetInfoMeta(Base):
key: Mapped[str] = mapped_column(String(256), primary_key=True)
ordinal: Mapped[int] = mapped_column(Integer, primary_key=True, default=0)
val_str: Mapped[Optional[str]] = mapped_column(String(2048), nullable=True)
val_num: Mapped[Optional[float]] = mapped_column(Numeric(38, 10), nullable=True)
val_bool: Mapped[Optional[bool]] = mapped_column(Boolean, nullable=True)
val_json: Mapped[Optional[Any]] = mapped_column(JSONB_V, nullable=True)
val_str: Mapped[str | None] = mapped_column(String(2048), nullable=True)
val_num: Mapped[float | None] = mapped_column(Numeric(38, 10), nullable=True)
val_bool: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
val_json: Mapped[Any | None] = mapped_column(JSON(none_as_null=True), nullable=True)
asset_info: Mapped["AssetInfo"] = relationship(back_populates="metadata_entries")
asset_info: Mapped[AssetInfo] = relationship(back_populates="metadata_entries")
__table_args__ = (
Index("ix_asset_info_meta_key", "key"),
@ -217,8 +199,8 @@ class AssetInfoTag(Base):
DateTime(timezone=False), nullable=False, default=utcnow
)
asset_info: Mapped["AssetInfo"] = relationship(back_populates="tag_links")
tag: Mapped["Tag"] = relationship(back_populates="asset_info_links")
asset_info: Mapped[AssetInfo] = relationship(back_populates="tag_links")
tag: Mapped[Tag] = relationship(back_populates="asset_info_links")
__table_args__ = (
Index("ix_asset_info_tags_tag_name", "tag_name"),
@ -232,11 +214,11 @@ class Tag(Base):
name: Mapped[str] = mapped_column(String(512), primary_key=True)
tag_type: Mapped[str] = mapped_column(String(32), nullable=False, default="user")
asset_info_links: Mapped[list["AssetInfoTag"]] = relationship(
asset_info_links: Mapped[list[AssetInfoTag]] = relationship(
back_populates="tag",
overlaps="asset_infos,tags",
)
asset_infos: Mapped[list["AssetInfo"]] = relationship(
asset_infos: Mapped[list[AssetInfo]] = relationship(
secondary="asset_info_tags",
back_populates="tags",
viewonly=True,

View File

@ -0,0 +1,975 @@
import os
import logging
import sqlalchemy as sa
from collections import defaultdict
from datetime import datetime
from typing import Iterable, Any
from sqlalchemy import select, delete, exists, func
from sqlalchemy.dialects import sqlite
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, contains_eager, noload
from app.assets.database.models import Asset, AssetInfo, AssetCacheState, AssetInfoMeta, AssetInfoTag, Tag
from app.assets.helpers import (
compute_relative_filename, escape_like_prefix, normalize_tags, project_kv, utcnow
)
from typing import Sequence
def visible_owner_clause(owner_id: str) -> sa.sql.ClauseElement:
"""Build owner visibility predicate for reads. Owner-less rows are visible to everyone."""
owner_id = (owner_id or "").strip()
if owner_id == "":
return AssetInfo.owner_id == ""
return AssetInfo.owner_id.in_(["", owner_id])
def pick_best_live_path(states: Sequence[AssetCacheState]) -> str:
"""
Return the best on-disk path among cache states:
1) Prefer a path that exists with needs_verify == False (already verified).
2) Otherwise, pick the first path that exists.
3) Otherwise return empty string.
"""
alive = [s for s in states if getattr(s, "file_path", None) and os.path.isfile(s.file_path)]
if not alive:
return ""
for s in alive:
if not getattr(s, "needs_verify", False):
return s.file_path
return alive[0].file_path
def apply_tag_filters(
stmt: sa.sql.Select,
include_tags: Sequence[str] | None = None,
exclude_tags: Sequence[str] | None = None,
) -> sa.sql.Select:
"""include_tags: every tag must be present; exclude_tags: none may be present."""
include_tags = normalize_tags(include_tags)
exclude_tags = normalize_tags(exclude_tags)
if include_tags:
for tag_name in include_tags:
stmt = stmt.where(
exists().where(
(AssetInfoTag.asset_info_id == AssetInfo.id)
& (AssetInfoTag.tag_name == tag_name)
)
)
if exclude_tags:
stmt = stmt.where(
~exists().where(
(AssetInfoTag.asset_info_id == AssetInfo.id)
& (AssetInfoTag.tag_name.in_(exclude_tags))
)
)
return stmt
def apply_metadata_filter(
stmt: sa.sql.Select,
metadata_filter: dict | None = None,
) -> sa.sql.Select:
"""Apply filters using asset_info_meta projection table."""
if not metadata_filter:
return stmt
def _exists_for_pred(key: str, *preds) -> sa.sql.ClauseElement:
return sa.exists().where(
AssetInfoMeta.asset_info_id == AssetInfo.id,
AssetInfoMeta.key == key,
*preds,
)
def _exists_clause_for_value(key: str, value) -> sa.sql.ClauseElement:
if value is None:
no_row_for_key = sa.not_(
sa.exists().where(
AssetInfoMeta.asset_info_id == AssetInfo.id,
AssetInfoMeta.key == key,
)
)
null_row = _exists_for_pred(
key,
AssetInfoMeta.val_json.is_(None),
AssetInfoMeta.val_str.is_(None),
AssetInfoMeta.val_num.is_(None),
AssetInfoMeta.val_bool.is_(None),
)
return sa.or_(no_row_for_key, null_row)
if isinstance(value, bool):
return _exists_for_pred(key, AssetInfoMeta.val_bool == bool(value))
if isinstance(value, (int, float)):
from decimal import Decimal
num = value if isinstance(value, Decimal) else Decimal(str(value))
return _exists_for_pred(key, AssetInfoMeta.val_num == num)
if isinstance(value, str):
return _exists_for_pred(key, AssetInfoMeta.val_str == value)
return _exists_for_pred(key, AssetInfoMeta.val_json == value)
for k, v in metadata_filter.items():
if isinstance(v, list):
ors = [_exists_clause_for_value(k, elem) for elem in v]
if ors:
stmt = stmt.where(sa.or_(*ors))
else:
stmt = stmt.where(_exists_clause_for_value(k, v))
return stmt
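# Illustrative only, not part of the module: metadata_filter semantics implied by
# _exists_clause_for_value above, shown with a made-up filter (key names are hypothetical):
#   {"base_model": "sdxl"}             -> a val_str row equal to "sdxl" must exist for that key
#   {"steps": 30}                      -> a val_num row equal to Decimal("30") must exist
#   {"nsfw": False}                    -> a val_bool row equal to False must exist
#   {"trigger_words": ["cat", "dog"]}  -> OR across the listed values
#   {"source_url": None}               -> the key is absent, or present only as an all-NULL row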
def asset_exists_by_hash(
session: Session,
*,
asset_hash: str,
) -> bool:
"""
Check if an asset with a given hash exists in database.
"""
row = (
session.execute(
select(sa.literal(True)).select_from(Asset).where(Asset.hash == asset_hash).limit(1)
)
).first()
return row is not None
def asset_info_exists_for_asset_id(
session: Session,
*,
asset_id: str,
) -> bool:
q = (
select(sa.literal(True))
.select_from(AssetInfo)
.where(AssetInfo.asset_id == asset_id)
.limit(1)
)
return (session.execute(q)).first() is not None
def get_asset_by_hash(
session: Session,
*,
asset_hash: str,
) -> Asset | None:
return (
session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1))
).scalars().first()
def get_asset_info_by_id(
session: Session,
*,
asset_info_id: str,
) -> AssetInfo | None:
return session.get(AssetInfo, asset_info_id)
def list_asset_infos_page(
session: Session,
owner_id: str = "",
include_tags: Sequence[str] | None = None,
exclude_tags: Sequence[str] | None = None,
name_contains: str | None = None,
metadata_filter: dict | None = None,
limit: int = 20,
offset: int = 0,
sort: str = "created_at",
order: str = "desc",
) -> tuple[list[AssetInfo], dict[str, list[str]], int]:
base = (
select(AssetInfo)
.join(Asset, Asset.id == AssetInfo.asset_id)
.options(contains_eager(AssetInfo.asset), noload(AssetInfo.tags))
.where(visible_owner_clause(owner_id))
)
if name_contains:
escaped, esc = escape_like_prefix(name_contains)
base = base.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
base = apply_tag_filters(base, include_tags, exclude_tags)
base = apply_metadata_filter(base, metadata_filter)
sort = (sort or "created_at").lower()
order = (order or "desc").lower()
sort_map = {
"name": AssetInfo.name,
"created_at": AssetInfo.created_at,
"updated_at": AssetInfo.updated_at,
"last_access_time": AssetInfo.last_access_time,
"size": Asset.size_bytes,
}
sort_col = sort_map.get(sort, AssetInfo.created_at)
sort_exp = sort_col.desc() if order == "desc" else sort_col.asc()
base = base.order_by(sort_exp).limit(limit).offset(offset)
count_stmt = (
select(sa.func.count())
.select_from(AssetInfo)
.join(Asset, Asset.id == AssetInfo.asset_id)
.where(visible_owner_clause(owner_id))
)
if name_contains:
escaped, esc = escape_like_prefix(name_contains)
count_stmt = count_stmt.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
count_stmt = apply_tag_filters(count_stmt, include_tags, exclude_tags)
count_stmt = apply_metadata_filter(count_stmt, metadata_filter)
total = int((session.execute(count_stmt)).scalar_one() or 0)
infos = (session.execute(base)).unique().scalars().all()
id_list: list[str] = [i.id for i in infos]
tag_map: dict[str, list[str]] = defaultdict(list)
if id_list:
rows = session.execute(
select(AssetInfoTag.asset_info_id, Tag.name)
.join(Tag, Tag.name == AssetInfoTag.tag_name)
.where(AssetInfoTag.asset_info_id.in_(id_list))
)
for aid, tag_name in rows.all():
tag_map[aid].append(tag_name)
return infos, tag_map, total
def fetch_asset_info_asset_and_tags(
session: Session,
asset_info_id: str,
owner_id: str = "",
) -> tuple[AssetInfo, Asset, list[str]] | None:
stmt = (
select(AssetInfo, Asset, Tag.name)
.join(Asset, Asset.id == AssetInfo.asset_id)
.join(AssetInfoTag, AssetInfoTag.asset_info_id == AssetInfo.id, isouter=True)
.join(Tag, Tag.name == AssetInfoTag.tag_name, isouter=True)
.where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
.options(noload(AssetInfo.tags))
.order_by(Tag.name.asc())
)
rows = (session.execute(stmt)).all()
if not rows:
return None
first_info, first_asset, _ = rows[0]
tags: list[str] = []
seen: set[str] = set()
for _info, _asset, tag_name in rows:
if tag_name and tag_name not in seen:
seen.add(tag_name)
tags.append(tag_name)
return first_info, first_asset, tags
def fetch_asset_info_and_asset(
session: Session,
*,
asset_info_id: str,
owner_id: str = "",
) -> tuple[AssetInfo, Asset] | None:
stmt = (
select(AssetInfo, Asset)
.join(Asset, Asset.id == AssetInfo.asset_id)
.where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
.limit(1)
.options(noload(AssetInfo.tags))
)
row = session.execute(stmt)
pair = row.first()
if not pair:
return None
return pair[0], pair[1]
def list_cache_states_by_asset_id(
session: Session, *, asset_id: str
) -> Sequence[AssetCacheState]:
return (
session.execute(
select(AssetCacheState)
.where(AssetCacheState.asset_id == asset_id)
.order_by(AssetCacheState.id.asc())
)
).scalars().all()
def touch_asset_info_by_id(
session: Session,
*,
asset_info_id: str,
ts: datetime | None = None,
only_if_newer: bool = True,
) -> None:
ts = ts or utcnow()
stmt = sa.update(AssetInfo).where(AssetInfo.id == asset_info_id)
if only_if_newer:
stmt = stmt.where(
sa.or_(AssetInfo.last_access_time.is_(None), AssetInfo.last_access_time < ts)
)
session.execute(stmt.values(last_access_time=ts))
def create_asset_info_for_existing_asset(
session: Session,
*,
asset_hash: str,
name: str,
user_metadata: dict | None = None,
tags: Sequence[str] | None = None,
tag_origin: str = "manual",
owner_id: str = "",
) -> AssetInfo:
"""Create or return an existing AssetInfo for an Asset identified by asset_hash."""
now = utcnow()
asset = get_asset_by_hash(session, asset_hash=asset_hash)
if not asset:
raise ValueError(f"Unknown asset hash {asset_hash}")
info = AssetInfo(
owner_id=owner_id,
name=name,
asset_id=asset.id,
preview_id=None,
created_at=now,
updated_at=now,
last_access_time=now,
)
try:
with session.begin_nested():
session.add(info)
session.flush()
except IntegrityError:
existing = (
session.execute(
select(AssetInfo)
.options(noload(AssetInfo.tags))
.where(
AssetInfo.asset_id == asset.id,
AssetInfo.name == name,
AssetInfo.owner_id == owner_id,
)
.limit(1)
)
).unique().scalars().first()
if not existing:
raise RuntimeError("AssetInfo upsert failed to find existing row after conflict.")
return existing
# metadata["filename"] hack
new_meta = dict(user_metadata or {})
computed_filename = None
try:
p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id))
if p:
computed_filename = compute_relative_filename(p)
except Exception:
computed_filename = None
if computed_filename:
new_meta["filename"] = computed_filename
if new_meta:
replace_asset_info_metadata_projection(
session,
asset_info_id=info.id,
user_metadata=new_meta,
)
if tags is not None:
set_asset_info_tags(
session,
asset_info_id=info.id,
tags=tags,
origin=tag_origin,
)
return info
def set_asset_info_tags(
session: Session,
*,
asset_info_id: str,
tags: Sequence[str],
origin: str = "manual",
) -> dict:
desired = normalize_tags(tags)
current = set(
tag_name for (tag_name,) in (
session.execute(select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id))
).all()
)
to_add = [t for t in desired if t not in current]
to_remove = [t for t in current if t not in desired]
if to_add:
ensure_tags_exist(session, to_add, tag_type="user")
session.add_all([
AssetInfoTag(asset_info_id=asset_info_id, tag_name=t, origin=origin, added_at=utcnow())
for t in to_add
])
session.flush()
if to_remove:
session.execute(
delete(AssetInfoTag)
.where(AssetInfoTag.asset_info_id == asset_info_id, AssetInfoTag.tag_name.in_(to_remove))
)
session.flush()
return {"added": to_add, "removed": to_remove, "total": desired}
def replace_asset_info_metadata_projection(
session: Session,
*,
asset_info_id: str,
user_metadata: dict | None = None,
) -> None:
info = session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
info.user_metadata = user_metadata or {}
info.updated_at = utcnow()
session.flush()
session.execute(delete(AssetInfoMeta).where(AssetInfoMeta.asset_info_id == asset_info_id))
session.flush()
if not user_metadata:
return
rows: list[AssetInfoMeta] = []
for k, v in user_metadata.items():
for r in project_kv(k, v):
rows.append(
AssetInfoMeta(
asset_info_id=asset_info_id,
key=r["key"],
ordinal=int(r["ordinal"]),
val_str=r.get("val_str"),
val_num=r.get("val_num"),
val_bool=r.get("val_bool"),
val_json=r.get("val_json"),
)
)
if rows:
session.add_all(rows)
session.flush()
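# Usage sketch (illustrative): replacing metadata overwrites AssetInfo.user_metadata and
# rebuilds the AssetInfoMeta projection rows emitted by project_kv (key/ordinal plus one
# typed value column, judging from the fields used above). The metadata values are hypothetical.
def _example_replace_metadata(SessionLocal, asset_info_id: str) -> None:
    with SessionLocal() as session:
        replace_asset_info_metadata_projection(
            session,
            asset_info_id=asset_info_id,
            user_metadata={"filename": "checkpoints/model.safetensors", "epochs": 20},
        )
        session.commit()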
def ingest_fs_asset(
session: Session,
*,
asset_hash: str,
abs_path: str,
size_bytes: int,
mtime_ns: int,
mime_type: str | None = None,
info_name: str | None = None,
owner_id: str = "",
preview_id: str | None = None,
user_metadata: dict | None = None,
tags: Sequence[str] = (),
tag_origin: str = "manual",
require_existing_tags: bool = False,
) -> dict:
"""
Idempotently upsert:
- Asset by content hash (create if missing)
- AssetCacheState(file_path) pointing to asset_id
- Optionally AssetInfo + tag links and metadata projection
Returns flags and ids.
"""
locator = os.path.abspath(abs_path)
now = utcnow()
if preview_id:
if not session.get(Asset, preview_id):
preview_id = None
out: dict[str, Any] = {
"asset_created": False,
"asset_updated": False,
"state_created": False,
"state_updated": False,
"asset_info_id": None,
}
# 1) Asset by hash
asset = (
session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1))
).scalars().first()
if not asset:
vals = {
"hash": asset_hash,
"size_bytes": int(size_bytes),
"mime_type": mime_type,
"created_at": now,
}
res = session.execute(
sqlite.insert(Asset)
.values(**vals)
.on_conflict_do_nothing(index_elements=[Asset.hash])
)
if int(res.rowcount or 0) > 0:
out["asset_created"] = True
asset = (
session.execute(
select(Asset).where(Asset.hash == asset_hash).limit(1)
)
).scalars().first()
if not asset:
raise RuntimeError("Asset row not found after upsert.")
else:
changed = False
if asset.size_bytes != int(size_bytes) and int(size_bytes) > 0:
asset.size_bytes = int(size_bytes)
changed = True
if mime_type and asset.mime_type != mime_type:
asset.mime_type = mime_type
changed = True
if changed:
out["asset_updated"] = True
# 2) AssetCacheState upsert by file_path (unique)
vals = {
"asset_id": asset.id,
"file_path": locator,
"mtime_ns": int(mtime_ns),
}
ins = (
sqlite.insert(AssetCacheState)
.values(**vals)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
)
res = session.execute(ins)
if int(res.rowcount or 0) > 0:
out["state_created"] = True
else:
upd = (
sa.update(AssetCacheState)
.where(AssetCacheState.file_path == locator)
.where(
sa.or_(
AssetCacheState.asset_id != asset.id,
AssetCacheState.mtime_ns.is_(None),
AssetCacheState.mtime_ns != int(mtime_ns),
)
)
.values(asset_id=asset.id, mtime_ns=int(mtime_ns))
)
res2 = session.execute(upd)
if int(res2.rowcount or 0) > 0:
out["state_updated"] = True
# 3) Optional AssetInfo + tags + metadata
if info_name:
try:
with session.begin_nested():
info = AssetInfo(
owner_id=owner_id,
name=info_name,
asset_id=asset.id,
preview_id=preview_id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(info)
session.flush()
out["asset_info_id"] = info.id
except IntegrityError:
pass
existing_info = (
session.execute(
select(AssetInfo)
.where(
AssetInfo.asset_id == asset.id,
AssetInfo.name == info_name,
(AssetInfo.owner_id == owner_id),
)
.limit(1)
)
).unique().scalar_one_or_none()
if not existing_info:
raise RuntimeError("Failed to update or insert AssetInfo.")
if preview_id and existing_info.preview_id != preview_id:
existing_info.preview_id = preview_id
existing_info.updated_at = now
if existing_info.last_access_time < now:
existing_info.last_access_time = now
session.flush()
out["asset_info_id"] = existing_info.id
norm = [t.strip().lower() for t in (tags or []) if (t or "").strip()]
if norm and out["asset_info_id"] is not None:
if not require_existing_tags:
ensure_tags_exist(session, norm, tag_type="user")
existing_tag_names = set(
name for (name,) in (session.execute(select(Tag.name).where(Tag.name.in_(norm)))).all()
)
missing = [t for t in norm if t not in existing_tag_names]
if missing and require_existing_tags:
raise ValueError(f"Unknown tags: {missing}")
existing_links = set(
tag_name
for (tag_name,) in (
session.execute(
select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == out["asset_info_id"])
)
).all()
)
to_add = [t for t in norm if t in existing_tag_names and t not in existing_links]
if to_add:
session.add_all(
[
AssetInfoTag(
asset_info_id=out["asset_info_id"],
tag_name=t,
origin=tag_origin,
added_at=now,
)
for t in to_add
]
)
session.flush()
# metadata["filename"] hack
if out["asset_info_id"] is not None:
primary_path = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id))
computed_filename = compute_relative_filename(primary_path) if primary_path else None
current_meta = existing_info.user_metadata or {}
new_meta = dict(current_meta)
if user_metadata is not None:
for k, v in user_metadata.items():
new_meta[k] = v
if computed_filename:
new_meta["filename"] = computed_filename
if new_meta != current_meta:
replace_asset_info_metadata_projection(
session,
asset_info_id=out["asset_info_id"],
user_metadata=new_meta,
)
try:
remove_missing_tag_for_asset_id(session, asset_id=asset.id)
except Exception:
logging.exception("Failed to clear 'missing' tag for asset %s", asset.id)
return out
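# Usage sketch (illustrative): register a file whose content hash has already been computed.
# `SessionLocal` is an assumed sessionmaker and `blake3_hex` an assumed precomputed digest;
# the "blake3:" prefix follows the format used elsewhere in this module.
def _example_ingest(SessionLocal, abs_path: str, blake3_hex: str) -> None:
    st = os.stat(abs_path)
    with SessionLocal() as session:
        out = ingest_fs_asset(
            session,
            asset_hash=f"blake3:{blake3_hex}",
            abs_path=abs_path,
            size_bytes=st.st_size,
            mtime_ns=st.st_mtime_ns,
            info_name=os.path.basename(abs_path),
            tags=["models", "checkpoints"],
        )
        session.commit()
        print(out["asset_created"], out["state_created"], out["asset_info_id"])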
def update_asset_info_full(
session: Session,
*,
asset_info_id: str,
name: str | None = None,
tags: Sequence[str] | None = None,
user_metadata: dict | None = None,
tag_origin: str = "manual",
asset_info_row: Any = None,
) -> AssetInfo:
if not asset_info_row:
info = session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
else:
info = asset_info_row
touched = False
if name is not None and name != info.name:
info.name = name
touched = True
computed_filename = None
try:
p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=info.asset_id))
if p:
computed_filename = compute_relative_filename(p)
except Exception:
computed_filename = None
if user_metadata is not None:
new_meta = dict(user_metadata)
if computed_filename:
new_meta["filename"] = computed_filename
replace_asset_info_metadata_projection(
session, asset_info_id=asset_info_id, user_metadata=new_meta
)
touched = True
else:
if computed_filename:
current_meta = info.user_metadata or {}
if current_meta.get("filename") != computed_filename:
new_meta = dict(current_meta)
new_meta["filename"] = computed_filename
replace_asset_info_metadata_projection(
session, asset_info_id=asset_info_id, user_metadata=new_meta
)
touched = True
if tags is not None:
set_asset_info_tags(
session,
asset_info_id=asset_info_id,
tags=tags,
origin=tag_origin,
)
touched = True
if touched and user_metadata is None:
info.updated_at = utcnow()
session.flush()
return info
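# Usage sketch (illustrative): rename an AssetInfo and replace its tags in one call.
# Passing user_metadata=None keeps existing metadata; only the computed "filename" entry
# may be refreshed. `SessionLocal` is an assumed sessionmaker.
def _example_rename(SessionLocal, asset_info_id: str) -> None:
    with SessionLocal() as session:
        update_asset_info_full(
            session,
            asset_info_id=asset_info_id,
            name="renamed-model.safetensors",
            tags=["models", "checkpoints", "renamed"],
        )
        session.commit()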
def delete_asset_info_by_id(
session: Session,
*,
asset_info_id: str,
owner_id: str,
) -> bool:
stmt = sa.delete(AssetInfo).where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
return int((session.execute(stmt)).rowcount or 0) > 0
def list_tags_with_usage(
session: Session,
prefix: str | None = None,
limit: int = 100,
offset: int = 0,
include_zero: bool = True,
order: str = "count_desc",
owner_id: str = "",
) -> tuple[list[tuple[str, str, int]], int]:
counts_sq = (
select(
AssetInfoTag.tag_name.label("tag_name"),
func.count(AssetInfoTag.asset_info_id).label("cnt"),
)
.select_from(AssetInfoTag)
.join(AssetInfo, AssetInfo.id == AssetInfoTag.asset_info_id)
.where(visible_owner_clause(owner_id))
.group_by(AssetInfoTag.tag_name)
.subquery()
)
q = (
select(
Tag.name,
Tag.tag_type,
func.coalesce(counts_sq.c.cnt, 0).label("count"),
)
.select_from(Tag)
.join(counts_sq, counts_sq.c.tag_name == Tag.name, isouter=True)
)
if prefix:
escaped, esc = escape_like_prefix(prefix.strip().lower())
q = q.where(Tag.name.like(escaped + "%", escape=esc))
if not include_zero:
q = q.where(func.coalesce(counts_sq.c.cnt, 0) > 0)
if order == "name_asc":
q = q.order_by(Tag.name.asc())
else:
q = q.order_by(func.coalesce(counts_sq.c.cnt, 0).desc(), Tag.name.asc())
total_q = select(func.count()).select_from(Tag)
if prefix:
escaped, esc = escape_like_prefix(prefix.strip().lower())
total_q = total_q.where(Tag.name.like(escaped + "%", escape=esc))
if not include_zero:
total_q = total_q.where(
Tag.name.in_(select(AssetInfoTag.tag_name).group_by(AssetInfoTag.tag_name))
)
rows = (session.execute(q.limit(limit).offset(offset))).all()
total = (session.execute(total_q)).scalar_one()
rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
return rows_norm, int(total or 0)
def ensure_tags_exist(session: Session, names: Iterable[str], tag_type: str = "user") -> None:
wanted = normalize_tags(list(names))
if not wanted:
return
rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
ins = (
sqlite.insert(Tag)
.values(rows)
.on_conflict_do_nothing(index_elements=[Tag.name])
)
session.execute(ins)
def get_asset_tags(session: Session, *, asset_info_id: str) -> list[str]:
return [
tag_name for (tag_name,) in (
session.execute(
select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
]
def add_tags_to_asset_info(
session: Session,
*,
asset_info_id: str,
tags: Sequence[str],
origin: str = "manual",
create_if_missing: bool = True,
asset_info_row: Any = None,
) -> dict:
if not asset_info_row:
info = session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
norm = normalize_tags(tags)
if not norm:
total = get_asset_tags(session, asset_info_id=asset_info_id)
return {"added": [], "already_present": [], "total_tags": total}
if create_if_missing:
ensure_tags_exist(session, norm, tag_type="user")
current = {
tag_name
for (tag_name,) in (
session.execute(
sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
}
want = set(norm)
to_add = sorted(want - current)
if to_add:
with session.begin_nested() as nested:
try:
session.add_all(
[
AssetInfoTag(
asset_info_id=asset_info_id,
tag_name=t,
origin=origin,
added_at=utcnow(),
)
for t in to_add
]
)
session.flush()
except IntegrityError:
nested.rollback()
after = set(get_asset_tags(session, asset_info_id=asset_info_id))
return {
"added": sorted(((after - current) & want)),
"already_present": sorted(want & current),
"total_tags": sorted(after),
}
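# Usage sketch (illustrative): additive tagging. Unlike set_asset_info_tags this never
# removes tags; names go through normalize_tags first, and the result separates newly
# added tags from ones already present. `SessionLocal` is assumed.
def _example_add_tags(SessionLocal, asset_info_id: str) -> None:
    with SessionLocal() as session:
        res = add_tags_to_asset_info(
            session, asset_info_id=asset_info_id, tags=["Favorite", "sdxl"]
        )
        session.commit()
        print(res["added"], res["already_present"], res["total_tags"])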
def remove_tags_from_asset_info(
session: Session,
*,
asset_info_id: str,
tags: Sequence[str],
) -> dict:
info = session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
norm = normalize_tags(tags)
if not norm:
total = get_asset_tags(session, asset_info_id=asset_info_id)
return {"removed": [], "not_present": [], "total_tags": total}
existing = {
tag_name
for (tag_name,) in (
session.execute(
sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
}
to_remove = sorted(set(t for t in norm if t in existing))
not_present = sorted(set(t for t in norm if t not in existing))
if to_remove:
session.execute(
delete(AssetInfoTag)
.where(
AssetInfoTag.asset_info_id == asset_info_id,
AssetInfoTag.tag_name.in_(to_remove),
)
)
session.flush()
total = get_asset_tags(session, asset_info_id=asset_info_id)
return {"removed": to_remove, "not_present": not_present, "total_tags": total}
def remove_missing_tag_for_asset_id(
session: Session,
*,
asset_id: str,
) -> None:
session.execute(
sa.delete(AssetInfoTag).where(
AssetInfoTag.asset_info_id.in_(sa.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)),
AssetInfoTag.tag_name == "missing",
)
)
def set_asset_info_preview(
session: Session,
*,
asset_info_id: str,
preview_asset_id: str | None = None,
) -> None:
"""Set or clear preview_id and bump updated_at. Raises on unknown IDs."""
info = session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if preview_asset_id is None:
info.preview_id = None
else:
# validate preview asset exists
if not session.get(Asset, preview_asset_id):
raise ValueError(f"Preview Asset {preview_asset_id} not found")
info.preview_id = preview_asset_id
info.updated_at = utcnow()
session.flush()


@ -1,57 +0,0 @@
from .content import (
check_fs_asset_exists_quick,
compute_hash_and_dedup_for_cache_state,
ingest_fs_asset,
list_cache_states_with_asset_under_prefixes,
list_unhashed_candidates_under_prefixes,
list_verify_candidates_under_prefixes,
redirect_all_references_then_delete_asset,
touch_asset_infos_by_fs_path,
)
from .info import (
add_tags_to_asset_info,
create_asset_info_for_existing_asset,
delete_asset_info_by_id,
fetch_asset_info_and_asset,
fetch_asset_info_asset_and_tags,
get_asset_tags,
list_asset_infos_page,
list_tags_with_usage,
remove_tags_from_asset_info,
replace_asset_info_metadata_projection,
set_asset_info_preview,
set_asset_info_tags,
touch_asset_info_by_id,
update_asset_info_full,
)
from .queries import (
asset_exists_by_hash,
asset_info_exists_for_asset_id,
get_asset_by_hash,
get_asset_info_by_id,
get_cache_state_by_asset_id,
list_cache_states_by_asset_id,
pick_best_live_path,
)
__all__ = [
# queries
"asset_exists_by_hash", "get_asset_by_hash", "get_asset_info_by_id", "asset_info_exists_for_asset_id",
"get_cache_state_by_asset_id",
"list_cache_states_by_asset_id",
"pick_best_live_path",
# info
"list_asset_infos_page", "create_asset_info_for_existing_asset", "set_asset_info_tags",
"update_asset_info_full", "replace_asset_info_metadata_projection",
"touch_asset_info_by_id", "delete_asset_info_by_id",
"add_tags_to_asset_info", "remove_tags_from_asset_info",
"get_asset_tags", "list_tags_with_usage", "set_asset_info_preview",
"fetch_asset_info_and_asset", "fetch_asset_info_asset_and_tags",
# content
"check_fs_asset_exists_quick",
"redirect_all_references_then_delete_asset",
"compute_hash_and_dedup_for_cache_state",
"list_unhashed_candidates_under_prefixes", "list_verify_candidates_under_prefixes",
"ingest_fs_asset", "touch_asset_infos_by_fs_path",
"list_cache_states_with_asset_under_prefixes",
]


@ -1,721 +0,0 @@
import contextlib
import logging
import os
from datetime import datetime
from typing import Any, Optional, Sequence, Union
import sqlalchemy as sa
from sqlalchemy import select
from sqlalchemy.dialects import postgresql as d_pg
from sqlalchemy.dialects import sqlite as d_sqlite
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import noload
from ..._helpers import compute_relative_filename
from ...storage import hashing as hashing_mod
from ..helpers import (
ensure_tags_exist,
escape_like_prefix,
remove_missing_tag_for_asset_id,
)
from ..models import Asset, AssetCacheState, AssetInfo, AssetInfoTag, Tag
from ..timeutil import utcnow
from .info import replace_asset_info_metadata_projection
from .queries import list_cache_states_by_asset_id, pick_best_live_path
async def check_fs_asset_exists_quick(
session: AsyncSession,
*,
file_path: str,
size_bytes: Optional[int] = None,
mtime_ns: Optional[int] = None,
) -> bool:
"""Returns True if we already track this absolute path with a HASHED asset and the cached mtime/size match."""
locator = os.path.abspath(file_path)
stmt = (
sa.select(sa.literal(True))
.select_from(AssetCacheState)
.join(Asset, Asset.id == AssetCacheState.asset_id)
.where(
AssetCacheState.file_path == locator,
Asset.hash.isnot(None),
AssetCacheState.needs_verify.is_(False),
)
.limit(1)
)
conds = []
if mtime_ns is not None:
conds.append(AssetCacheState.mtime_ns == int(mtime_ns))
if size_bytes is not None:
conds.append(sa.or_(Asset.size_bytes == 0, Asset.size_bytes == int(size_bytes)))
if conds:
stmt = stmt.where(*conds)
return (await session.execute(stmt)).first() is not None
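# Usage sketch (illustrative): skip re-hashing when the cached mtime/size still match the
# file on disk. `async_session` is an assumed async sessionmaker; the path is hypothetical.
async def _example_quick_check(async_session, abs_path: str) -> bool:
    st = os.stat(abs_path)
    async with async_session() as session:
        return await check_fs_asset_exists_quick(
            session,
            file_path=abs_path,
            size_bytes=st.st_size,
            mtime_ns=st.st_mtime_ns,
        )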
async def redirect_all_references_then_delete_asset(
session: AsyncSession,
*,
duplicate_asset_id: str,
canonical_asset_id: str,
) -> None:
"""
Safely migrate all references from duplicate_asset_id to canonical_asset_id.
- If an AssetInfo for (owner_id, name) already exists on the canonical asset,
merge tags, metadata, times, and preview, then delete the duplicate AssetInfo.
- Otherwise, simply repoint the AssetInfo.asset_id.
- Always retarget AssetCacheState rows.
- Finally delete the duplicate Asset row.
"""
if duplicate_asset_id == canonical_asset_id:
return
# 1) Migrate AssetInfo rows one-by-one to avoid UNIQUE conflicts.
dup_infos = (
await session.execute(
select(AssetInfo).options(noload(AssetInfo.tags)).where(AssetInfo.asset_id == duplicate_asset_id)
)
).unique().scalars().all()
for info in dup_infos:
# Try to find an existing collision on canonical
existing = (
await session.execute(
select(AssetInfo)
.options(noload(AssetInfo.tags))
.where(
AssetInfo.asset_id == canonical_asset_id,
AssetInfo.owner_id == info.owner_id,
AssetInfo.name == info.name,
)
.limit(1)
)
).unique().scalars().first()
if existing:
merged_meta = dict(existing.user_metadata or {})
other_meta = info.user_metadata or {}
for k, v in other_meta.items():
if k not in merged_meta:
merged_meta[k] = v
if merged_meta != (existing.user_metadata or {}):
await replace_asset_info_metadata_projection(
session,
asset_info_id=existing.id,
user_metadata=merged_meta,
)
existing_tags = {
t for (t,) in (
await session.execute(
select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == existing.id)
)
).all()
}
from_tags = {
t for (t,) in (
await session.execute(
select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == info.id)
)
).all()
}
to_add = sorted(from_tags - existing_tags)
if to_add:
await ensure_tags_exist(session, to_add, tag_type="user")
now = utcnow()
session.add_all([
AssetInfoTag(asset_info_id=existing.id, tag_name=t, origin="automatic", added_at=now)
for t in to_add
])
await session.flush()
if existing.preview_id is None and info.preview_id is not None:
existing.preview_id = info.preview_id
if info.last_access_time and (
existing.last_access_time is None or info.last_access_time > existing.last_access_time
):
existing.last_access_time = info.last_access_time
existing.updated_at = utcnow()
await session.flush()
# Delete the duplicate AssetInfo (cascades will clean its tags/meta)
await session.delete(info)
await session.flush()
else:
# Simple retarget
info.asset_id = canonical_asset_id
info.updated_at = utcnow()
await session.flush()
# 2) Repoint cache states and previews
await session.execute(
sa.update(AssetCacheState)
.where(AssetCacheState.asset_id == duplicate_asset_id)
.values(asset_id=canonical_asset_id)
)
await session.execute(
sa.update(AssetInfo)
.where(AssetInfo.preview_id == duplicate_asset_id)
.values(preview_id=canonical_asset_id)
)
# 3) Remove duplicate Asset
dup = await session.get(Asset, duplicate_asset_id)
if dup:
await session.delete(dup)
await session.flush()
async def compute_hash_and_dedup_for_cache_state(
session: AsyncSession,
*,
state_id: int,
) -> Optional[str]:
"""
Compute hash for the given cache state, deduplicate, and settle verify cases.
Returns the asset_id that this state ends up pointing to, or None if file disappeared.
"""
state = await session.get(AssetCacheState, state_id)
if not state:
return None
path = state.file_path
try:
if not os.path.isfile(path):
# File vanished: drop the state. If the Asset has hash=NULL and has no other states, drop the Asset too.
asset = await session.get(Asset, state.asset_id)
await session.delete(state)
await session.flush()
if asset and asset.hash is None:
remaining = (
await session.execute(
sa.select(sa.func.count())
.select_from(AssetCacheState)
.where(AssetCacheState.asset_id == asset.id)
)
).scalar_one()
if int(remaining or 0) == 0:
await session.delete(asset)
await session.flush()
else:
await _recompute_and_apply_filename_for_asset(session, asset_id=asset.id)
return None
digest = await hashing_mod.blake3_hash(path)
new_hash = f"blake3:{digest}"
st = os.stat(path, follow_symlinks=True)
new_size = int(st.st_size)
mtime_ns = getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000))
# Current asset of this state
this_asset = await session.get(Asset, state.asset_id)
# If the state got orphaned somehow (race), just reattach appropriately.
if not this_asset:
canonical = (
await session.execute(sa.select(Asset).where(Asset.hash == new_hash).limit(1))
).scalars().first()
if canonical:
state.asset_id = canonical.id
else:
now = utcnow()
new_asset = Asset(hash=new_hash, size_bytes=new_size, mime_type=None, created_at=now)
session.add(new_asset)
await session.flush()
state.asset_id = new_asset.id
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=state.asset_id)
await session.flush()
return state.asset_id
# 1) Seed asset case (hash is NULL): claim or merge into canonical
if this_asset.hash is None:
canonical = (
await session.execute(sa.select(Asset).where(Asset.hash == new_hash).limit(1))
).scalars().first()
if canonical and canonical.id != this_asset.id:
# Merge seed asset into canonical (safe, collision-aware)
await redirect_all_references_then_delete_asset(
session,
duplicate_asset_id=this_asset.id,
canonical_asset_id=canonical.id,
)
state = await session.get(AssetCacheState, state_id)
if state:
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=canonical.id)
await _recompute_and_apply_filename_for_asset(session, asset_id=canonical.id)
await session.flush()
return canonical.id
# No canonical: try to claim the hash; handle races with a SAVEPOINT
try:
async with session.begin_nested():
this_asset.hash = new_hash
if int(this_asset.size_bytes or 0) == 0 and new_size > 0:
this_asset.size_bytes = new_size
await session.flush()
except IntegrityError:
# Someone else claimed it concurrently; fetch canonical and merge
canonical = (
await session.execute(sa.select(Asset).where(Asset.hash == new_hash).limit(1))
).scalars().first()
if canonical and canonical.id != this_asset.id:
await redirect_all_references_then_delete_asset(
session,
duplicate_asset_id=this_asset.id,
canonical_asset_id=canonical.id,
)
state = await session.get(AssetCacheState, state_id)
if state:
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=canonical.id)
await _recompute_and_apply_filename_for_asset(session, asset_id=canonical.id)
await session.flush()
return canonical.id
# If we got here, the integrity error was not about hash uniqueness
raise
# Claimed successfully
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=this_asset.id)
await _recompute_and_apply_filename_for_asset(session, asset_id=this_asset.id)
await session.flush()
return this_asset.id
# 2) Verify case for hashed assets
if this_asset.hash == new_hash:
if int(this_asset.size_bytes or 0) == 0 and new_size > 0:
this_asset.size_bytes = new_size
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=this_asset.id)
await _recompute_and_apply_filename_for_asset(session, asset_id=this_asset.id)
await session.flush()
return this_asset.id
# Content changed on this path only: retarget THIS state, do not move AssetInfo rows
canonical = (
await session.execute(sa.select(Asset).where(Asset.hash == new_hash).limit(1))
).scalars().first()
if canonical:
target_id = canonical.id
else:
now = utcnow()
new_asset = Asset(hash=new_hash, size_bytes=new_size, mime_type=None, created_at=now)
session.add(new_asset)
await session.flush()
target_id = new_asset.id
state.asset_id = target_id
state.mtime_ns = mtime_ns
state.needs_verify = False
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(session, asset_id=target_id)
await _recompute_and_apply_filename_for_asset(session, asset_id=target_id)
await session.flush()
return target_id
except Exception:
raise
async def list_unhashed_candidates_under_prefixes(session: AsyncSession, *, prefixes: list[str]) -> list[int]:
if not prefixes:
return []
conds = []
for p in prefixes:
base = os.path.abspath(p)
if not base.endswith(os.sep):
base += os.sep
escaped, esc = escape_like_prefix(base)
conds.append(AssetCacheState.file_path.like(escaped + "%", escape=esc))
path_filter = sa.or_(*conds) if len(conds) > 1 else conds[0]
if session.bind.dialect.name == "postgresql":
stmt = (
sa.select(AssetCacheState.id)
.join(Asset, Asset.id == AssetCacheState.asset_id)
.where(Asset.hash.is_(None), path_filter)
.order_by(AssetCacheState.asset_id.asc(), AssetCacheState.id.asc())
.distinct(AssetCacheState.asset_id)
)
else:
first_id = sa.func.min(AssetCacheState.id).label("first_id")
stmt = (
sa.select(first_id)
.join(Asset, Asset.id == AssetCacheState.asset_id)
.where(Asset.hash.is_(None), path_filter)
.group_by(AssetCacheState.asset_id)
.order_by(first_id.asc())
)
return [int(x) for x in (await session.execute(stmt)).scalars().all()]
async def list_verify_candidates_under_prefixes(
session: AsyncSession, *, prefixes: Sequence[str]
) -> Union[list[int], Sequence[int]]:
if not prefixes:
return []
conds = []
for p in prefixes:
base = os.path.abspath(p)
if not base.endswith(os.sep):
base += os.sep
escaped, esc = escape_like_prefix(base)
conds.append(AssetCacheState.file_path.like(escaped + "%", escape=esc))
return (
await session.execute(
sa.select(AssetCacheState.id)
.where(AssetCacheState.needs_verify.is_(True))
.where(sa.or_(*conds))
.order_by(AssetCacheState.id.asc())
)
).scalars().all()
async def ingest_fs_asset(
session: AsyncSession,
*,
asset_hash: str,
abs_path: str,
size_bytes: int,
mtime_ns: int,
mime_type: Optional[str] = None,
info_name: Optional[str] = None,
owner_id: str = "",
preview_id: Optional[str] = None,
user_metadata: Optional[dict] = None,
tags: Sequence[str] = (),
tag_origin: str = "manual",
require_existing_tags: bool = False,
) -> dict:
"""
Idempotently upsert:
- Asset by content hash (create if missing)
- AssetCacheState(file_path) pointing to asset_id
- Optionally AssetInfo + tag links and metadata projection
Returns flags and ids.
"""
locator = os.path.abspath(abs_path)
now = utcnow()
dialect = session.bind.dialect.name
if preview_id:
if not await session.get(Asset, preview_id):
preview_id = None
out: dict[str, Any] = {
"asset_created": False,
"asset_updated": False,
"state_created": False,
"state_updated": False,
"asset_info_id": None,
}
# 1) Asset by hash
asset = (
await session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1))
).scalars().first()
if not asset:
vals = {
"hash": asset_hash,
"size_bytes": int(size_bytes),
"mime_type": mime_type,
"created_at": now,
}
if dialect == "sqlite":
res = await session.execute(
d_sqlite.insert(Asset)
.values(**vals)
.on_conflict_do_nothing(index_elements=[Asset.hash])
)
if int(res.rowcount or 0) > 0:
out["asset_created"] = True
asset = (
await session.execute(
select(Asset).where(Asset.hash == asset_hash).limit(1)
)
).scalars().first()
elif dialect == "postgresql":
res = await session.execute(
d_pg.insert(Asset)
.values(**vals)
.on_conflict_do_nothing(
index_elements=[Asset.hash],
index_where=Asset.__table__.c.hash.isnot(None),
)
.returning(Asset.id)
)
inserted_id = res.scalar_one_or_none()
if inserted_id:
out["asset_created"] = True
asset = await session.get(Asset, inserted_id)
else:
asset = (
await session.execute(
select(Asset).where(Asset.hash == asset_hash).limit(1)
)
).scalars().first()
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
if not asset:
raise RuntimeError("Asset row not found after upsert.")
else:
changed = False
if asset.size_bytes != int(size_bytes) and int(size_bytes) > 0:
asset.size_bytes = int(size_bytes)
changed = True
if mime_type and asset.mime_type != mime_type:
asset.mime_type = mime_type
changed = True
if changed:
out["asset_updated"] = True
# 2) AssetCacheState upsert by file_path (unique)
vals = {
"asset_id": asset.id,
"file_path": locator,
"mtime_ns": int(mtime_ns),
}
if dialect == "sqlite":
ins = (
d_sqlite.insert(AssetCacheState)
.values(**vals)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
)
elif dialect == "postgresql":
ins = (
d_pg.insert(AssetCacheState)
.values(**vals)
.on_conflict_do_nothing(index_elements=[AssetCacheState.file_path])
)
else:
raise NotImplementedError(f"Unsupported database dialect: {dialect}")
res = await session.execute(ins)
if int(res.rowcount or 0) > 0:
out["state_created"] = True
else:
upd = (
sa.update(AssetCacheState)
.where(AssetCacheState.file_path == locator)
.where(
sa.or_(
AssetCacheState.asset_id != asset.id,
AssetCacheState.mtime_ns.is_(None),
AssetCacheState.mtime_ns != int(mtime_ns),
)
)
.values(asset_id=asset.id, mtime_ns=int(mtime_ns))
)
res2 = await session.execute(upd)
if int(res2.rowcount or 0) > 0:
out["state_updated"] = True
# 3) Optional AssetInfo + tags + metadata
if info_name:
try:
async with session.begin_nested():
info = AssetInfo(
owner_id=owner_id,
name=info_name,
asset_id=asset.id,
preview_id=preview_id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(info)
await session.flush()
out["asset_info_id"] = info.id
except IntegrityError:
pass
existing_info = (
await session.execute(
select(AssetInfo)
.where(
AssetInfo.asset_id == asset.id,
AssetInfo.name == info_name,
(AssetInfo.owner_id == owner_id),
)
.limit(1)
)
).unique().scalar_one_or_none()
if not existing_info:
raise RuntimeError("Failed to update or insert AssetInfo.")
if preview_id and existing_info.preview_id != preview_id:
existing_info.preview_id = preview_id
existing_info.updated_at = now
if existing_info.last_access_time < now:
existing_info.last_access_time = now
await session.flush()
out["asset_info_id"] = existing_info.id
norm = [t.strip().lower() for t in (tags or []) if (t or "").strip()]
if norm and out["asset_info_id"] is not None:
if not require_existing_tags:
await ensure_tags_exist(session, norm, tag_type="user")
existing_tag_names = set(
name for (name,) in (await session.execute(select(Tag.name).where(Tag.name.in_(norm)))).all()
)
missing = [t for t in norm if t not in existing_tag_names]
if missing and require_existing_tags:
raise ValueError(f"Unknown tags: {missing}")
existing_links = set(
tag_name
for (tag_name,) in (
await session.execute(
select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == out["asset_info_id"])
)
).all()
)
to_add = [t for t in norm if t in existing_tag_names and t not in existing_links]
if to_add:
session.add_all(
[
AssetInfoTag(
asset_info_id=out["asset_info_id"],
tag_name=t,
origin=tag_origin,
added_at=now,
)
for t in to_add
]
)
await session.flush()
# metadata["filename"] hack
if out["asset_info_id"] is not None:
primary_path = pick_best_live_path(await list_cache_states_by_asset_id(session, asset_id=asset.id))
computed_filename = compute_relative_filename(primary_path) if primary_path else None
current_meta = existing_info.user_metadata or {}
new_meta = dict(current_meta)
if user_metadata is not None:
for k, v in user_metadata.items():
new_meta[k] = v
if computed_filename:
new_meta["filename"] = computed_filename
if new_meta != current_meta:
await replace_asset_info_metadata_projection(
session,
asset_info_id=out["asset_info_id"],
user_metadata=new_meta,
)
try:
await remove_missing_tag_for_asset_id(session, asset_id=asset.id)
except Exception:
logging.exception("Failed to clear 'missing' tag for asset %s", asset.id)
return out
async def touch_asset_infos_by_fs_path(
session: AsyncSession,
*,
file_path: str,
ts: Optional[datetime] = None,
only_if_newer: bool = True,
) -> None:
locator = os.path.abspath(file_path)
ts = ts or utcnow()
stmt = sa.update(AssetInfo).where(
sa.exists(
sa.select(sa.literal(1))
.select_from(AssetCacheState)
.where(
AssetCacheState.asset_id == AssetInfo.asset_id,
AssetCacheState.file_path == locator,
)
)
)
if only_if_newer:
stmt = stmt.where(
sa.or_(
AssetInfo.last_access_time.is_(None),
AssetInfo.last_access_time < ts,
)
)
await session.execute(stmt.values(last_access_time=ts))
async def list_cache_states_with_asset_under_prefixes(
session: AsyncSession,
*,
prefixes: Sequence[str],
) -> list[tuple[AssetCacheState, Optional[str], int]]:
"""Return (AssetCacheState, asset_hash, size_bytes) for rows under any prefix."""
if not prefixes:
return []
conds = []
for p in prefixes:
if not p:
continue
base = os.path.abspath(p)
if not base.endswith(os.sep):
base = base + os.sep
escaped, esc = escape_like_prefix(base)
conds.append(AssetCacheState.file_path.like(escaped + "%", escape=esc))
if not conds:
return []
rows = (
await session.execute(
select(AssetCacheState, Asset.hash, Asset.size_bytes)
.join(Asset, Asset.id == AssetCacheState.asset_id)
.where(sa.or_(*conds))
.order_by(AssetCacheState.id.asc())
)
).all()
return [(r[0], r[1], int(r[2] or 0)) for r in rows]
async def _recompute_and_apply_filename_for_asset(session: AsyncSession, *, asset_id: str) -> None:
"""Compute filename from the first *existing* cache state path and apply it to all AssetInfo (if changed)."""
try:
primary_path = pick_best_live_path(await list_cache_states_by_asset_id(session, asset_id=asset_id))
if not primary_path:
return
new_filename = compute_relative_filename(primary_path)
if not new_filename:
return
infos = (
await session.execute(select(AssetInfo).where(AssetInfo.asset_id == asset_id))
).scalars().all()
for info in infos:
current_meta = info.user_metadata or {}
if current_meta.get("filename") == new_filename:
continue
updated = dict(current_meta)
updated["filename"] = new_filename
await replace_asset_info_metadata_projection(session, asset_info_id=info.id, user_metadata=updated)
except Exception:
logging.exception("Failed to recompute filename metadata for asset %s", asset_id)


@ -1,586 +0,0 @@
from collections import defaultdict
from datetime import datetime
from typing import Any, Optional, Sequence
import sqlalchemy as sa
from sqlalchemy import delete, func, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import contains_eager, noload
from ..._helpers import compute_relative_filename, normalize_tags
from ..helpers import (
apply_metadata_filter,
apply_tag_filters,
ensure_tags_exist,
escape_like_prefix,
project_kv,
visible_owner_clause,
)
from ..models import Asset, AssetInfo, AssetInfoMeta, AssetInfoTag, Tag
from ..timeutil import utcnow
from .queries import (
get_asset_by_hash,
list_cache_states_by_asset_id,
pick_best_live_path,
)
async def list_asset_infos_page(
session: AsyncSession,
*,
owner_id: str = "",
include_tags: Optional[Sequence[str]] = None,
exclude_tags: Optional[Sequence[str]] = None,
name_contains: Optional[str] = None,
metadata_filter: Optional[dict] = None,
limit: int = 20,
offset: int = 0,
sort: str = "created_at",
order: str = "desc",
) -> tuple[list[AssetInfo], dict[str, list[str]], int]:
base = (
select(AssetInfo)
.join(Asset, Asset.id == AssetInfo.asset_id)
.options(contains_eager(AssetInfo.asset), noload(AssetInfo.tags))
.where(visible_owner_clause(owner_id))
)
if name_contains:
escaped, esc = escape_like_prefix(name_contains)
base = base.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
base = apply_tag_filters(base, include_tags, exclude_tags)
base = apply_metadata_filter(base, metadata_filter)
sort = (sort or "created_at").lower()
order = (order or "desc").lower()
sort_map = {
"name": AssetInfo.name,
"created_at": AssetInfo.created_at,
"updated_at": AssetInfo.updated_at,
"last_access_time": AssetInfo.last_access_time,
"size": Asset.size_bytes,
}
sort_col = sort_map.get(sort, AssetInfo.created_at)
sort_exp = sort_col.desc() if order == "desc" else sort_col.asc()
base = base.order_by(sort_exp).limit(limit).offset(offset)
count_stmt = (
select(func.count())
.select_from(AssetInfo)
.join(Asset, Asset.id == AssetInfo.asset_id)
.where(visible_owner_clause(owner_id))
)
if name_contains:
escaped, esc = escape_like_prefix(name_contains)
count_stmt = count_stmt.where(AssetInfo.name.ilike(f"%{escaped}%", escape=esc))
count_stmt = apply_tag_filters(count_stmt, include_tags, exclude_tags)
count_stmt = apply_metadata_filter(count_stmt, metadata_filter)
total = int((await session.execute(count_stmt)).scalar_one() or 0)
infos = (await session.execute(base)).unique().scalars().all()
id_list: list[str] = [i.id for i in infos]
tag_map: dict[str, list[str]] = defaultdict(list)
if id_list:
rows = await session.execute(
select(AssetInfoTag.asset_info_id, Tag.name)
.join(Tag, Tag.name == AssetInfoTag.tag_name)
.where(AssetInfoTag.asset_info_id.in_(id_list))
)
for aid, tag_name in rows.all():
tag_map[aid].append(tag_name)
return infos, tag_map, total
async def fetch_asset_info_and_asset(
session: AsyncSession,
*,
asset_info_id: str,
owner_id: str = "",
) -> Optional[tuple[AssetInfo, Asset]]:
stmt = (
select(AssetInfo, Asset)
.join(Asset, Asset.id == AssetInfo.asset_id)
.where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
.limit(1)
.options(noload(AssetInfo.tags))
)
row = await session.execute(stmt)
pair = row.first()
if not pair:
return None
return pair[0], pair[1]
async def fetch_asset_info_asset_and_tags(
session: AsyncSession,
*,
asset_info_id: str,
owner_id: str = "",
) -> Optional[tuple[AssetInfo, Asset, list[str]]]:
stmt = (
select(AssetInfo, Asset, Tag.name)
.join(Asset, Asset.id == AssetInfo.asset_id)
.join(AssetInfoTag, AssetInfoTag.asset_info_id == AssetInfo.id, isouter=True)
.join(Tag, Tag.name == AssetInfoTag.tag_name, isouter=True)
.where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
.options(noload(AssetInfo.tags))
.order_by(Tag.name.asc())
)
rows = (await session.execute(stmt)).all()
if not rows:
return None
first_info, first_asset, _ = rows[0]
tags: list[str] = []
seen: set[str] = set()
for _info, _asset, tag_name in rows:
if tag_name and tag_name not in seen:
seen.add(tag_name)
tags.append(tag_name)
return first_info, first_asset, tags
async def create_asset_info_for_existing_asset(
session: AsyncSession,
*,
asset_hash: str,
name: str,
user_metadata: Optional[dict] = None,
tags: Optional[Sequence[str]] = None,
tag_origin: str = "manual",
owner_id: str = "",
) -> AssetInfo:
"""Create or return an existing AssetInfo for an Asset identified by asset_hash."""
now = utcnow()
asset = await get_asset_by_hash(session, asset_hash=asset_hash)
if not asset:
raise ValueError(f"Unknown asset hash {asset_hash}")
info = AssetInfo(
owner_id=owner_id,
name=name,
asset_id=asset.id,
preview_id=None,
created_at=now,
updated_at=now,
last_access_time=now,
)
try:
async with session.begin_nested():
session.add(info)
await session.flush()
except IntegrityError:
existing = (
await session.execute(
select(AssetInfo)
.options(noload(AssetInfo.tags))
.where(
AssetInfo.asset_id == asset.id,
AssetInfo.name == name,
AssetInfo.owner_id == owner_id,
)
.limit(1)
)
).unique().scalars().first()
if not existing:
raise RuntimeError("AssetInfo upsert failed to find existing row after conflict.")
return existing
# metadata["filename"] hack
new_meta = dict(user_metadata or {})
computed_filename = None
try:
p = pick_best_live_path(await list_cache_states_by_asset_id(session, asset_id=asset.id))
if p:
computed_filename = compute_relative_filename(p)
except Exception:
computed_filename = None
if computed_filename:
new_meta["filename"] = computed_filename
if new_meta:
await replace_asset_info_metadata_projection(
session,
asset_info_id=info.id,
user_metadata=new_meta,
)
if tags is not None:
await set_asset_info_tags(
session,
asset_info_id=info.id,
tags=tags,
origin=tag_origin,
)
return info
async def set_asset_info_tags(
session: AsyncSession,
*,
asset_info_id: str,
tags: Sequence[str],
origin: str = "manual",
) -> dict:
desired = normalize_tags(tags)
current = set(
tag_name for (tag_name,) in (
await session.execute(select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id))
).all()
)
to_add = [t for t in desired if t not in current]
to_remove = [t for t in current if t not in desired]
if to_add:
await ensure_tags_exist(session, to_add, tag_type="user")
session.add_all([
AssetInfoTag(asset_info_id=asset_info_id, tag_name=t, origin=origin, added_at=utcnow())
for t in to_add
])
await session.flush()
if to_remove:
await session.execute(
delete(AssetInfoTag)
.where(AssetInfoTag.asset_info_id == asset_info_id, AssetInfoTag.tag_name.in_(to_remove))
)
await session.flush()
return {"added": to_add, "removed": to_remove, "total": desired}
async def update_asset_info_full(
session: AsyncSession,
*,
asset_info_id: str,
name: Optional[str] = None,
tags: Optional[Sequence[str]] = None,
user_metadata: Optional[dict] = None,
tag_origin: str = "manual",
asset_info_row: Any = None,
) -> AssetInfo:
if not asset_info_row:
info = await session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
else:
info = asset_info_row
touched = False
if name is not None and name != info.name:
info.name = name
touched = True
computed_filename = None
try:
p = pick_best_live_path(await list_cache_states_by_asset_id(session, asset_id=info.asset_id))
if p:
computed_filename = compute_relative_filename(p)
except Exception:
computed_filename = None
if user_metadata is not None:
new_meta = dict(user_metadata)
if computed_filename:
new_meta["filename"] = computed_filename
await replace_asset_info_metadata_projection(
session, asset_info_id=asset_info_id, user_metadata=new_meta
)
touched = True
else:
if computed_filename:
current_meta = info.user_metadata or {}
if current_meta.get("filename") != computed_filename:
new_meta = dict(current_meta)
new_meta["filename"] = computed_filename
await replace_asset_info_metadata_projection(
session, asset_info_id=asset_info_id, user_metadata=new_meta
)
touched = True
if tags is not None:
await set_asset_info_tags(
session,
asset_info_id=asset_info_id,
tags=tags,
origin=tag_origin,
)
touched = True
if touched and user_metadata is None:
info.updated_at = utcnow()
await session.flush()
return info
async def replace_asset_info_metadata_projection(
session: AsyncSession,
*,
asset_info_id: str,
user_metadata: Optional[dict],
) -> None:
info = await session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
info.user_metadata = user_metadata or {}
info.updated_at = utcnow()
await session.flush()
await session.execute(delete(AssetInfoMeta).where(AssetInfoMeta.asset_info_id == asset_info_id))
await session.flush()
if not user_metadata:
return
rows: list[AssetInfoMeta] = []
for k, v in user_metadata.items():
for r in project_kv(k, v):
rows.append(
AssetInfoMeta(
asset_info_id=asset_info_id,
key=r["key"],
ordinal=int(r["ordinal"]),
val_str=r.get("val_str"),
val_num=r.get("val_num"),
val_bool=r.get("val_bool"),
val_json=r.get("val_json"),
)
)
if rows:
session.add_all(rows)
await session.flush()
async def touch_asset_info_by_id(
session: AsyncSession,
*,
asset_info_id: str,
ts: Optional[datetime] = None,
only_if_newer: bool = True,
) -> None:
ts = ts or utcnow()
stmt = sa.update(AssetInfo).where(AssetInfo.id == asset_info_id)
if only_if_newer:
stmt = stmt.where(
sa.or_(AssetInfo.last_access_time.is_(None), AssetInfo.last_access_time < ts)
)
await session.execute(stmt.values(last_access_time=ts))
async def delete_asset_info_by_id(session: AsyncSession, *, asset_info_id: str, owner_id: str) -> bool:
stmt = sa.delete(AssetInfo).where(
AssetInfo.id == asset_info_id,
visible_owner_clause(owner_id),
)
return int((await session.execute(stmt)).rowcount or 0) > 0
async def add_tags_to_asset_info(
session: AsyncSession,
*,
asset_info_id: str,
tags: Sequence[str],
origin: str = "manual",
create_if_missing: bool = True,
asset_info_row: Any = None,
) -> dict:
if not asset_info_row:
info = await session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
norm = normalize_tags(tags)
if not norm:
total = await get_asset_tags(session, asset_info_id=asset_info_id)
return {"added": [], "already_present": [], "total_tags": total}
if create_if_missing:
await ensure_tags_exist(session, norm, tag_type="user")
current = {
tag_name
for (tag_name,) in (
await session.execute(
sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
}
want = set(norm)
to_add = sorted(want - current)
if to_add:
async with session.begin_nested() as nested:
try:
session.add_all(
[
AssetInfoTag(
asset_info_id=asset_info_id,
tag_name=t,
origin=origin,
added_at=utcnow(),
)
for t in to_add
]
)
await session.flush()
except IntegrityError:
await nested.rollback()
after = set(await get_asset_tags(session, asset_info_id=asset_info_id))
return {
"added": sorted(((after - current) & want)),
"already_present": sorted(want & current),
"total_tags": sorted(after),
}
async def remove_tags_from_asset_info(
session: AsyncSession,
*,
asset_info_id: str,
tags: Sequence[str],
) -> dict:
info = await session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
norm = normalize_tags(tags)
if not norm:
total = await get_asset_tags(session, asset_info_id=asset_info_id)
return {"removed": [], "not_present": [], "total_tags": total}
existing = {
tag_name
for (tag_name,) in (
await session.execute(
sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
}
to_remove = sorted(set(t for t in norm if t in existing))
not_present = sorted(set(t for t in norm if t not in existing))
if to_remove:
await session.execute(
delete(AssetInfoTag)
.where(
AssetInfoTag.asset_info_id == asset_info_id,
AssetInfoTag.tag_name.in_(to_remove),
)
)
await session.flush()
total = await get_asset_tags(session, asset_info_id=asset_info_id)
return {"removed": to_remove, "not_present": not_present, "total_tags": total}
async def list_tags_with_usage(
session: AsyncSession,
*,
prefix: Optional[str] = None,
limit: int = 100,
offset: int = 0,
include_zero: bool = True,
order: str = "count_desc",
owner_id: str = "",
) -> tuple[list[tuple[str, str, int]], int]:
counts_sq = (
select(
AssetInfoTag.tag_name.label("tag_name"),
func.count(AssetInfoTag.asset_info_id).label("cnt"),
)
.select_from(AssetInfoTag)
.join(AssetInfo, AssetInfo.id == AssetInfoTag.asset_info_id)
.where(visible_owner_clause(owner_id))
.group_by(AssetInfoTag.tag_name)
.subquery()
)
q = (
select(
Tag.name,
Tag.tag_type,
func.coalesce(counts_sq.c.cnt, 0).label("count"),
)
.select_from(Tag)
.join(counts_sq, counts_sq.c.tag_name == Tag.name, isouter=True)
)
if prefix:
escaped, esc = escape_like_prefix(prefix.strip().lower())
q = q.where(Tag.name.like(escaped + "%", escape=esc))
if not include_zero:
q = q.where(func.coalesce(counts_sq.c.cnt, 0) > 0)
if order == "name_asc":
q = q.order_by(Tag.name.asc())
else:
q = q.order_by(func.coalesce(counts_sq.c.cnt, 0).desc(), Tag.name.asc())
total_q = select(func.count()).select_from(Tag)
if prefix:
escaped, esc = escape_like_prefix(prefix.strip().lower())
total_q = total_q.where(Tag.name.like(escaped + "%", escape=esc))
if not include_zero:
total_q = total_q.where(
Tag.name.in_(select(AssetInfoTag.tag_name).group_by(AssetInfoTag.tag_name))
)
rows = (await session.execute(q.limit(limit).offset(offset))).all()
total = (await session.execute(total_q)).scalar_one()
rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
return rows_norm, int(total or 0)
async def get_asset_tags(session: AsyncSession, *, asset_info_id: str) -> list[str]:
return [
tag_name
for (tag_name,) in (
await session.execute(
sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)
)
).all()
]
async def set_asset_info_preview(
session: AsyncSession,
*,
asset_info_id: str,
preview_asset_id: Optional[str],
) -> None:
"""Set or clear preview_id and bump updated_at. Raises on unknown IDs."""
info = await session.get(AssetInfo, asset_info_id)
if not info:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if preview_asset_id is None:
info.preview_id = None
else:
# validate preview asset exists
if not await session.get(Asset, preview_asset_id):
raise ValueError(f"Preview Asset {preview_asset_id} not found")
info.preview_id = preview_asset_id
info.updated_at = utcnow()
await session.flush()


@ -1,76 +0,0 @@
import os
from typing import Optional, Sequence, Union
import sqlalchemy as sa
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..models import Asset, AssetCacheState, AssetInfo
async def asset_exists_by_hash(session: AsyncSession, *, asset_hash: str) -> bool:
row = (
await session.execute(
select(sa.literal(True)).select_from(Asset).where(Asset.hash == asset_hash).limit(1)
)
).first()
return row is not None
async def get_asset_by_hash(session: AsyncSession, *, asset_hash: str) -> Optional[Asset]:
return (
await session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1))
).scalars().first()
async def get_asset_info_by_id(session: AsyncSession, *, asset_info_id: str) -> Optional[AssetInfo]:
return await session.get(AssetInfo, asset_info_id)
async def asset_info_exists_for_asset_id(session: AsyncSession, *, asset_id: str) -> bool:
q = (
select(sa.literal(True))
.select_from(AssetInfo)
.where(AssetInfo.asset_id == asset_id)
.limit(1)
)
return (await session.execute(q)).first() is not None
async def get_cache_state_by_asset_id(session: AsyncSession, *, asset_id: str) -> Optional[AssetCacheState]:
return (
await session.execute(
select(AssetCacheState)
.where(AssetCacheState.asset_id == asset_id)
.order_by(AssetCacheState.id.asc())
.limit(1)
)
).scalars().first()
async def list_cache_states_by_asset_id(
session: AsyncSession, *, asset_id: str
) -> Union[list[AssetCacheState], Sequence[AssetCacheState]]:
return (
await session.execute(
select(AssetCacheState)
.where(AssetCacheState.asset_id == asset_id)
.order_by(AssetCacheState.id.asc())
)
).scalars().all()
def pick_best_live_path(states: Union[list[AssetCacheState], Sequence[AssetCacheState]]) -> str:
"""
Return the best on-disk path among cache states:
1) Prefer a path that exists with needs_verify == False (already verified).
2) Otherwise, pick the first path that exists.
3) Otherwise return empty string.
"""
alive = [s for s in states if getattr(s, "file_path", None) and os.path.isfile(s.file_path)]
if not alive:
return ""
for s in alive:
if not getattr(s, "needs_verify", False):
return s.file_path
return alive[0].file_path
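# Usage sketch (illustrative): resolve the best on-disk path for an asset id.
# `async_session` is an assumed async sessionmaker; an empty string means no cached path
# still exists on disk.
async def _example_best_path(async_session, asset_id: str) -> str:
    async with async_session() as session:
        states = await list_cache_states_by_asset_id(session, asset_id=asset_id)
        return pick_best_live_path(states)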


@ -0,0 +1,62 @@
from typing import Iterable
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy.dialects import sqlite
from app.assets.helpers import normalize_tags, utcnow
from app.assets.database.models import Tag, AssetInfoTag, AssetInfo
def ensure_tags_exist(session: Session, names: Iterable[str], tag_type: str = "user") -> None:
wanted = normalize_tags(list(names))
if not wanted:
return
rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
ins = (
sqlite.insert(Tag)
.values(rows)
.on_conflict_do_nothing(index_elements=[Tag.name])
)
session.execute(ins)
def add_missing_tag_for_asset_id(
session: Session,
*,
asset_id: str,
origin: str = "automatic",
) -> None:
select_rows = (
sqlalchemy.select(
AssetInfo.id.label("asset_info_id"),
sqlalchemy.literal("missing").label("tag_name"),
sqlalchemy.literal(origin).label("origin"),
sqlalchemy.literal(utcnow()).label("added_at"),
)
.where(AssetInfo.asset_id == asset_id)
.where(
sqlalchemy.not_(
sqlalchemy.exists().where((AssetInfoTag.asset_info_id == AssetInfo.id) & (AssetInfoTag.tag_name == "missing"))
)
)
)
session.execute(
sqlite.insert(AssetInfoTag)
.from_select(
["asset_info_id", "tag_name", "origin", "added_at"],
select_rows,
)
.on_conflict_do_nothing(index_elements=[AssetInfoTag.asset_info_id, AssetInfoTag.tag_name])
)
def remove_missing_tag_for_asset_id(
session: Session,
*,
asset_id: str,
) -> None:
session.execute(
sqlalchemy.delete(AssetInfoTag).where(
AssetInfoTag.asset_info_id.in_(sqlalchemy.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)),
AssetInfoTag.tag_name == "missing",
)
)
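# Usage sketch (illustrative): a scanner might pair these helpers, tagging every AssetInfo
# of an asset as "missing" when its files cannot be found and clearing the tag once a path
# is seen again. `file_found` is an assumed flag supplied by the caller.
def _example_mark_and_clear(session: Session, asset_id: str, file_found: bool) -> None:
    if file_found:
        remove_missing_tag_for_asset_id(session, asset_id=asset_id)
    else:
        add_missing_tag_for_asset_id(session, asset_id=asset_id)
    session.flush()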


@ -1,6 +0,0 @@
from datetime import datetime, timezone
def utcnow() -> datetime:
"""Naive UTC timestamp (no tzinfo). We always treat DB datetimes as UTC."""
return datetime.now(timezone.utc).replace(tzinfo=None)


@ -1,26 +1,65 @@
import asyncio
import os
from typing import IO, Union
from blake3 import blake3
DEFAULT_CHUNK = 8 * 1024 * 1024 # 8 MiB
from typing import IO
import os
import asyncio
def _hash_file_obj_sync(file_obj: IO[bytes], chunk_size: int) -> str:
"""Hash an already-open binary file object by streaming in chunks.
DEFAULT_CHUNK = 8 * 1024 *1024 # 8MB
# NOTE: accepts either a filesystem path or an already-open binary file object
def blake3_hash(
fp: str | IO[bytes],
chunk_size: int = DEFAULT_CHUNK,
) -> str:
"""
Returns a BLAKE3 hex digest for ``fp``, which may be:
- a filename (str/bytes) or PathLike
- an open binary file object
If ``fp`` is a file object, it must be opened in **binary** mode and support
``read``, ``seek``, and ``tell``. The function will seek to the start before
reading and will attempt to restore the original position afterward.
"""
# duck typing to check if input is a file-like object
if hasattr(fp, "read"):
return _hash_file_obj(fp, chunk_size)
with open(os.fspath(fp), "rb") as f:
return _hash_file_obj(f, chunk_size)
async def blake3_hash_async(
fp: str | IO[bytes],
chunk_size: int = DEFAULT_CHUNK,
) -> str:
"""Async wrapper for ``blake3_hash_sync``.
Uses a worker thread so the event loop remains responsive.
"""
# If it is a path, open inside the worker thread to keep I/O off the loop.
if hasattr(fp, "read"):
return await asyncio.to_thread(blake3_hash, fp, chunk_size)
def _worker() -> str:
with open(os.fspath(fp), "rb") as f:
return _hash_file_obj(f, chunk_size)
return await asyncio.to_thread(_worker)
def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
"""
Hash an already-open binary file object by streaming in chunks.
- Seeks to the beginning before reading (if supported).
- Restores the original position afterward (if tell/seek are supported).
"""
if chunk_size <= 0:
chunk_size = DEFAULT_CHUNK
orig_pos = None
if hasattr(file_obj, "tell"):
orig_pos = file_obj.tell()
# in case the file object is already open and not at the beginning, record the position so it can be restored after hashing
orig_pos = file_obj.tell()
try:
if hasattr(file_obj, "seek"):
# seek to the beginning before reading
if orig_pos != 0:
file_obj.seek(0)
h = blake3()
@ -31,42 +70,6 @@ def _hash_file_obj_sync(file_obj: IO[bytes], chunk_size: int) -> str:
h.update(chunk)
return h.hexdigest()
finally:
if hasattr(file_obj, "seek") and orig_pos is not None:
# restore original position in file object, if needed
if orig_pos != 0:
file_obj.seek(orig_pos)
def blake3_hash_sync(
fp: Union[str, bytes, os.PathLike[str], os.PathLike[bytes], IO[bytes]],
chunk_size: int = DEFAULT_CHUNK,
) -> str:
"""Returns a BLAKE3 hex digest for ``fp``, which may be:
- a filename (str/bytes) or PathLike
- an open binary file object
If ``fp`` is a file object, it must be opened in **binary** mode and support
``read``, ``seek``, and ``tell``. The function will seek to the start before
reading and will attempt to restore the original position afterward.
"""
if hasattr(fp, "read"):
return _hash_file_obj_sync(fp, chunk_size)
with open(os.fspath(fp), "rb") as f:
return _hash_file_obj_sync(f, chunk_size)
async def blake3_hash(
fp: Union[str, bytes, os.PathLike[str], os.PathLike[bytes], IO[bytes]],
chunk_size: int = DEFAULT_CHUNK,
) -> str:
"""Async wrapper for ``blake3_hash_sync``.
Uses a worker thread so the event loop remains responsive.
"""
# If it is a path, open inside the worker thread to keep I/O off the loop.
if hasattr(fp, "read"):
return await asyncio.to_thread(blake3_hash_sync, fp, chunk_size)
def _worker() -> str:
with open(os.fspath(fp), "rb") as f:
return _hash_file_obj_sync(f, chunk_size)
return await asyncio.to_thread(_worker)


@ -1,14 +1,78 @@
import contextlib
import os
import uuid
from decimal import Decimal
from aiohttp import web
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal, Optional, Sequence
from typing import Literal, Any
import folder_paths
from .api import schemas_in
RootType = Literal["models", "input", "output"]
ALLOWED_ROOTS: tuple[RootType, ...] = ("models", "input", "output")
def get_query_dict(request: web.Request) -> dict[str, Any]:
"""
Gets a dictionary of query parameters from the request.
'request.query' is a MultiMapping[str]; it needs to be converted to a plain dict before Pydantic can validate it.
"""
query_dict = {
key: request.query.getall(key) if len(request.query.getall(key)) > 1 else request.query.get(key)
for key in request.query.keys()
}
return query_dict
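As a rough illustration (hypothetical query string), repeated keys come back as lists while single-valued keys stay plain strings, which is what the Pydantic query models expect:

query_dict = get_query_dict(request)
# ?include_tags=lora&include_tags=flux&limit=5
#   -> {"include_tags": ["lora", "flux"], "limit": "5"}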
def list_tree(base_dir: str) -> list[str]:
out: list[str] = []
base_abs = os.path.abspath(base_dir)
if not os.path.isdir(base_abs):
return out
for dirpath, _subdirs, filenames in os.walk(base_abs, topdown=True, followlinks=False):
for name in filenames:
out.append(os.path.abspath(os.path.join(dirpath, name)))
return out
def prefixes_for_root(root: RootType) -> list[str]:
if root == "models":
bases: list[str] = []
for _bucket, paths in get_comfy_models_folders():
bases.extend(paths)
return [os.path.abspath(p) for p in bases]
if root == "input":
return [os.path.abspath(folder_paths.get_input_directory())]
if root == "output":
return [os.path.abspath(folder_paths.get_output_directory())]
return []
def escape_like_prefix(s: str, escape: str = "!") -> tuple[str, str]:
"""Escapes %, _ and the escape char itself in a LIKE prefix.
Returns (escaped_prefix, escape_char). Caller should append '%' and pass escape=escape_char to .like().
"""
s = s.replace(escape, escape + escape) # escape the escape char first
s = s.replace("%", escape + "%").replace("_", escape + "_") # escape LIKE wildcards
return s, escape
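A brief usage sketch, assuming sqlalchemy and the AssetCacheState model are in scope (the prefix value is made up):

prefix, esc = escape_like_prefix("/data/models/100%_scans")
stmt = sqlalchemy.select(AssetCacheState).where(
    AssetCacheState.file_path.like(prefix + "%", escape=esc)
)
# literal '%' and '_' in the prefix are escaped, so only true path prefixes match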
def fast_asset_file_check(
*,
mtime_db: int | None,
size_db: int | None,
stat_result: os.stat_result,
) -> bool:
if mtime_db is None:
return False
actual_mtime_ns = getattr(stat_result, "st_mtime_ns", int(stat_result.st_mtime * 1_000_000_000))
if int(mtime_db) != int(actual_mtime_ns):
return False
sz = int(size_db or 0)
if sz > 0:
return int(stat_result.st_size) == sz
return True
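For example, a caller holding a DB record alongside a fresh stat result (row here is a hypothetical joined record exposing mtime_ns and size_bytes):

st = os.stat(abs_path, follow_symlinks=True)
unchanged = fast_asset_file_check(
    mtime_db=row.mtime_ns,
    size_db=row.size_bytes,
    stat_result=st,
)
# True only when the stored mtime_ns matches exactly and, if a size was recorded, the on-disk size matches too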
def utcnow() -> datetime:
"""Naive UTC timestamp (no tzinfo). We always treat DB datetimes as UTC."""
return datetime.now(timezone.utc).replace(tzinfo=None)
def get_comfy_models_folders() -> list[tuple[str, list[str]]]:
"""Build a list of (folder_name, base_paths[]) categories that are configured for model locations.
@ -18,11 +82,71 @@ def get_comfy_models_folders() -> list[tuple[str, list[str]]]:
"""
targets: list[tuple[str, list[str]]] = []
models_root = os.path.abspath(folder_paths.models_dir)
for name, (paths, _exts) in folder_paths.folder_names_and_paths.items():
for name, values in folder_paths.folder_names_and_paths.items():
paths, _exts = values[0], values[1] # NOTE: this prevents nodepacks that hackily edit folder_... from breaking ComfyUI
if any(os.path.abspath(p).startswith(models_root + os.sep) for p in paths):
targets.append((name, paths))
return targets
def resolve_destination_from_tags(tags: list[str]) -> tuple[str, list[str]]:
"""Validates and maps tags -> (base_dir, subdirs_for_fs)"""
root = tags[0]
if root == "models":
if len(tags) < 2:
raise ValueError("at least two tags required for model asset")
try:
bases = folder_paths.folder_names_and_paths[tags[1]][0]
except KeyError:
raise ValueError(f"unknown model category '{tags[1]}'")
if not bases:
raise ValueError(f"no base path configured for category '{tags[1]}'")
base_dir = os.path.abspath(bases[0])
raw_subdirs = tags[2:]
else:
base_dir = os.path.abspath(
folder_paths.get_input_directory() if root == "input" else folder_paths.get_output_directory()
)
raw_subdirs = tags[1:]
for i in raw_subdirs:
if i in (".", ".."):
raise ValueError("invalid path component in tags")
return base_dir, raw_subdirs if raw_subdirs else []
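A couple of illustrative calls (the resolved base comes from folder_paths, so the concrete directories vary per install):

base_dir, subdirs = resolve_destination_from_tags(["models", "checkpoints", "sdxl"])
# -> (first configured checkpoints directory, ["sdxl"])

base_dir, subdirs = resolve_destination_from_tags(["input", "avatars"])
# -> (input directory, ["avatars"])

resolve_destination_from_tags(["models"])  # raises ValueError: at least two tags required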
def ensure_within_base(candidate: str, base: str) -> None:
cand_abs = os.path.abspath(candidate)
base_abs = os.path.abspath(base)
try:
if os.path.commonpath([cand_abs, base_abs]) != base_abs:
raise ValueError("destination escapes base directory")
except Exception:
raise ValueError("invalid destination path")
def compute_relative_filename(file_path: str) -> str | None:
"""
Return the model's path relative to the last well-known folder (the model category),
using forward slashes, e.g.:
/.../models/checkpoints/flux/123/flux.safetensors -> "flux/123/flux.safetensors"
/.../models/text_encoders/clip_g.safetensors -> "clip_g.safetensors"
For non-model paths, returns None.
NOTE: this is a temporary helper, used only for initializing metadata["filename"] field.
"""
try:
root_category, rel_path = get_relative_to_root_category_path_of_asset(file_path)
except ValueError:
return None
p = Path(rel_path)
parts = [seg for seg in p.parts if seg not in (".", "..", p.anchor)]
if not parts:
return None
if root_category == "models":
# parts[0] is the category ("checkpoints", "vae", etc.); drop it
inside = parts[1:] if len(parts) > 1 else [parts[0]]
return "/".join(inside)
return "/".join(parts) # input/output: keep all parts
def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal["input", "output", "models"], str]:
"""Given an absolute or relative file path, determine which root category the path belongs to:
@ -60,7 +184,7 @@ def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal
return "output", _rel(fp_abs, output_base)
# 3) models (check deepest matching base to avoid ambiguity)
best: Optional[tuple[int, str, str]] = None # (base_len, bucket, rel_inside_bucket)
best: tuple[int, str, str] | None = None # (base_len, bucket, rel_inside_bucket)
for bucket, bases in get_comfy_models_folders():
for b in bases:
base_abs = os.path.abspath(b)
@ -77,7 +201,6 @@ def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal
raise ValueError(f"Path is not within input, output, or configured model bases: {file_path}")
def get_name_and_tags_from_asset_path(file_path: str) -> tuple[str, list[str]]:
"""Return a tuple (name, tags) derived from a filesystem path.
@ -99,111 +222,14 @@ def get_name_and_tags_from_asset_path(file_path: str) -> tuple[str, list[str]]:
parent_parts = [part for part in p.parent.parts if part not in (".", "..", p.anchor)]
return p.name, list(dict.fromkeys(normalize_tags([root_category, *parent_parts])))
def normalize_tags(tags: Optional[Sequence[str]]) -> list[str]:
def normalize_tags(tags: list[str] | None) -> list[str]:
"""
Normalize a list of tags by:
- Stripping whitespace and converting to lowercase.
- Removing duplicates.
"""
return [t.strip().lower() for t in (tags or []) if (t or "").strip()]
def resolve_destination_from_tags(tags: list[str]) -> tuple[str, list[str]]:
"""Validates and maps tags -> (base_dir, subdirs_for_fs)"""
root = tags[0]
if root == "models":
if len(tags) < 2:
raise ValueError("at least two tags required for model asset")
try:
bases = folder_paths.folder_names_and_paths[tags[1]][0]
except KeyError:
raise ValueError(f"unknown model category '{tags[1]}'")
if not bases:
raise ValueError(f"no base path configured for category '{tags[1]}'")
base_dir = os.path.abspath(bases[0])
raw_subdirs = tags[2:]
else:
base_dir = os.path.abspath(
folder_paths.get_input_directory() if root == "input" else folder_paths.get_output_directory()
)
raw_subdirs = tags[1:]
for i in raw_subdirs:
if i in (".", ".."):
raise ValueError("invalid path component in tags")
return base_dir, raw_subdirs if raw_subdirs else []
def ensure_within_base(candidate: str, base: str) -> None:
cand_abs = os.path.abspath(candidate)
base_abs = os.path.abspath(base)
try:
if os.path.commonpath([cand_abs, base_abs]) != base_abs:
raise ValueError("destination escapes base directory")
except Exception:
raise ValueError("invalid destination path")
def compute_relative_filename(file_path: str) -> Optional[str]:
"""
Return the model's path relative to the last well-known folder (the model category),
using forward slashes, eg:
/.../models/checkpoints/flux/123/flux.safetensors -> "flux/123/flux.safetensors"
/.../models/text_encoders/clip_g.safetensors -> "clip_g.safetensors"
For non-model paths, returns None.
NOTE: this is a temporary helper, used only for initializing metadata["filename"] field.
"""
try:
root_category, rel_path = get_relative_to_root_category_path_of_asset(file_path)
except ValueError:
return None
p = Path(rel_path)
parts = [seg for seg in p.parts if seg not in (".", "..", p.anchor)]
if not parts:
return None
if root_category == "models":
# parts[0] is the category ("checkpoints", "vae", etc) drop it
inside = parts[1:] if len(parts) > 1 else [parts[0]]
return "/".join(inside)
return "/".join(parts) # input/output: keep all parts
def list_tree(base_dir: str) -> list[str]:
out: list[str] = []
base_abs = os.path.abspath(base_dir)
if not os.path.isdir(base_abs):
return out
for dirpath, _subdirs, filenames in os.walk(base_abs, topdown=True, followlinks=False):
for name in filenames:
out.append(os.path.abspath(os.path.join(dirpath, name)))
return out
def prefixes_for_root(root: schemas_in.RootType) -> list[str]:
if root == "models":
bases: list[str] = []
for _bucket, paths in get_comfy_models_folders():
bases.extend(paths)
return [os.path.abspath(p) for p in bases]
if root == "input":
return [os.path.abspath(folder_paths.get_input_directory())]
if root == "output":
return [os.path.abspath(folder_paths.get_output_directory())]
return []
def ts_to_iso(ts: Optional[float]) -> Optional[str]:
if ts is None:
return None
try:
return datetime.fromtimestamp(float(ts), tz=timezone.utc).replace(tzinfo=None).isoformat()
except Exception:
return None
def new_scan_id(root: schemas_in.RootType) -> str:
return f"scan-{root}-{uuid.uuid4().hex[:8]}"
def collect_models_files() -> list[str]:
out: list[str] = []
for folder_name, bases in get_comfy_models_folders():
@ -223,3 +249,64 @@ def collect_models_files() -> list[str]:
if allowed:
out.append(abs_path)
return out
def is_scalar(v):
if v is None:
return True
if isinstance(v, bool):
return True
if isinstance(v, (int, float, Decimal, str)):
return True
return False
def project_kv(key: str, value):
"""
Turn a metadata key/value into typed projection rows.
Returns list[dict] with keys:
key, ordinal, and one of val_str / val_num / val_bool / val_json (others None)
"""
rows: list[dict] = []
def _null_row(ordinal: int) -> dict:
return {
"key": key, "ordinal": ordinal,
"val_str": None, "val_num": None, "val_bool": None, "val_json": None
}
if value is None:
rows.append(_null_row(0))
return rows
if is_scalar(value):
if isinstance(value, bool):
rows.append({"key": key, "ordinal": 0, "val_bool": bool(value)})
elif isinstance(value, (int, float, Decimal)):
num = value if isinstance(value, Decimal) else Decimal(str(value))
rows.append({"key": key, "ordinal": 0, "val_num": num})
elif isinstance(value, str):
rows.append({"key": key, "ordinal": 0, "val_str": value})
else:
rows.append({"key": key, "ordinal": 0, "val_json": value})
return rows
if isinstance(value, list):
if all(is_scalar(x) for x in value):
for i, x in enumerate(value):
if x is None:
rows.append(_null_row(i))
elif isinstance(x, bool):
rows.append({"key": key, "ordinal": i, "val_bool": bool(x)})
elif isinstance(x, (int, float, Decimal)):
num = x if isinstance(x, Decimal) else Decimal(str(x))
rows.append({"key": key, "ordinal": i, "val_num": num})
elif isinstance(x, str):
rows.append({"key": key, "ordinal": i, "val_str": x})
else:
rows.append({"key": key, "ordinal": i, "val_json": x})
return rows
for i, x in enumerate(value):
rows.append({"key": key, "ordinal": i, "val_json": x})
return rows
rows.append({"key": key, "ordinal": 0, "val_json": value})
return rows
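For illustration, a few example projections (the key names are made up):

project_kv("epoch", 3)
# -> [{"key": "epoch", "ordinal": 0, "val_num": Decimal("3")}]

project_kv("triggers", ["cat", "dog"])
# -> [{"key": "triggers", "ordinal": 0, "val_str": "cat"},
#     {"key": "triggers", "ordinal": 1, "val_str": "dog"}]

project_kv("config", {"rank": 16})
# -> [{"key": "config", "ordinal": 0, "val_json": {"rank": 16}}]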

View File

@ -1,100 +1,71 @@
import contextlib
import logging
import mimetypes
import os
from typing import Optional, Sequence
import mimetypes
import contextlib
from typing import Sequence
from comfy_api.internal import async_to_sync
from ..db import create_session
from ._helpers import (
ensure_within_base,
get_name_and_tags_from_asset_path,
resolve_destination_from_tags,
)
from .api import schemas_in, schemas_out
from .database.models import Asset
from .database.services import (
add_tags_to_asset_info,
from app.database.db import create_session
from app.assets.api import schemas_out, schemas_in
from app.assets.database.queries import (
asset_exists_by_hash,
asset_info_exists_for_asset_id,
check_fs_asset_exists_quick,
create_asset_info_for_existing_asset,
delete_asset_info_by_id,
fetch_asset_info_and_asset,
fetch_asset_info_asset_and_tags,
get_asset_by_hash,
get_asset_info_by_id,
get_asset_tags,
ingest_fs_asset,
list_asset_infos_page,
list_cache_states_by_asset_id,
list_tags_with_usage,
pick_best_live_path,
remove_tags_from_asset_info,
set_asset_info_preview,
fetch_asset_info_asset_and_tags,
fetch_asset_info_and_asset,
create_asset_info_for_existing_asset,
touch_asset_info_by_id,
touch_asset_infos_by_fs_path,
update_asset_info_full,
delete_asset_info_by_id,
list_cache_states_by_asset_id,
list_asset_infos_page,
list_tags_with_usage,
get_asset_tags,
add_tags_to_asset_info,
remove_tags_from_asset_info,
pick_best_live_path,
ingest_fs_asset,
set_asset_info_preview,
)
from .storage import hashing
from app.assets.helpers import resolve_destination_from_tags, ensure_within_base
from app.assets.database.models import Asset
import app.assets.hashing as hashing
async def asset_exists(*, asset_hash: str) -> bool:
async with await create_session() as session:
return await asset_exists_by_hash(session, asset_hash=asset_hash)
def _safe_sort_field(requested: str | None) -> str:
if not requested:
return "created_at"
v = requested.lower()
if v in {"name", "created_at", "updated_at", "size", "last_access_time"}:
return v
return "created_at"
def populate_db_with_asset(file_path: str, tags: Optional[list[str]] = None) -> None:
if tags is None:
tags = []
try:
asset_name, path_tags = get_name_and_tags_from_asset_path(file_path)
async_to_sync.AsyncToSyncConverter.run_async_in_thread(
add_local_asset,
tags=list(dict.fromkeys([*path_tags, *tags])),
file_name=asset_name,
file_path=file_path,
)
except ValueError as e:
logging.warning("Skipping non-asset path %s: %s", file_path, e)
def _get_size_mtime_ns(path: str) -> tuple[int, int]:
st = os.stat(path, follow_symlinks=True)
return st.st_size, getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000))
async def add_local_asset(tags: list[str], file_name: str, file_path: str) -> None:
abs_path = os.path.abspath(file_path)
size_bytes, mtime_ns = _get_size_mtime_ns(abs_path)
if not size_bytes:
return
async with await create_session() as session:
if await check_fs_asset_exists_quick(session, file_path=abs_path, size_bytes=size_bytes, mtime_ns=mtime_ns):
await touch_asset_infos_by_fs_path(session, file_path=abs_path)
await session.commit()
return
asset_hash = hashing.blake3_hash_sync(abs_path)
async with await create_session() as session:
await ingest_fs_asset(
session,
asset_hash="blake3:" + asset_hash,
abs_path=abs_path,
size_bytes=size_bytes,
mtime_ns=mtime_ns,
mime_type=None,
info_name=file_name,
tag_origin="automatic",
tags=tags,
)
await session.commit()
def _safe_filename(name: str | None, fallback: str) -> str:
n = os.path.basename((name or "").strip() or fallback)
if n:
return n
return fallback
async def list_assets(
def asset_exists(*, asset_hash: str) -> bool:
"""
Check if an asset with the given hash exists in the database.
"""
with create_session() as session:
return asset_exists_by_hash(session, asset_hash=asset_hash)
def list_assets(
*,
include_tags: Optional[Sequence[str]] = None,
exclude_tags: Optional[Sequence[str]] = None,
name_contains: Optional[str] = None,
metadata_filter: Optional[dict] = None,
include_tags: Sequence[str] | None = None,
exclude_tags: Sequence[str] | None = None,
name_contains: str | None = None,
metadata_filter: dict | None = None,
limit: int = 20,
offset: int = 0,
sort: str = "created_at",
@ -104,8 +75,8 @@ async def list_assets(
sort = _safe_sort_field(sort)
order = "desc" if (order or "desc").lower() not in {"asc", "desc"} else order.lower()
async with await create_session() as session:
infos, tag_map, total = await list_asset_infos_page(
with create_session() as session:
infos, tag_map, total = list_asset_infos_page(
session,
owner_id=owner_id,
include_tags=include_tags,
@ -144,9 +115,13 @@ async def list_assets(
)
async def get_asset(*, asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail:
async with await create_session() as session:
res = await fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
def get_asset(
*,
asset_info_id: str,
owner_id: str = "",
) -> schemas_out.AssetDetail:
with create_session() as session:
res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
if not res:
raise ValueError(f"AssetInfo {asset_info_id} not found")
info, asset, tag_names = res
@ -166,40 +141,40 @@ async def get_asset(*, asset_info_id: str, owner_id: str = "") -> schemas_out.As
)
async def resolve_asset_content_for_download(
def resolve_asset_content_for_download(
*,
asset_info_id: str,
owner_id: str = "",
) -> tuple[str, str, str]:
async with await create_session() as session:
pair = await fetch_asset_info_and_asset(session, asset_info_id=asset_info_id, owner_id=owner_id)
with create_session() as session:
pair = fetch_asset_info_and_asset(session, asset_info_id=asset_info_id, owner_id=owner_id)
if not pair:
raise ValueError(f"AssetInfo {asset_info_id} not found")
info, asset = pair
states = await list_cache_states_by_asset_id(session, asset_id=asset.id)
states = list_cache_states_by_asset_id(session, asset_id=asset.id)
abs_path = pick_best_live_path(states)
if not abs_path:
raise FileNotFoundError
await touch_asset_info_by_id(session, asset_info_id=asset_info_id)
await session.commit()
touch_asset_info_by_id(session, asset_info_id=asset_info_id)
session.commit()
ctype = asset.mime_type or mimetypes.guess_type(info.name or abs_path)[0] or "application/octet-stream"
download_name = info.name or os.path.basename(abs_path)
return abs_path, ctype, download_name
async def upload_asset_from_temp_path(
def upload_asset_from_temp_path(
spec: schemas_in.UploadAssetSpec,
*,
temp_path: str,
client_filename: Optional[str] = None,
client_filename: str | None = None,
owner_id: str = "",
expected_asset_hash: Optional[str] = None,
expected_asset_hash: str | None = None,
) -> schemas_out.AssetCreated:
try:
digest = await hashing.blake3_hash(temp_path)
digest = hashing.blake3_hash(temp_path)
except Exception as e:
raise RuntimeError(f"failed to hash uploaded file: {e}")
asset_hash = "blake3:" + digest
@ -207,15 +182,15 @@ async def upload_asset_from_temp_path(
if expected_asset_hash and asset_hash != expected_asset_hash.strip().lower():
raise ValueError("HASH_MISMATCH")
async with await create_session() as session:
existing = await get_asset_by_hash(session, asset_hash=asset_hash)
with create_session() as session:
existing = get_asset_by_hash(session, asset_hash=asset_hash)
if existing is not None:
with contextlib.suppress(Exception):
if temp_path and os.path.exists(temp_path):
os.remove(temp_path)
display_name = _safe_filename(spec.name or (client_filename or ""), fallback=digest)
info = await create_asset_info_for_existing_asset(
info = create_asset_info_for_existing_asset(
session,
asset_hash=asset_hash,
name=display_name,
@ -224,8 +199,8 @@ async def upload_asset_from_temp_path(
tag_origin="manual",
owner_id=owner_id,
)
tag_names = await get_asset_tags(session, asset_info_id=info.id)
await session.commit()
tag_names = get_asset_tags(session, asset_info_id=info.id)
session.commit()
return schemas_out.AssetCreated(
id=info.id,
@ -268,8 +243,8 @@ async def upload_asset_from_temp_path(
except OSError as e:
raise RuntimeError(f"failed to stat destination file: {e}")
async with await create_session() as session:
result = await ingest_fs_asset(
with create_session() as session:
result = ingest_fs_asset(
session,
asset_hash=asset_hash,
abs_path=dest_abs,
@ -288,12 +263,12 @@ async def upload_asset_from_temp_path(
if not info_id:
raise RuntimeError("failed to create asset metadata")
pair = await fetch_asset_info_and_asset(session, asset_info_id=info_id, owner_id=owner_id)
pair = fetch_asset_info_and_asset(session, asset_info_id=info_id, owner_id=owner_id)
if not pair:
raise RuntimeError("inconsistent DB state after ingest")
info, asset = pair
tag_names = await get_asset_tags(session, asset_info_id=info.id)
await session.commit()
tag_names = get_asset_tags(session, asset_info_id=info.id)
session.commit()
return schemas_out.AssetCreated(
id=info.id,
@ -310,22 +285,22 @@ async def upload_asset_from_temp_path(
)
async def update_asset(
def update_asset(
*,
asset_info_id: str,
name: Optional[str] = None,
tags: Optional[list[str]] = None,
user_metadata: Optional[dict] = None,
name: str | None = None,
tags: list[str] | None = None,
user_metadata: dict | None = None,
owner_id: str = "",
) -> schemas_out.AssetUpdated:
async with await create_session() as session:
info_row = await get_asset_info_by_id(session, asset_info_id=asset_info_id)
with create_session() as session:
info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
info = await update_asset_info_full(
info = update_asset_info_full(
session,
asset_info_id=asset_info_id,
name=name,
@ -335,8 +310,8 @@ async def update_asset(
asset_info_row=info_row,
)
tag_names = await get_asset_tags(session, asset_info_id=asset_info_id)
await session.commit()
tag_names = get_asset_tags(session, asset_info_id=asset_info_id)
session.commit()
return schemas_out.AssetUpdated(
id=info.id,
@ -348,30 +323,30 @@ async def update_asset(
)
async def set_asset_preview(
def set_asset_preview(
*,
asset_info_id: str,
preview_asset_id: Optional[str],
preview_asset_id: str | None = None,
owner_id: str = "",
) -> schemas_out.AssetDetail:
async with await create_session() as session:
info_row = await get_asset_info_by_id(session, asset_info_id=asset_info_id)
with create_session() as session:
info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
await set_asset_info_preview(
set_asset_info_preview(
session,
asset_info_id=asset_info_id,
preview_asset_id=preview_asset_id,
)
res = await fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id)
if not res:
raise RuntimeError("State changed during preview update")
info, asset, tags = res
await session.commit()
session.commit()
return schemas_out.AssetDetail(
id=info.id,
@ -387,32 +362,32 @@ async def set_asset_preview(
)
async def delete_asset_reference(*, asset_info_id: str, owner_id: str, delete_content_if_orphan: bool = True) -> bool:
async with await create_session() as session:
info_row = await get_asset_info_by_id(session, asset_info_id=asset_info_id)
def delete_asset_reference(*, asset_info_id: str, owner_id: str, delete_content_if_orphan: bool = True) -> bool:
with create_session() as session:
info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
asset_id = info_row.asset_id if info_row else None
deleted = await delete_asset_info_by_id(session, asset_info_id=asset_info_id, owner_id=owner_id)
deleted = delete_asset_info_by_id(session, asset_info_id=asset_info_id, owner_id=owner_id)
if not deleted:
await session.commit()
session.commit()
return False
if not delete_content_if_orphan or not asset_id:
await session.commit()
session.commit()
return True
still_exists = await asset_info_exists_for_asset_id(session, asset_id=asset_id)
still_exists = asset_info_exists_for_asset_id(session, asset_id=asset_id)
if still_exists:
await session.commit()
session.commit()
return True
states = await list_cache_states_by_asset_id(session, asset_id=asset_id)
states = list_cache_states_by_asset_id(session, asset_id=asset_id)
file_paths = [s.file_path for s in (states or []) if getattr(s, "file_path", None)]
asset_row = await session.get(Asset, asset_id)
asset_row = session.get(Asset, asset_id)
if asset_row is not None:
await session.delete(asset_row)
session.delete(asset_row)
await session.commit()
session.commit()
for p in file_paths:
with contextlib.suppress(Exception):
if p and os.path.isfile(p):
@ -420,21 +395,21 @@ async def delete_asset_reference(*, asset_info_id: str, owner_id: str, delete_co
return True
async def create_asset_from_hash(
def create_asset_from_hash(
*,
hash_str: str,
name: str,
tags: Optional[list[str]] = None,
user_metadata: Optional[dict] = None,
tags: list[str] | None = None,
user_metadata: dict | None = None,
owner_id: str = "",
) -> Optional[schemas_out.AssetCreated]:
) -> schemas_out.AssetCreated | None:
canonical = hash_str.strip().lower()
async with await create_session() as session:
asset = await get_asset_by_hash(session, asset_hash=canonical)
with create_session() as session:
asset = get_asset_by_hash(session, asset_hash=canonical)
if not asset:
return None
info = await create_asset_info_for_existing_asset(
info = create_asset_info_for_existing_asset(
session,
asset_hash=canonical,
name=_safe_filename(name, fallback=canonical.split(":", 1)[1]),
@ -443,8 +418,8 @@ async def create_asset_from_hash(
tag_origin="manual",
owner_id=owner_id,
)
tag_names = await get_asset_tags(session, asset_info_id=info.id)
await session.commit()
tag_names = get_asset_tags(session, asset_info_id=info.id)
session.commit()
return schemas_out.AssetCreated(
id=info.id,
@ -461,9 +436,55 @@ async def create_asset_from_hash(
)
async def list_tags(
def add_tags_to_asset(
*,
prefix: Optional[str] = None,
asset_info_id: str,
tags: list[str],
origin: str = "manual",
owner_id: str = "",
) -> schemas_out.TagsAdd:
with create_session() as session:
info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
data = add_tags_to_asset_info(
session,
asset_info_id=asset_info_id,
tags=tags,
origin=origin,
create_if_missing=True,
asset_info_row=info_row,
)
session.commit()
return schemas_out.TagsAdd(**data)
def remove_tags_from_asset(
*,
asset_info_id: str,
tags: list[str],
owner_id: str = "",
) -> schemas_out.TagsRemove:
with create_session() as session:
info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
data = remove_tags_from_asset_info(
session,
asset_info_id=asset_info_id,
tags=tags,
)
session.commit()
return schemas_out.TagsRemove(**data)
def list_tags(
prefix: str | None = None,
limit: int = 100,
offset: int = 0,
order: str = "count_desc",
@ -473,8 +494,8 @@ async def list_tags(
limit = max(1, min(1000, limit))
offset = max(0, offset)
async with await create_session() as session:
rows, total = await list_tags_with_usage(
with create_session() as session:
rows, total = list_tags_with_usage(
session,
prefix=prefix,
limit=limit,
@ -486,71 +507,3 @@ async def list_tags(
tags = [schemas_out.TagUsage(name=name, count=count, type=tag_type) for (name, tag_type, count) in rows]
return schemas_out.TagsList(tags=tags, total=total, has_more=(offset + len(tags)) < total)
async def add_tags_to_asset(
*,
asset_info_id: str,
tags: list[str],
origin: str = "manual",
owner_id: str = "",
) -> schemas_out.TagsAdd:
async with await create_session() as session:
info_row = await get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
data = await add_tags_to_asset_info(
session,
asset_info_id=asset_info_id,
tags=tags,
origin=origin,
create_if_missing=True,
asset_info_row=info_row,
)
await session.commit()
return schemas_out.TagsAdd(**data)
async def remove_tags_from_asset(
*,
asset_info_id: str,
tags: list[str],
owner_id: str = "",
) -> schemas_out.TagsRemove:
async with await create_session() as session:
info_row = await get_asset_info_by_id(session, asset_info_id=asset_info_id)
if not info_row:
raise ValueError(f"AssetInfo {asset_info_id} not found")
if info_row.owner_id and info_row.owner_id != owner_id:
raise PermissionError("not owner")
data = await remove_tags_from_asset_info(
session,
asset_info_id=asset_info_id,
tags=tags,
)
await session.commit()
return schemas_out.TagsRemove(**data)
def _safe_sort_field(requested: Optional[str]) -> str:
if not requested:
return "created_at"
v = requested.lower()
if v in {"name", "created_at", "updated_at", "size", "last_access_time"}:
return v
return "created_at"
def _get_size_mtime_ns(path: str) -> tuple[int, int]:
st = os.stat(path, follow_symlinks=True)
return st.st_size, getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000))
def _safe_filename(name: Optional[str], fallback: str) -> str:
n = os.path.basename((name or "").strip() or fallback)
if n:
return n
return fallback

View File

@ -1,105 +1,30 @@
import asyncio
import contextlib
import time
import logging
import os
import time
from dataclasses import dataclass, field
from typing import Literal, Optional
import sqlalchemy as sa
import sqlalchemy
import folder_paths
from ..db import create_session
from ._helpers import (
collect_models_files,
compute_relative_filename,
get_comfy_models_folders,
get_name_and_tags_from_asset_path,
list_tree,
new_scan_id,
prefixes_for_root,
ts_to_iso,
from app.database.db import create_session, dependencies_available
from app.assets.helpers import (
collect_models_files, compute_relative_filename, fast_asset_file_check, get_name_and_tags_from_asset_path,
list_tree, prefixes_for_root, escape_like_prefix,
RootType
)
from .api import schemas_in, schemas_out
from .database.helpers import (
add_missing_tag_for_asset_id,
ensure_tags_exist,
escape_like_prefix,
fast_asset_file_check,
remove_missing_tag_for_asset_id,
seed_from_paths_batch,
)
from .database.models import Asset, AssetCacheState, AssetInfo
from .database.services import (
compute_hash_and_dedup_for_cache_state,
list_cache_states_by_asset_id,
list_cache_states_with_asset_under_prefixes,
list_unhashed_candidates_under_prefixes,
list_verify_candidates_under_prefixes,
)
LOGGER = logging.getLogger(__name__)
SLOW_HASH_CONCURRENCY = 1
from app.assets.database.tags import add_missing_tag_for_asset_id, ensure_tags_exist, remove_missing_tag_for_asset_id
from app.assets.database.bulk_ops import seed_from_paths_batch
from app.assets.database.models import Asset, AssetCacheState, AssetInfo
@dataclass
class ScanProgress:
scan_id: str
root: schemas_in.RootType
status: Literal["scheduled", "running", "completed", "failed", "cancelled"] = "scheduled"
scheduled_at: float = field(default_factory=lambda: time.time())
started_at: Optional[float] = None
finished_at: Optional[float] = None
discovered: int = 0
processed: int = 0
file_errors: list[dict] = field(default_factory=list)
@dataclass
class SlowQueueState:
queue: asyncio.Queue
workers: list[asyncio.Task] = field(default_factory=list)
closed: bool = False
RUNNING_TASKS: dict[schemas_in.RootType, asyncio.Task] = {}
PROGRESS_BY_ROOT: dict[schemas_in.RootType, ScanProgress] = {}
SLOW_STATE_BY_ROOT: dict[schemas_in.RootType, SlowQueueState] = {}
def current_statuses() -> schemas_out.AssetScanStatusResponse:
scans = []
for root in schemas_in.ALLOWED_ROOTS:
prog = PROGRESS_BY_ROOT.get(root)
if not prog:
continue
scans.append(_scan_progress_to_scan_status_model(prog))
return schemas_out.AssetScanStatusResponse(scans=scans)
async def schedule_scans(roots: list[schemas_in.RootType]) -> schemas_out.AssetScanStatusResponse:
results: list[ScanProgress] = []
for root in roots:
if root in RUNNING_TASKS and not RUNNING_TASKS[root].done():
results.append(PROGRESS_BY_ROOT[root])
continue
prog = ScanProgress(scan_id=new_scan_id(root), root=root, status="scheduled")
PROGRESS_BY_ROOT[root] = prog
state = SlowQueueState(queue=asyncio.Queue())
SLOW_STATE_BY_ROOT[root] = state
RUNNING_TASKS[root] = asyncio.create_task(
_run_hash_verify_pipeline(root, prog, state),
name=f"asset-scan:{root}",
)
results.append(prog)
return _status_response_for(results)
async def sync_seed_assets(roots: list[schemas_in.RootType]) -> None:
t_total = time.perf_counter()
def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> None:
"""
Scan the given roots and seed the assets into the database.
"""
if not dependencies_available():
if enable_logging:
logging.warning("Database dependencies not available, skipping assets scan")
return
t_start = time.perf_counter()
created = 0
skipped_existing = 0
paths: list[str] = []
@ -107,11 +32,11 @@ async def sync_seed_assets(roots: list[schemas_in.RootType]) -> None:
existing_paths: set[str] = set()
for r in roots:
try:
survivors = await _fast_db_consistency_pass(r, collect_existing_paths=True, update_missing_tags=True)
survivors: set[str] = _fast_db_consistency_pass(r, collect_existing_paths=True, update_missing_tags=True)
if survivors:
existing_paths.update(survivors)
except Exception as ex:
LOGGER.exception("fast DB reconciliation failed for %s: %s", r, ex)
except Exception as e:
logging.exception("fast DB scan failed for %s: %s", r, e)
if "models" in roots:
paths.extend(collect_models_files())
@ -123,255 +48,58 @@ async def sync_seed_assets(roots: list[schemas_in.RootType]) -> None:
specs: list[dict] = []
tag_pool: set[str] = set()
for p in paths:
ap = os.path.abspath(p)
if ap in existing_paths:
abs_p = os.path.abspath(p)
if abs_p in existing_paths:
skipped_existing += 1
continue
try:
st = os.stat(ap, follow_symlinks=True)
stat_p = os.stat(abs_p, follow_symlinks=False)
except OSError:
continue
if not st.st_size:
# skip empty files
if not stat_p.st_size:
continue
name, tags = get_name_and_tags_from_asset_path(ap)
name, tags = get_name_and_tags_from_asset_path(abs_p)
specs.append(
{
"abs_path": ap,
"size_bytes": st.st_size,
"mtime_ns": getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000)),
"abs_path": abs_p,
"size_bytes": stat_p.st_size,
"mtime_ns": getattr(stat_p, "st_mtime_ns", int(stat_p.st_mtime * 1_000_000_000)),
"info_name": name,
"tags": tags,
"fname": compute_relative_filename(ap),
"fname": compute_relative_filename(abs_p),
}
)
for t in tags:
tag_pool.add(t)
# if no file specs, nothing to do
if not specs:
return
async with await create_session() as sess:
with create_session() as sess:
if tag_pool:
await ensure_tags_exist(sess, tag_pool, tag_type="user")
ensure_tags_exist(sess, tag_pool, tag_type="user")
result = await seed_from_paths_batch(sess, specs=specs, owner_id="")
result = seed_from_paths_batch(sess, specs=specs, owner_id="")
created += result["inserted_infos"]
await sess.commit()
sess.commit()
finally:
LOGGER.info(
"Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, total_seen=%d)",
roots,
time.perf_counter() - t_total,
created,
skipped_existing,
len(paths),
)
def _status_response_for(progresses: list[ScanProgress]) -> schemas_out.AssetScanStatusResponse:
return schemas_out.AssetScanStatusResponse(scans=[_scan_progress_to_scan_status_model(p) for p in progresses])
def _scan_progress_to_scan_status_model(progress: ScanProgress) -> schemas_out.AssetScanStatus:
return schemas_out.AssetScanStatus(
scan_id=progress.scan_id,
root=progress.root,
status=progress.status,
scheduled_at=ts_to_iso(progress.scheduled_at),
started_at=ts_to_iso(progress.started_at),
finished_at=ts_to_iso(progress.finished_at),
discovered=progress.discovered,
processed=progress.processed,
file_errors=[
schemas_out.AssetScanError(
path=e.get("path", ""),
message=e.get("message", ""),
at=e.get("at"),
if enable_logging:
logging.info(
"Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, total_seen=%d)",
roots,
time.perf_counter() - t_start,
created,
skipped_existing,
len(paths),
)
for e in (progress.file_errors or [])
],
)
async def _run_hash_verify_pipeline(root: schemas_in.RootType, prog: ScanProgress, state: SlowQueueState) -> None:
prog.status = "running"
prog.started_at = time.time()
try:
prefixes = prefixes_for_root(root)
await _fast_db_consistency_pass(root)
# collect candidates from DB
async with await create_session() as sess:
verify_ids = await list_verify_candidates_under_prefixes(sess, prefixes=prefixes)
unhashed_ids = await list_unhashed_candidates_under_prefixes(sess, prefixes=prefixes)
# dedupe: prioritize verification first
seen = set()
ordered: list[int] = []
for lst in (verify_ids, unhashed_ids):
for sid in lst:
if sid not in seen:
seen.add(sid)
ordered.append(sid)
prog.discovered = len(ordered)
# queue up work
for sid in ordered:
await state.queue.put(sid)
state.closed = True
_start_state_workers(root, prog, state)
await _await_state_workers_then_finish(root, prog, state)
except asyncio.CancelledError:
prog.status = "cancelled"
raise
except Exception as exc:
_append_error(prog, path="", message=str(exc))
prog.status = "failed"
prog.finished_at = time.time()
LOGGER.exception("Asset scan failed for %s", root)
finally:
RUNNING_TASKS.pop(root, None)
async def _reconcile_missing_tags_for_root(root: schemas_in.RootType, prog: ScanProgress) -> None:
"""
Detect missing files quickly and toggle 'missing' tag per asset_id.
Rules:
- Only hashed assets (assets.hash != NULL) participate in missing tagging.
- We consider ALL cache states of the asset (across roots) before tagging.
"""
if root == "models":
bases: list[str] = []
for _bucket, paths in get_comfy_models_folders():
bases.extend(paths)
elif root == "input":
bases = [folder_paths.get_input_directory()]
else:
bases = [folder_paths.get_output_directory()]
try:
async with await create_session() as sess:
# state + hash + size for the current root
rows = await list_cache_states_with_asset_under_prefixes(sess, prefixes=bases)
# Track fast_ok within the scanned root and whether the asset is hashed
by_asset: dict[str, dict[str, bool]] = {}
for state, a_hash, size_db in rows:
aid = state.asset_id
acc = by_asset.get(aid)
if acc is None:
acc = {"any_fast_ok_here": False, "hashed": (a_hash is not None), "size_db": int(size_db or 0)}
by_asset[aid] = acc
try:
if acc["hashed"]:
st = os.stat(state.file_path, follow_symlinks=True)
if fast_asset_file_check(mtime_db=state.mtime_ns, size_db=acc["size_db"], stat_result=st):
acc["any_fast_ok_here"] = True
except FileNotFoundError:
pass
except OSError as e:
_append_error(prog, path=state.file_path, message=str(e))
# Decide per asset, considering ALL its states (not just this root)
for aid, acc in by_asset.items():
try:
if not acc["hashed"]:
# Never tag seed assets as missing
continue
any_fast_ok_global = acc["any_fast_ok_here"]
if not any_fast_ok_global:
# Check other states outside this root
others = await list_cache_states_by_asset_id(sess, asset_id=aid)
for st in others:
try:
any_fast_ok_global = fast_asset_file_check(
mtime_db=st.mtime_ns,
size_db=acc["size_db"],
stat_result=os.stat(st.file_path, follow_symlinks=True),
)
except OSError:
continue
if any_fast_ok_global:
await remove_missing_tag_for_asset_id(sess, asset_id=aid)
else:
await add_missing_tag_for_asset_id(sess, asset_id=aid, origin="automatic")
except Exception as ex:
_append_error(prog, path="", message=f"reconcile {aid[:8]}: {ex}")
await sess.commit()
except Exception as e:
_append_error(prog, path="", message=f"reconcile failed: {e}")
def _start_state_workers(root: schemas_in.RootType, prog: ScanProgress, state: SlowQueueState) -> None:
if state.workers:
return
async def _worker(_wid: int):
while True:
sid = await state.queue.get()
try:
if sid is None:
return
try:
async with await create_session() as sess:
# Optional: fetch path for better error messages
st = await sess.get(AssetCacheState, sid)
try:
await compute_hash_and_dedup_for_cache_state(sess, state_id=sid)
await sess.commit()
except Exception as e:
path = st.file_path if st else f"state:{sid}"
_append_error(prog, path=path, message=str(e))
raise
except Exception:
pass
finally:
prog.processed += 1
finally:
state.queue.task_done()
state.workers = [
asyncio.create_task(_worker(i), name=f"asset-hash:{root}:{i}")
for i in range(SLOW_HASH_CONCURRENCY)
]
async def _close_when_ready():
while not state.closed:
await asyncio.sleep(0.05)
for _ in range(SLOW_HASH_CONCURRENCY):
await state.queue.put(None)
asyncio.create_task(_close_when_ready())
async def _await_state_workers_then_finish(
root: schemas_in.RootType, prog: ScanProgress, state: SlowQueueState
) -> None:
if state.workers:
await asyncio.gather(*state.workers, return_exceptions=True)
await _reconcile_missing_tags_for_root(root, prog)
prog.finished_at = time.time()
prog.status = "completed"
def _append_error(prog: ScanProgress, *, path: str, message: str) -> None:
prog.file_errors.append({
"path": path,
"message": message,
"at": ts_to_iso(time.time()),
})
async def _fast_db_consistency_pass(
root: schemas_in.RootType,
def _fast_db_consistency_pass(
root: RootType,
*,
collect_existing_paths: bool = False,
update_missing_tags: bool = False,
) -> Optional[set[str]]:
) -> set[str] | None:
"""Fast DB+FS pass for a root:
- Toggle needs_verify per state using fast check
- For hashed assets with at least one fast-ok state in this root: delete stale missing states
@ -391,10 +119,10 @@ async def _fast_db_consistency_pass(
escaped, esc = escape_like_prefix(base)
conds.append(AssetCacheState.file_path.like(escaped + "%", escape=esc))
async with await create_session() as sess:
with create_session() as sess:
rows = (
await sess.execute(
sa.select(
sess.execute(
sqlalchemy.select(
AssetCacheState.id,
AssetCacheState.file_path,
AssetCacheState.mtime_ns,
@ -404,7 +132,7 @@ async def _fast_db_consistency_pass(
Asset.size_bytes,
)
.join(Asset, Asset.id == AssetCacheState.asset_id)
.where(sa.or_(*conds))
.where(sqlalchemy.or_(*conds))
.order_by(AssetCacheState.asset_id.asc(), AssetCacheState.id.asc())
)
).all()
@ -458,10 +186,10 @@ async def _fast_db_consistency_pass(
if a_hash is None:
if states and all_missing: # remove seed Asset completely, if no valid AssetCache exists
await sess.execute(sa.delete(AssetInfo).where(AssetInfo.asset_id == aid))
asset = await sess.get(Asset, aid)
sess.execute(sqlalchemy.delete(AssetInfo).where(AssetInfo.asset_id == aid))
asset = sess.get(Asset, aid)
if asset:
await sess.delete(asset)
sess.delete(asset)
else:
for s in states:
if s["exists"]:
@ -474,28 +202,28 @@ async def _fast_db_consistency_pass(
stale_state_ids.append(s["sid"])
if update_missing_tags:
with contextlib.suppress(Exception):
await remove_missing_tag_for_asset_id(sess, asset_id=aid)
remove_missing_tag_for_asset_id(sess, asset_id=aid)
elif update_missing_tags:
with contextlib.suppress(Exception):
await add_missing_tag_for_asset_id(sess, asset_id=aid, origin="automatic")
add_missing_tag_for_asset_id(sess, asset_id=aid, origin="automatic")
for s in states:
if s["exists"]:
survivors.add(os.path.abspath(s["fp"]))
if stale_state_ids:
await sess.execute(sa.delete(AssetCacheState).where(AssetCacheState.id.in_(stale_state_ids)))
sess.execute(sqlalchemy.delete(AssetCacheState).where(AssetCacheState.id.in_(stale_state_ids)))
if to_set_verify:
await sess.execute(
sa.update(AssetCacheState)
sess.execute(
sqlalchemy.update(AssetCacheState)
.where(AssetCacheState.id.in_(to_set_verify))
.values(needs_verify=True)
)
if to_clear_verify:
await sess.execute(
sa.update(AssetCacheState)
sess.execute(
sqlalchemy.update(AssetCacheState)
.where(AssetCacheState.id.in_(to_clear_verify))
.values(needs_verify=False)
)
await sess.commit()
sess.commit()
return survivors if collect_existing_paths else None

app/database/db.py Normal file
View File

@ -0,0 +1,112 @@
import logging
import os
import shutil
from app.logger import log_startup_warning
from utils.install_util import get_missing_requirements_message
from comfy.cli_args import args
_DB_AVAILABLE = False
Session = None
try:
from alembic import command
from alembic.config import Config
from alembic.runtime.migration import MigrationContext
from alembic.script import ScriptDirectory
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
_DB_AVAILABLE = True
except ImportError as e:
log_startup_warning(
f"""
------------------------------------------------------------------------
Error importing dependencies: {e}
{get_missing_requirements_message()}
This error is happening because ComfyUI now uses a local sqlite database.
------------------------------------------------------------------------
""".strip()
)
def dependencies_available():
"""
Temporary function to check if the dependencies are available
"""
return _DB_AVAILABLE
def can_create_session():
"""
Temporary function to check whether the database is available to create a session.
During the initial release there may be environmental issues (or missing dependencies) that prevent the database from being created.
"""
return dependencies_available() and Session is not None
def get_alembic_config():
root_path = os.path.join(os.path.dirname(__file__), "../..")
config_path = os.path.abspath(os.path.join(root_path, "alembic.ini"))
scripts_path = os.path.abspath(os.path.join(root_path, "alembic_db"))
config = Config(config_path)
config.set_main_option("script_location", scripts_path)
config.set_main_option("sqlalchemy.url", args.database_url)
return config
def get_db_path():
url = args.database_url
if url.startswith("sqlite:///"):
return url.split("///")[1]
else:
raise ValueError(f"Unsupported database URL '{url}'.")
def init_db():
db_url = args.database_url
logging.debug(f"Database URL: {db_url}")
db_path = get_db_path()
db_exists = os.path.exists(db_path)
config = get_alembic_config()
# Check if we need to upgrade
engine = create_engine(db_url)
conn = engine.connect()
context = MigrationContext.configure(conn)
current_rev = context.get_current_revision()
script = ScriptDirectory.from_config(config)
target_rev = script.get_current_head()
if target_rev is None:
logging.warning("No target revision found.")
elif current_rev != target_rev:
# Backup the database pre upgrade
backup_path = db_path + ".bkp"
if db_exists:
shutil.copy(db_path, backup_path)
else:
backup_path = None
try:
command.upgrade(config, target_rev)
logging.info(f"Database upgraded from {current_rev} to {target_rev}")
except Exception as e:
if backup_path:
# Restore the database from backup if upgrade fails
shutil.copy(backup_path, db_path)
os.remove(backup_path)
logging.exception("Error upgrading database: ")
raise e
global Session
Session = sessionmaker(bind=engine)
def create_session():
return Session()
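A minimal startup sketch, assuming args.database_url points at a local SQLite file and the optional database dependencies are installed:

from app.database.db import can_create_session, create_session, dependencies_available, init_db

if dependencies_available():
    init_db()  # runs Alembic migrations (backing up an existing SQLite file before upgrading)
if can_create_session():
    with create_session() as session:
        ...    # run queries; call session.commit() explicitly for writes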

app/database/models.py Normal file
View File

@ -0,0 +1,21 @@
from typing import Any
from datetime import datetime
from sqlalchemy.orm import DeclarativeBase
class Base(DeclarativeBase):
pass
def to_dict(obj: Any, include_none: bool = False) -> dict[str, Any]:
fields = obj.__table__.columns.keys()
out: dict[str, Any] = {}
for field in fields:
val = getattr(obj, field)
if val is None and not include_none:
continue
if isinstance(val, datetime):
out[field] = val.isoformat()
else:
out[field] = val
return out
# TODO: Define models here
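A hypothetical usage sketch for to_dict, assuming an Asset model is later declared on this Base (nothing in this file defines it yet):

from sqlalchemy import DateTime, String
from sqlalchemy.orm import Mapped, mapped_column

class Asset(Base):  # hypothetical model, for illustration only
    __tablename__ = "assets"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    created_at: Mapped[datetime] = mapped_column(DateTime)

row = Asset(id="blake3:abc123", created_at=datetime(2026, 1, 24))
to_dict(row)  # {"id": "blake3:abc123", "created_at": "2026-01-24T00:00:00"}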

app/db.py
View File

@ -1,255 +0,0 @@
import logging
import os
import shutil
from contextlib import asynccontextmanager
from typing import Optional
from alembic import command
from alembic.config import Config
from alembic.runtime.migration import MigrationContext
from alembic.script import ScriptDirectory
from sqlalchemy import create_engine, text
from sqlalchemy.engine import make_url
from sqlalchemy.ext.asyncio import (
AsyncEngine,
AsyncSession,
async_sessionmaker,
create_async_engine,
)
from comfy.cli_args import args
LOGGER = logging.getLogger(__name__)
ENGINE: Optional[AsyncEngine] = None
SESSION: Optional[async_sessionmaker] = None
def _root_paths():
"""Resolve alembic.ini and migrations script folder."""
root_path = os.path.abspath(os.path.dirname(__file__))
config_path = os.path.abspath(os.path.join(root_path, "../alembic.ini"))
scripts_path = os.path.abspath(os.path.join(root_path, "alembic_db"))
return config_path, scripts_path
def _absolutize_sqlite_url(db_url: str) -> str:
"""Make SQLite database path absolute. No-op for non-SQLite URLs."""
try:
u = make_url(db_url)
except Exception:
return db_url
if not u.drivername.startswith("sqlite"):
return db_url
db_path: str = u.database or ""
if isinstance(db_path, str) and db_path.startswith("file:"):
return str(u) # Do not touch SQLite URI databases like: "file:xxx?mode=memory&cache=shared"
if not os.path.isabs(db_path):
db_path = os.path.abspath(os.path.join(os.getcwd(), db_path))
u = u.set(database=db_path)
return str(u)
def _normalize_sqlite_memory_url(db_url: str) -> tuple[str, bool]:
"""
If db_url points at an in-memory SQLite DB (":memory:" or file:... mode=memory),
rewrite it to a *named* shared in-memory URI and ensure 'uri=true' is present.
Returns: (normalized_url, is_memory)
"""
try:
u = make_url(db_url)
except Exception:
return db_url, False
if not u.drivername.startswith("sqlite"):
return db_url, False
db = u.database or ""
if db == ":memory:":
u = u.set(database=f"file:comfyui_db_{os.getpid()}?mode=memory&cache=shared&uri=true")
return str(u), True
if isinstance(db, str) and db.startswith("file:") and "mode=memory" in db:
if "uri=true" not in db:
u = u.set(database=(db + ("&" if "?" in db else "?") + "uri=true"))
return str(u), True
return str(u), False
def _get_sqlite_file_path(sync_url: str) -> Optional[str]:
"""Return the on-disk path for a SQLite URL, else None."""
try:
u = make_url(sync_url)
except Exception:
return None
if not u.drivername.startswith("sqlite"):
return None
db_path = u.database
if isinstance(db_path, str) and db_path.startswith("file:"):
return None # Not a real file if it is a URI like "file:...?"
return db_path
def _get_alembic_config(sync_url: str) -> Config:
"""Prepare Alembic Config with script location and DB URL."""
config_path, scripts_path = _root_paths()
cfg = Config(config_path)
cfg.set_main_option("script_location", scripts_path)
cfg.set_main_option("sqlalchemy.url", sync_url)
return cfg
async def init_db_engine() -> None:
"""Initialize async engine + sessionmaker and run migrations to head.
This must be called once on application startup before any DB usage.
"""
global ENGINE, SESSION
if ENGINE is not None:
return
raw_url = args.database_url
if not raw_url:
raise RuntimeError("Database URL is not configured.")
db_url, is_mem = _normalize_sqlite_memory_url(raw_url)
db_url = _absolutize_sqlite_url(db_url)
# Prepare async engine
connect_args = {}
if db_url.startswith("sqlite"):
connect_args = {
"check_same_thread": False,
"timeout": 12,
}
if is_mem:
connect_args["uri"] = True
ENGINE = create_async_engine(
db_url,
connect_args=connect_args,
pool_pre_ping=True,
future=True,
)
# Enforce SQLite pragmas on the async engine
if db_url.startswith("sqlite"):
async with ENGINE.begin() as conn:
if not is_mem:
# WAL for concurrency and durability, Foreign Keys for referential integrity
current_mode = (await conn.execute(text("PRAGMA journal_mode;"))).scalar()
if str(current_mode).lower() != "wal":
new_mode = (await conn.execute(text("PRAGMA journal_mode=WAL;"))).scalar()
if str(new_mode).lower() != "wal":
raise RuntimeError("Failed to set SQLite journal mode to WAL.")
LOGGER.info("SQLite journal mode set to WAL.")
await conn.execute(text("PRAGMA foreign_keys = ON;"))
await conn.execute(text("PRAGMA synchronous = NORMAL;"))
await _run_migrations(database_url=db_url, connect_args=connect_args)
SESSION = async_sessionmaker(
bind=ENGINE,
class_=AsyncSession,
expire_on_commit=False,
autoflush=False,
autocommit=False,
)
async def _run_migrations(database_url: str, connect_args: dict) -> None:
if database_url.find("postgresql+psycopg") == -1:
"""SQLite: Convert an async SQLAlchemy URL to a sync URL for Alembic."""
u = make_url(database_url)
driver = u.drivername
if not driver.startswith("sqlite+aiosqlite"):
raise ValueError(f"Unsupported DB driver: {driver}")
database_url, is_mem = _normalize_sqlite_memory_url(str(u.set(drivername="sqlite")))
database_url = _absolutize_sqlite_url(database_url)
cfg = _get_alembic_config(database_url)
engine = create_engine(database_url, future=True, connect_args=connect_args)
with engine.connect() as conn:
context = MigrationContext.configure(conn)
current_rev = context.get_current_revision()
script = ScriptDirectory.from_config(cfg)
target_rev = script.get_current_head()
if target_rev is None:
LOGGER.warning("Alembic: no target revision found.")
return
if current_rev == target_rev:
LOGGER.debug("Alembic: database already at head %s", target_rev)
return
LOGGER.info("Alembic: upgrading database from %s to %s", current_rev, target_rev)
# Optional backup for SQLite file DBs
backup_path = None
sqlite_path = _get_sqlite_file_path(database_url)
if sqlite_path and os.path.exists(sqlite_path):
backup_path = sqlite_path + ".bkp"
try:
shutil.copy(sqlite_path, backup_path)
except Exception as exc:
LOGGER.warning("Failed to create SQLite backup before migration: %s", exc)
try:
command.upgrade(cfg, target_rev)
except Exception:
if backup_path and os.path.exists(backup_path):
LOGGER.exception("Error upgrading database, attempting restore from backup.")
try:
shutil.copy(backup_path, sqlite_path) # restore
os.remove(backup_path)
except Exception as re:
LOGGER.error("Failed to restore SQLite backup: %s", re)
else:
LOGGER.exception("Error upgrading database, backup is not available.")
raise
def get_engine():
"""Return the global async engine (initialized after init_db_engine())."""
if ENGINE is None:
raise RuntimeError("Engine is not initialized. Call init_db_engine() first.")
return ENGINE
def get_session_maker():
"""Return the global async_sessionmaker (initialized after init_db_engine())."""
if SESSION is None:
raise RuntimeError("Session maker is not initialized. Call init_db_engine() first.")
return SESSION
@asynccontextmanager
async def session_scope():
"""Async context manager for a unit of work:
async with session_scope() as sess:
... use sess ...
"""
maker = get_session_maker()
async with maker() as sess:
try:
yield sess
await sess.commit()
except Exception:
await sess.rollback()
raise
async def create_session():
"""Convenience helper to acquire a single AsyncSession instance.
Typical usage:
async with (await create_session()) as sess:
...
"""
maker = get_session_maker()
return maker()

View File

@ -10,7 +10,8 @@ import importlib
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import TypedDict, Optional
from typing import Dict, TypedDict, Optional
from aiohttp import web
from importlib.metadata import version
import requests
@ -42,6 +43,7 @@ def get_installed_frontend_version():
frontend_version_str = version("comfyui-frontend-package")
return frontend_version_str
def get_required_frontend_version():
"""Get the required frontend version from requirements.txt."""
try:
@ -63,6 +65,7 @@ def get_required_frontend_version():
logging.error(f"Error reading requirements.txt: {e}")
return None
def check_frontend_version():
"""Check if the frontend version is up to date."""
@ -196,17 +199,6 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:
class FrontendManager:
"""
A class to manage ComfyUI frontend versions and installations.
This class handles the initialization and management of different frontend versions,
including the default frontend from the pip package and custom frontend versions
from GitHub repositories.
Attributes:
CUSTOM_FRONTENDS_ROOT (str): The root directory where custom frontend versions are stored.
"""
CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
@classmethod
@ -214,17 +206,39 @@ class FrontendManager:
"""Get the required frontend package version."""
return get_required_frontend_version()
@classmethod
def get_installed_templates_version(cls) -> str:
"""Get the currently installed workflow templates package version."""
try:
templates_version_str = version("comfyui-workflow-templates")
return templates_version_str
except Exception:
return None
@classmethod
def get_required_templates_version(cls) -> str:
"""Get the required workflow templates version from requirements.txt."""
try:
with open(requirements_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line.startswith("comfyui-workflow-templates=="):
version_str = line.split("==")[-1]
if not is_valid_version(version_str):
logging.error(f"Invalid templates version format in requirements.txt: {version_str}")
return None
return version_str
logging.error("comfyui-workflow-templates not found in requirements.txt")
return None
except FileNotFoundError:
logging.error("requirements.txt not found. Cannot determine required templates version.")
return None
except Exception as e:
logging.error(f"Error reading requirements.txt: {e}")
return None
@classmethod
def default_frontend_path(cls) -> str:
"""
Get the path to the default frontend installation from the pip package.
Returns:
str: The path to the default frontend static files.
Raises:
SystemExit: If the comfyui-frontend-package is not installed.
"""
try:
import comfyui_frontend_package
@ -244,16 +258,54 @@ comfyui-frontend-package is not installed.
sys.exit(-1)
@classmethod
def templates_path(cls) -> str:
"""
Get the path to the workflow templates.
def template_asset_map(cls) -> Optional[Dict[str, str]]:
"""Return a mapping of template asset names to their absolute paths."""
try:
from comfyui_workflow_templates import (
get_asset_path,
iter_templates,
)
except ImportError:
logging.error(
f"""
********** ERROR ***********
Returns:
str: The path to the workflow templates directory.
comfyui-workflow-templates is not installed.
Raises:
SystemExit: If the comfyui-workflow-templates package is not installed.
"""
{frontend_install_warning_message()}
********** ERROR ***********
""".strip()
)
return None
try:
template_entries = list(iter_templates())
except Exception as exc:
logging.error(f"Failed to enumerate workflow templates: {exc}")
return None
asset_map: Dict[str, str] = {}
try:
for entry in template_entries:
for asset in entry.assets:
asset_map[asset.filename] = get_asset_path(
entry.template_id, asset.filename
)
except Exception as exc:
logging.error(f"Failed to resolve template asset paths: {exc}")
return None
if not asset_map:
logging.error("No workflow template assets found. Did the packages install correctly?")
return None
return asset_map
@classmethod
def legacy_templates_path(cls) -> Optional[str]:
"""Return the legacy templates directory shipped inside the meta package."""
try:
import comfyui_workflow_templates
@ -272,6 +324,7 @@ comfyui-workflow-templates is not installed.
********** ERROR ***********
""".strip()
)
return None
@classmethod
def embedded_docs_path(cls) -> str:
@ -289,16 +342,11 @@ comfyui-workflow-templates is not installed.
@classmethod
def parse_version_string(cls, value: str) -> tuple[str, str, str]:
"""
Parse a version string into its components.
The version string should be in the format: 'owner/repo@version'
where version can be either a semantic version (v1.2.3) or 'latest'.
Args:
value (str): The version string to parse.
Returns:
tuple[str, str, str]: A tuple containing (owner, repo, version).
tuple[str, str]: A tuple containing provider name and version.
Raises:
argparse.ArgumentTypeError: If the version string is invalid.
@ -315,22 +363,18 @@ comfyui-workflow-templates is not installed.
cls, version_string: str, provider: Optional[FrontEndProvider] = None
) -> str:
"""
Initialize a frontend version without error handling.
This method attempts to initialize a specific frontend version, either from
the default pip package or from a custom GitHub repository. It will download
and extract the frontend files if necessary.
Initializes the frontend for the specified version.
Args:
version_string (str): The version string specifying which frontend to use.
provider (FrontEndProvider, optional): The provider to use for custom frontends.
version_string (str): The version string.
provider (FrontEndProvider, optional): The provider to use. Defaults to None.
Returns:
str: The path to the initialized frontend.
Raises:
Exception: If there is an error during initialization (e.g., network timeout,
invalid URL, or missing assets).
Exception: If there is an error during the initialization process.
main error source might be request timeout or invalid URL.
"""
if version_string == DEFAULT_VERSION_STRING:
check_frontend_version()
@ -382,17 +426,13 @@ comfyui-workflow-templates is not installed.
@classmethod
def init_frontend(cls, version_string: str) -> str:
"""
Initialize a frontend version with error handling.
This is the main method to initialize a frontend version. It wraps init_frontend_unsafe
with error handling, falling back to the default frontend if initialization fails.
Initializes the frontend with the specified version string.
Args:
version_string (str): The version string specifying which frontend to use.
version_string (str): The version string to initialize the frontend with.
Returns:
str: The path to the initialized frontend. If initialization fails,
returns the path to the default frontend.
str: The path of the initialized frontend.
"""
try:
return cls.init_frontend_unsafe(version_string)
@ -401,3 +441,17 @@ comfyui-workflow-templates is not installed.
logging.info("Falling back to the default frontend.")
check_frontend_version()
return cls.default_frontend_path()
@classmethod
def template_asset_handler(cls):
assets = cls.template_asset_map()
if not assets:
return None
async def serve_template(request: web.Request) -> web.StreamResponse:
rel_path = request.match_info.get("path", "")
target = assets.get(rel_path)
if target is None:
raise web.HTTPNotFound()
return web.FileResponse(target)
return serve_template
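A sketch of how the handler returned above could be attached to an aiohttp route table; the route path and the registration site are assumptions, since the actual wiring is not part of this hunk.

# Assumed wiring; the "/templates/{path:.*}" route is illustrative only.
routes = web.RouteTableDef()
handler = FrontendManager.template_asset_handler()
if handler is not None:
    # match_info["path"] is looked up in the asset map built by template_asset_map()
    routes.get("/templates/{path:.*}")(handler)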

View File

@ -44,7 +44,7 @@ class ModelFileManager:
@routes.get("/experiment/models/{folder}")
async def get_all_models(request):
folder = request.match_info.get("folder", None)
if not folder in folder_paths.folder_names_and_paths:
if folder not in folder_paths.folder_names_and_paths:
return web.Response(status=404)
files = self.get_model_file_list(folder)
return web.json_response(files)
@ -55,7 +55,7 @@ class ModelFileManager:
path_index = int(request.match_info.get("path_index", None))
filename = request.match_info.get("filename", None)
if not folder_name in folder_paths.folder_names_and_paths:
if folder_name not in folder_paths.folder_names_and_paths:
return web.Response(status=404)
folders = folder_paths.folder_names_and_paths[folder_name]

app/subgraph_manager.py (new file, 132 lines)

View File

@ -0,0 +1,132 @@
from __future__ import annotations
from typing import TypedDict
import os
import folder_paths
import glob
from aiohttp import web
import hashlib
class Source:
custom_node = "custom_node"
templates = "templates"
class SubgraphEntry(TypedDict):
source: str
"""
Source of subgraph - custom_nodes vs templates.
"""
path: str
"""
Relative path of the subgraph file.
For custom nodes, will be the relative directory like <custom_node_dir>/subgraphs/<name>.json
"""
name: str
"""
Name of subgraph file.
"""
info: CustomNodeSubgraphEntryInfo
"""
Additional info about subgraph; in the case of custom_nodes, will contain nodepack name
"""
data: str
class CustomNodeSubgraphEntryInfo(TypedDict):
node_pack: str
"""Node pack name."""
class SubgraphManager:
def __init__(self):
self.cached_custom_node_subgraphs: dict[SubgraphEntry] | None = None
self.cached_blueprint_subgraphs: dict[SubgraphEntry] | None = None
def _create_entry(self, file: str, source: str, node_pack: str) -> tuple[str, SubgraphEntry]:
"""Create a subgraph entry from a file path. Expects normalized path (forward slashes)."""
entry_id = hashlib.sha256(f"{source}{file}".encode()).hexdigest()
entry: SubgraphEntry = {
"source": source,
"name": os.path.splitext(os.path.basename(file))[0],
"path": file,
"info": {"node_pack": node_pack},
}
return entry_id, entry
async def load_entry_data(self, entry: SubgraphEntry):
with open(entry['path'], 'r') as f:
entry['data'] = f.read()
return entry
async def sanitize_entry(self, entry: SubgraphEntry | None, remove_data=False) -> SubgraphEntry | None:
if entry is None:
return None
entry = entry.copy()
entry.pop('path', None)
if remove_data:
entry.pop('data', None)
return entry
async def sanitize_entries(self, entries: dict[str, SubgraphEntry], remove_data=False) -> dict[str, SubgraphEntry]:
entries = entries.copy()
for key in list(entries.keys()):
entries[key] = await self.sanitize_entry(entries[key], remove_data)
return entries
async def get_custom_node_subgraphs(self, loadedModules, force_reload=False):
"""Load subgraphs from custom nodes."""
if not force_reload and self.cached_custom_node_subgraphs is not None:
return self.cached_custom_node_subgraphs
subgraphs_dict: dict[SubgraphEntry] = {}
for folder in folder_paths.get_folder_paths("custom_nodes"):
pattern = os.path.join(folder, "*/subgraphs/*.json")
for file in glob.glob(pattern):
file = file.replace('\\', '/')
node_pack = "custom_nodes." + file.split('/')[-3]
entry_id, entry = self._create_entry(file, Source.custom_node, node_pack)
subgraphs_dict[entry_id] = entry
self.cached_custom_node_subgraphs = subgraphs_dict
return subgraphs_dict
async def get_blueprint_subgraphs(self, force_reload=False):
"""Load subgraphs from the blueprints directory."""
if not force_reload and self.cached_blueprint_subgraphs is not None:
return self.cached_blueprint_subgraphs
subgraphs_dict: dict[SubgraphEntry] = {}
blueprints_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'blueprints')
if os.path.exists(blueprints_dir):
for file in glob.glob(os.path.join(blueprints_dir, "*.json")):
file = file.replace('\\', '/')
entry_id, entry = self._create_entry(file, Source.templates, "comfyui")
subgraphs_dict[entry_id] = entry
self.cached_blueprint_subgraphs = subgraphs_dict
return subgraphs_dict
async def get_all_subgraphs(self, loadedModules, force_reload=False):
"""Get all subgraphs from all sources (custom nodes and blueprints)."""
custom_node_subgraphs = await self.get_custom_node_subgraphs(loadedModules, force_reload)
blueprint_subgraphs = await self.get_blueprint_subgraphs(force_reload)
return {**custom_node_subgraphs, **blueprint_subgraphs}
async def get_subgraph(self, id: str, loadedModules):
"""Get a specific subgraph by ID from any source."""
entry = (await self.get_all_subgraphs(loadedModules)).get(id)
if entry is not None and entry.get('data') is None:
await self.load_entry_data(entry)
return entry
def add_routes(self, routes, loadedModules):
@routes.get("/global_subgraphs")
async def get_global_subgraphs(request):
subgraphs_dict = await self.get_all_subgraphs(loadedModules)
return web.json_response(await self.sanitize_entries(subgraphs_dict, remove_data=True))
@routes.get("/global_subgraphs/{id}")
async def get_global_subgraph(request):
id = request.match_info.get("id", None)
subgraph = await self.get_subgraph(id, loadedModules)
return web.json_response(await self.sanitize_entry(subgraph))
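A client-side sketch of the two endpoints registered above; the host, port, and field access are illustrative assumptions.

import aiohttp

async def fetch_first_subgraph():
    async with aiohttp.ClientSession() as session:
        async with session.get("http://127.0.0.1:8188/global_subgraphs") as resp:
            listing = await resp.json()  # id -> sanitized entry (no 'path', no 'data')
        first_id = next(iter(listing), None)
        if first_id is None:
            return None
        async with session.get(f"http://127.0.0.1:8188/global_subgraphs/{first_id}") as resp:
            return await resp.json()     # full entry, with the subgraph JSON string in 'data'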

View File

@ -59,6 +59,9 @@ class UserManager():
user = "default"
if args.multi_user and "comfy-user" in request.headers:
user = request.headers["comfy-user"]
# Block System Users (use same error message to prevent probing)
if user.startswith(folder_paths.SYSTEM_USER_PREFIX):
raise KeyError("Unknown user: " + user)
if user not in self.users:
raise KeyError("Unknown user: " + user)
@ -66,15 +69,16 @@ class UserManager():
return user
def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
user_directory = folder_paths.get_user_directory()
if type == "userdata":
root_dir = user_directory
root_dir = folder_paths.get_user_directory()
else:
raise KeyError("Unknown filepath type:" + type)
user = self.get_request_user_id(request)
path = user_root = os.path.abspath(os.path.join(root_dir, user))
user_root = folder_paths.get_public_user_directory(user)
if user_root is None:
return None
path = user_root
# prevent leaving /{type}
if os.path.commonpath((root_dir, user_root)) != root_dir:
@ -101,7 +105,11 @@ class UserManager():
name = name.strip()
if not name:
raise ValueError("username not provided")
if name.startswith(folder_paths.SYSTEM_USER_PREFIX):
raise ValueError("System User prefix not allowed")
user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX):
raise ValueError("System User prefix not allowed")
user_id = user_id + "_" + str(uuid.uuid4())
self.users[user_id] = name
@ -132,7 +140,10 @@ class UserManager():
if username in self.users.values():
return web.json_response({"error": "Duplicate username."}, status=400)
user_id = self.add_user(username)
try:
user_id = self.add_user(username)
except ValueError as e:
return web.json_response({"error": str(e)}, status=400)
return web.json_response(user_id)
@routes.get("/userdata")
@ -424,7 +435,7 @@ class UserManager():
return source
dest = get_user_data_path(request, check_exists=False, param="dest")
if not isinstance(source, str):
if not isinstance(dest, str):
return dest
overwrite = request.query.get("overwrite", 'true') != "false"

View File

@ -413,7 +413,8 @@ class ControlNet(nn.Module):
out_middle = []
if self.num_classes is not None:
assert y.shape[0] == x.shape[0]
if y is None:
raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?")
emb = emb + self.label_emb(y)
h = x

View File

@ -97,6 +97,13 @@ class LatentPreviewMethod(enum.Enum):
Latent2RGB = "latent2rgb"
TAESD = "taesd"
@classmethod
def from_string(cls, value: str):
for member in cls:
if member.value == value:
return member
return None
parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)
parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
@ -105,6 +112,7 @@ cache_group = parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB")
attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -120,6 +128,12 @@ upcast.add_argument("--force-upcast-attention", action="store_true", help="Force
upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
manager_group = parser.add_mutually_exclusive_group()
manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
vram_group = parser.add_mutually_exclusive_group()
vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
@ -130,7 +144,8 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e
parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
@ -145,7 +160,9 @@ class PerformanceFeature(enum.Enum):
CublasOps = "cublas_ops"
AutoTune = "autotune"
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")
parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")
@ -157,13 +174,14 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.")
parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")
# The default built-in provider hosted under web/
DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
@ -212,7 +230,7 @@ parser.add_argument(
database_default_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..", "user", "comfyui.db")
)
parser.add_argument("--database-url", type=str, default=f"sqlite+aiosqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite+aiosqlite:///:memory:'.")
parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
parser.add_argument("--disable-assets-autoscan", action="store_true", help="Disable asset scanning on startup for database synchronization.")
if comfy.options.args_parsing:

View File

@ -1,6 +1,59 @@
import torch
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
import math
def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
image = image[:, :, :, :3] if image.shape[3] > 3 else image
mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
std = torch.tensor(std, device=image.device, dtype=image.dtype)
image = image.movedim(-1, 1)
if not (image.shape[2] == size and image.shape[3] == size):
if crop:
scale = (size / min(image.shape[2], image.shape[3]))
scale_size = (round(scale * image.shape[2]), round(scale * image.shape[3]))
else:
scale_size = (size, size)
image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
h = (image.shape[2] - size)//2
w = (image.shape[3] - size)//2
image = image[:,:,h:h+size,w:w+size]
image = torch.clip((255. * image), 0, 255).round() / 255.0
return (image - mean.view([3,1,1])) / std.view([3,1,1])
def siglip2_flex_calc_resolution(oh, ow, patch_size, max_num_patches, eps=1e-5):
def scale_dim(size, scale):
scaled = math.ceil(size * scale / patch_size) * patch_size
return max(patch_size, int(scaled))
# Binary search for optimal scale
lo, hi = eps / 10, 100.0
while hi - lo >= eps:
mid = (lo + hi) / 2
h, w = scale_dim(oh, mid), scale_dim(ow, mid)
if (h // patch_size) * (w // patch_size) <= max_num_patches:
lo = mid
else:
hi = mid
return scale_dim(oh, lo), scale_dim(ow, lo)
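# Worked example (sketch): siglip2_flex_calc_resolution(480, 640, patch_size=16, max_num_patches=256)
# converges to a scale of roughly 0.45 and returns (224, 288): a 14 x 18 patch grid (252 <= 256 patches)
# that approximately preserves the 3:4 aspect ratio of the input.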
def siglip2_preprocess(image, size, patch_size, num_patches, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], crop=True):
if size > 0:
return clip_preprocess(image, size=size, mean=mean, std=std, crop=crop)
image = image[:, :, :, :3] if image.shape[3] > 3 else image
mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
std = torch.tensor(std, device=image.device, dtype=image.dtype)
image = image.movedim(-1, 1)
b, c, h, w = image.shape
h, w = siglip2_flex_calc_resolution(h, w, patch_size, num_patches)
image = torch.nn.functional.interpolate(image, size=(h, w), mode="bilinear", antialias=True)
image = torch.clip((255. * image), 0, 255).round() / 255.0
return (image - mean.view([3, 1, 1])) / std.view([3, 1, 1])
class CLIPAttention(torch.nn.Module):
def __init__(self, embed_dim, heads, dtype, device, operations):
@ -156,6 +209,27 @@ class CLIPTextModel(torch.nn.Module):
out = self.text_projection(x[2])
return (x[0], x[1], out, x[2])
def siglip2_pos_embed(embed_weight, embeds, orig_shape):
embed_weight_len = round(embed_weight.shape[0] ** 0.5)
embed_weight = comfy.ops.cast_to_input(embed_weight, embeds).movedim(1, 0).reshape(1, -1, embed_weight_len, embed_weight_len)
embed_weight = torch.nn.functional.interpolate(embed_weight, size=orig_shape, mode="bilinear", align_corners=False, antialias=True)
embed_weight = embed_weight.reshape(-1, embed_weight.shape[-2] * embed_weight.shape[-1]).movedim(0, 1)
return embeds + embed_weight
class Siglip2Embeddings(torch.nn.Module):
def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", num_patches=None, dtype=None, device=None, operations=None):
super().__init__()
self.patch_embedding = operations.Linear(num_channels * patch_size * patch_size, embed_dim, dtype=dtype, device=device)
self.position_embedding = operations.Embedding(num_patches, embed_dim, dtype=dtype, device=device)
self.patch_size = patch_size
def forward(self, pixel_values):
b, c, h, w = pixel_values.shape
img = pixel_values.movedim(1, -1).reshape(b, h // self.patch_size, self.patch_size, w // self.patch_size, self.patch_size, c)
img = img.permute(0, 1, 3, 2, 4, 5)
img = img.reshape(b, img.shape[1] * img.shape[2], -1)
img = self.patch_embedding(img)
return siglip2_pos_embed(self.position_embedding.weight, img, (h // self.patch_size, w // self.patch_size))
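# Shape sketch: pixel_values (B, C, H, W) is unfolded into (B, (H/p)*(W/p), p*p*C) flattened patches,
# projected by the Linear patch_embedding to (B, N, embed_dim), and siglip2_pos_embed then bilinearly
# resizes the learned square position grid to the actual (H/p, W/p) layout before adding it.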
class CLIPVisionEmbeddings(torch.nn.Module):
def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", dtype=None, device=None, operations=None):
@ -199,8 +273,11 @@ class CLIPVision(torch.nn.Module):
intermediate_activation = config_dict["hidden_act"]
model_type = config_dict["model_type"]
self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
if model_type == "siglip_vision_model":
if model_type in ["siglip2_vision_model"]:
self.embeddings = Siglip2Embeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, num_patches=config_dict.get("num_patches", None), dtype=dtype, device=device, operations=operations)
else:
self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
if model_type in ["siglip_vision_model", "siglip2_vision_model"]:
self.pre_layrnorm = lambda a: a
self.output_layernorm = True
else:

View File

@ -1,6 +1,5 @@
from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
import os
import torch
import json
import logging
@ -17,28 +16,12 @@ class Output:
def __setitem__(self, key, item):
setattr(self, key, item)
def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
image = image[:, :, :, :3] if image.shape[3] > 3 else image
mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
std = torch.tensor(std, device=image.device, dtype=image.dtype)
image = image.movedim(-1, 1)
if not (image.shape[2] == size and image.shape[3] == size):
if crop:
scale = (size / min(image.shape[2], image.shape[3]))
scale_size = (round(scale * image.shape[2]), round(scale * image.shape[3]))
else:
scale_size = (size, size)
image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
h = (image.shape[2] - size)//2
w = (image.shape[3] - size)//2
image = image[:,:,h:h+size,w:w+size]
image = torch.clip((255. * image), 0, 255).round() / 255.0
return (image - mean.view([3,1,1])) / std.view([3,1,1])
clip_preprocess = comfy.clip_model.clip_preprocess # Prevent some stuff from breaking, TODO: remove eventually
IMAGE_ENCODERS = {
"clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
"siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
"siglip2_vision_model": comfy.clip_model.CLIPVisionModelProjection,
"dinov2": comfy.image_encoders.dino2.Dinov2Model,
}
@ -50,9 +33,10 @@ class ClipVisionModel():
self.image_size = config.get("image_size", 224)
self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
model_type = config.get("model_type", "clip_vision_model")
model_class = IMAGE_ENCODERS.get(model_type)
if model_type == "siglip_vision_model":
self.model_type = config.get("model_type", "clip_vision_model")
self.config = config.copy()
model_class = IMAGE_ENCODERS.get(self.model_type)
if self.model_type == "siglip_vision_model":
self.return_all_hidden_states = True
else:
self.return_all_hidden_states = False
@ -73,12 +57,16 @@ class ClipVisionModel():
def encode_image(self, image, crop=True):
comfy.model_management.load_model_gpu(self.patcher)
pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
if self.model_type == "siglip2_vision_model":
pixel_values = comfy.clip_model.siglip2_preprocess(image.to(self.load_device), size=self.image_size, patch_size=self.config.get("patch_size", 16), num_patches=self.config.get("num_patches", 256), mean=self.image_mean, std=self.image_std, crop=crop).float()
else:
pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)
outputs = Output()
outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
outputs["image_sizes"] = [pixel_values.shape[1:]] * pixel_values.shape[0]
if self.return_all_hidden_states:
all_hs = out[1].to(comfy.model_management.intermediate_device())
outputs["penultimate_hidden_states"] = all_hs[:, -2]
@ -125,10 +113,14 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
if embed_shape == 729:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
elif embed_shape == 1024:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
patch_embedding_shape = sd["vision_model.embeddings.patch_embedding.weight"].shape
if len(patch_embedding_shape) == 2:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_naflex.json")
else:
if embed_shape == 729:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
elif embed_shape == 1024:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
elif embed_shape == 577:
if "multi_modal_projector.linear_1.bias" in sd:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")

View File

@ -0,0 +1,14 @@
{
"num_channels": 3,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"image_size": -1,
"intermediate_size": 4304,
"model_type": "siglip2_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 16,
"num_patches": 256,
"image_mean": [0.5, 0.5, 0.5],
"image_std": [0.5, 0.5, 0.5]
}

View File

@ -51,32 +51,43 @@ class ContextHandlerABC(ABC):
class IndexListContextWindow(ContextWindowABC):
def __init__(self, index_list: list[int], dim: int=0):
def __init__(self, index_list: list[int], dim: int=0, total_frames: int=0):
self.index_list = index_list
self.context_length = len(index_list)
self.dim = dim
self.total_frames = total_frames
self.center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames)
def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
def get_tensor(self, full: torch.Tensor, device=None, dim=None, retain_index_list=[]) -> torch.Tensor:
if dim is None:
dim = self.dim
if dim == 0 and full.shape[dim] == 1:
return full
idx = [slice(None)] * dim + [self.index_list]
return full[idx].to(device)
idx = tuple([slice(None)] * dim + [self.index_list])
window = full[idx]
if retain_index_list:
idx = tuple([slice(None)] * dim + [retain_index_list])
window[idx] = full[idx]
return window.to(device)
def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
if dim is None:
dim = self.dim
idx = [slice(None)] * dim + [self.index_list]
idx = tuple([slice(None)] * dim + [self.index_list])
full[idx] += to_add
return full
def get_region_index(self, num_regions: int) -> int:
region_idx = int(self.center_ratio * num_regions)
return min(max(region_idx, 0), num_regions - 1)
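# Worked example (sketch): a window covering indices 32..47 of 96 total frames has
# center_ratio = (32 + 47) / (2 * 96) ~= 0.411, so get_region_index(3) returns int(0.411 * 3) = 1,
# assigning the window to the middle of three regions.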
class IndexListCallbacks:
EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results"
EXECUTE_START = "execute_start"
EXECUTE_CLEANUP = "execute_cleanup"
RESIZE_COND_ITEM = "resize_cond_item"
def init_callbacks(self):
return {}
@ -94,7 +105,8 @@ class ContextFuseMethod:
ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
class IndexListContextHandler(ContextHandlerABC):
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0):
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
closed_loop: bool=False, dim: int=0, freenoise: bool=False, cond_retain_index_list: str="", split_conds_to_windows: bool=False):
self.context_schedule = context_schedule
self.fuse_method = fuse_method
self.context_length = context_length
@ -103,13 +115,18 @@ class IndexListContextHandler(ContextHandlerABC):
self.closed_loop = closed_loop
self.dim = dim
self._step = 0
self.freenoise = freenoise
self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
self.split_conds_to_windows = split_conds_to_windows
self.callbacks = {}
def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
# for now, assume first dim is batch - should have stored on BaseModel in actual implementation
if x_in.size(self.dim) > self.context_length:
logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.")
logging.info(f"Using context windows {self.context_length} with overlap {self.context_overlap} for {x_in.size(self.dim)} frames.")
if self.cond_retain_index_list:
logging.info(f"Retaining original cond for indexes: {self.cond_retain_index_list}")
return True
return False
@ -123,6 +140,11 @@ class IndexListContextHandler(ContextHandlerABC):
return None
# reuse or resize cond items to match context requirements
resized_cond = []
# if multiple conds, split based on primary region
if self.split_conds_to_windows and len(cond_in) > 1:
region = window.get_region_index(len(cond_in))
logging.info(f"Splitting conds to windows; using region {region} for window {window.index_list[0]}-{window.index_list[-1]} with center ratio {window.center_ratio:.3f}")
cond_in = [cond_in[region]]
# cond object is a list containing a dict - outer list is irrelevant, so just loop through it
for actual_cond in cond_in:
resized_actual_cond = actual_cond.copy()
@ -145,13 +167,38 @@ class IndexListContextHandler(ContextHandlerABC):
new_cond_item = cond_item.copy()
# when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
for cond_key, cond_value in new_cond_item.items():
# Allow callbacks to handle custom conditioning items
handled = False
for callback in comfy.patcher_extension.get_all_callbacks(
IndexListCallbacks.RESIZE_COND_ITEM, self.callbacks
):
result = callback(cond_key, cond_value, window, x_in, device, new_cond_item)
if result is not None:
new_cond_item[cond_key] = result
handled = True
break
if handled:
continue
if isinstance(cond_value, torch.Tensor):
if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim):
if (self.dim < cond_value.ndim and cond_value.size(self.dim) == x_in.size(self.dim)) or \
(cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)):
new_cond_item[cond_key] = window.get_tensor(cond_value, device)
# Handle audio_embed (temporal dim is 1)
elif cond_key == "audio_embed" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
audio_cond = cond_value.cond
if audio_cond.ndim > 1 and audio_cond.size(1) == x_in.size(self.dim):
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(audio_cond, device, dim=1))
# Handle vace_context (temporal dim is 3)
elif cond_key == "vace_context" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
vace_cond = cond_value.cond
if vace_cond.ndim >= 4 and vace_cond.size(3) == x_in.size(self.dim):
sliced_vace = window.get_tensor(vace_cond, device, dim=3, retain_index_list=self.cond_retain_index_list)
new_cond_item[cond_key] = cond_value._copy_with(sliced_vace)
# if has cond that is a Tensor, check if needs to be subset
elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim):
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
if (self.dim < cond_value.cond.ndim and cond_value.cond.size(self.dim) == x_in.size(self.dim)) or \
(cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim)):
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device, retain_index_list=self.cond_retain_index_list))
elif cond_key == "num_video_frames": # for SVD
new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
new_cond_item[cond_key].cond = window.context_length
@ -164,7 +211,7 @@ class IndexListContextHandler(ContextHandlerABC):
return resized_cond
def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001)
mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep[0], rtol=0.0001)
matches = torch.nonzero(mask)
if torch.numel(matches) == 0:
raise Exception("No sample_sigmas matched current timestep; something went wrong.")
@ -173,7 +220,7 @@ class IndexListContextHandler(ContextHandlerABC):
def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
full_length = x_in.size(self.dim) # TODO: choose dim based on model
context_windows = self.context_schedule.func(full_length, self, model_options)
context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows]
context_windows = [IndexListContextWindow(window, dim=self.dim, total_frames=full_length) for window in context_windows]
return context_windows
def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
@ -250,8 +297,8 @@ class IndexListContextHandler(ContextHandlerABC):
prev_weight = (bias_total / (bias_total + bias))
new_weight = (bias / (bias_total + bias))
# account for dims of tensors
idx_window = [slice(None)] * self.dim + [idx]
pos_window = [slice(None)] * self.dim + [pos]
idx_window = tuple([slice(None)] * self.dim + [idx])
pos_window = tuple([slice(None)] * self.dim + [pos])
# apply new values
conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
biases_final[i][idx] = bias_total + bias
@ -287,6 +334,28 @@ def create_prepare_sampling_wrapper(model: ModelPatcher):
)
def _sampler_sample_wrapper(executor, guider, sigmas, extra_args, callback, noise, *args, **kwargs):
model_options = extra_args.get("model_options", None)
if model_options is None:
raise Exception("model_options not found in sampler_sample_wrapper; this should never happen, something went wrong.")
handler: IndexListContextHandler = model_options.get("context_handler", None)
if handler is None:
raise Exception("context_handler not found in sampler_sample_wrapper; this should never happen, something went wrong.")
if not handler.freenoise:
return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
noise = apply_freenoise(noise, handler.dim, handler.context_length, handler.context_overlap, extra_args["seed"])
return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
def create_sampler_sample_wrapper(model: ModelPatcher):
model.add_wrapper_with_key(
comfy.patcher_extension.WrappersMP.SAMPLER_SAMPLE,
"ContextWindows_sampler_sample",
_sampler_sample_wrapper
)
def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
total_dims = len(x_in.shape)
weights_tensor = torch.Tensor(weights).to(device=device)
@ -538,3 +607,29 @@ def shift_window_to_end(window: list[int], num_frames: int):
for i in range(len(window)):
# 2) add end_delta to each val to slide windows to end
window[i] = window[i] + end_delta
# https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved/blob/90fb1331201a4b29488089e4fbffc0d82cc6d0a9/animatediff/sample_settings.py#L465
def apply_freenoise(noise: torch.Tensor, dim: int, context_length: int, context_overlap: int, seed: int):
logging.info("Context windows: Applying FreeNoise")
generator = torch.Generator(device='cpu').manual_seed(seed)
latent_video_length = noise.shape[dim]
delta = context_length - context_overlap
for start_idx in range(0, latent_video_length - context_length, delta):
place_idx = start_idx + context_length
actual_delta = min(delta, latent_video_length - place_idx)
if actual_delta <= 0:
break
list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx
source_slice = [slice(None)] * noise.ndim
source_slice[dim] = list_idx
target_slice = [slice(None)] * noise.ndim
target_slice[dim] = slice(place_idx, place_idx + actual_delta)
noise[tuple(target_slice)] = noise[tuple(source_slice)]
return noise
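A minimal sketch of what the FreeNoise helper above does to the initial noise; the tensor shape and the choice of dim are illustrative assumptions.

import torch

noise = torch.randn(81, 16, 60, 104)   # assumed layout: (frames, channels, height, width), temporal dim = 0
noise = apply_freenoise(noise, dim=0, context_length=16, context_overlap=4, seed=42)
# With delta = 16 - 4 = 12, frames 16..27 become a shuffled copy of frames 0..11, frames 28..39 a
# shuffled copy of 12..23, and so on, so overlapping context windows sample correlated noise.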

View File

@ -310,11 +310,13 @@ class ControlLoraOps:
self.bias = None
def forward(self, input):
weight, bias = comfy.ops.cast_bias_weight(self, input)
weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True)
if self.up is not None:
return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
x = torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
else:
return torch.nn.functional.linear(input, weight, bias)
x = torch.nn.functional.linear(input, weight, bias)
comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream)
return x
class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp):
def __init__(
@ -350,12 +352,13 @@ class ControlLoraOps:
def forward(self, input):
weight, bias = comfy.ops.cast_bias_weight(self, input)
weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True)
if self.up is not None:
return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
x = torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
else:
return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
x = torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream)
return x
class ControlLora(ControlNet):
def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options

View File

@ -65,3 +65,147 @@ def stochastic_rounding(value, dtype, seed=0):
return output
return value.to(dtype=dtype)
# TODO: improve this?
def stochastic_float_to_fp4_e2m1(x, generator):
orig_shape = x.shape
sign = torch.signbit(x).to(torch.uint8)
exp = torch.floor(torch.log2(x.abs()) + 1.0).clamp(0, 3)
x += (torch.rand(x.size(), dtype=x.dtype, layout=x.layout, device=x.device, generator=generator) - 0.5) * (2 ** (exp - 2.0)) * 1.25
x = x.abs()
exp = torch.floor(torch.log2(x) + 1.1925).clamp(0, 3)
mantissa = torch.where(
exp > 0,
(x / (2.0 ** (exp - 1)) - 1.0) * 2.0,
(x * 2.0),
out=x
).round().to(torch.uint8)
del x
exp = exp.to(torch.uint8)
fp4 = (sign << 3) | (exp << 1) | mantissa
del sign, exp, mantissa
fp4_flat = fp4.view(-1)
packed = (fp4_flat[0::2] << 4) | fp4_flat[1::2]
return packed.reshape(list(orig_shape)[:-1] + [-1])
def to_blocked(input_matrix, flatten: bool = True) -> torch.Tensor:
"""
Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.
See:
https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
Args:
input_matrix: Input tensor of shape (H, W)
Returns:
Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
"""
def ceil_div(a, b):
return (a + b - 1) // b
rows, cols = input_matrix.shape
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
# Calculate the padded shape
padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4
padded = input_matrix
if (rows, cols) != (padded_rows, padded_cols):
padded = torch.zeros(
(padded_rows, padded_cols),
device=input_matrix.device,
dtype=input_matrix.dtype,
)
padded[:rows, :cols] = input_matrix
# Rearrange the blocks
blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
if flatten:
return rearranged.flatten()
return rearranged.reshape(padded_rows, padded_cols)
def stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator):
F4_E2M1_MAX = 6.0
F8_E4M3_MAX = 448.0
orig_shape = x.shape
block_size = 16
x = x.reshape(orig_shape[0], -1, block_size)
scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
x = x / (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
x = x.view(orig_shape).nan_to_num()
data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
return data_lp, scaled_block_scales_fp8
def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
def roundup(x: int, multiple: int) -> int:
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
# Handle padding
if pad_16x:
rows, cols = x.shape
padded_rows = roundup(rows, 16)
padded_cols = roundup(cols, 16)
if padded_rows != rows or padded_cols != cols:
x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
x, blocked_scaled = stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator)
return x, to_blocked(blocked_scaled, flatten=False)
def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=0, block_size=4096 * 4096):
def roundup(x: int, multiple: int) -> int:
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
orig_shape = x.shape
# Handle padding
if pad_16x:
rows, cols = x.shape
padded_rows = roundup(rows, 16)
padded_cols = roundup(cols, 16)
if padded_rows != rows or padded_cols != cols:
x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
# Note: We update orig_shape because the output tensor logic below assumes x.shape matches
# what we want to produce. If we pad here, we want the padded output.
orig_shape = x.shape
orig_shape = list(orig_shape)
output_fp4 = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 2], dtype=torch.uint8, device=x.device)
output_block = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 16], dtype=torch.float8_e4m3fn, device=x.device)
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
num_slices = max(1, (x.numel() / block_size))
slice_size = max(1, (round(x.shape[0] / num_slices)))
for i in range(0, x.shape[0], slice_size):
fp4, block = stochastic_round_quantize_nvfp4_block(x[i: i + slice_size], per_tensor_scale, generator=generator)
output_fp4[i:i + slice_size].copy_(fp4)
output_block[i:i + slice_size].copy_(block)
return output_fp4, to_blocked(output_block, flatten=False)
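A usage sketch for the NVFP4 helpers above; the amax / (6 * 448) per-tensor scale is an assumption based on the F4_E2M1_MAX and F8_E4M3_MAX constants in the code, not something this diff prescribes.

import torch

w = torch.randn(128, 256)
per_tensor_scale = w.abs().amax() / (6.0 * 448.0)   # assumed convention: amax / (F4_E2M1_MAX * F8_E4M3_MAX)
packed_fp4, block_scales = stochastic_round_quantize_nvfp4_by_block(w, per_tensor_scale, pad_16x=True, seed=0)
# packed_fp4: uint8 with two E2M1 nibbles per byte, shape (128, 128)
# block_scales: float8_e4m3fn per-16-value block scales rearranged into the cuBLAS blocked layout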

View File

@ -527,7 +527,8 @@ class HookKeyframeGroup:
if self._current_keyframe.get_effective_guarantee_steps(max_sigma) > 0:
break
# if eval_c is outside the percent range, stop looking further
else: break
else:
break
# update steps current context is used
self._current_used_steps += 1
# update current timestep this was performed on

View File

@ -74,6 +74,9 @@ def get_ancestral_step(sigma_from, sigma_to, eta=1.):
def default_noise_sampler(x, seed=None):
if seed is not None:
if x.device == torch.device("cpu"):
seed += 1
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
else:
@ -1557,10 +1560,13 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
@torch.no_grad()
def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5, solver_type="phi_1"):
"""SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2.
arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023)
"""
if solver_type not in {"phi_1", "phi_2"}:
raise ValueError("solver_type must be 'phi_1' or 'phi_2'")
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@ -1600,8 +1606,14 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
# Step 2
denoised_d = torch.lerp(denoised, denoised_2, fac)
x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
if solver_type == "phi_1":
denoised_d = torch.lerp(denoised, denoised_2, fac)
x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
elif solver_type == "phi_2":
b2 = ei_h_phi_2(-h_eta) / r
b1 = ei_h_phi_1(-h_eta) - b2
x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b2 * denoised_2)
if inject_noise:
segment_factor = (r - 1) * h * eta
sde_noise = sde_noise * segment_factor.exp()
@ -1609,6 +1621,17 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
x = x + sde_noise * sigmas[i + 1] * s_noise
return x
@torch.no_grad()
def sample_exp_heun_2_x0(model, x, sigmas, extra_args=None, callback=None, disable=None, solver_type="phi_2"):
"""Deterministic exponential Heun second order method in data prediction (x0) and logSNR time."""
return sample_seeds_2(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None, r=1.0, solver_type=solver_type)
@torch.no_grad()
def sample_exp_heun_2_x0_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type="phi_2"):
"""Stochastic exponential Heun second order method in data prediction (x0) and logSNR time."""
return sample_seeds_2(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=1.0, solver_type=solver_type)
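# Note (standard exponential-integrator identities, not specific to this diff): phi_1(z) = (e^z - 1) / z and
# phi_2(z) = (e^z - z - 1) / z**2. With r = 1.0 the "phi_2" branch above reduces to the classical exponential
# Heun weights b2 = phi_2(-h_eta) and b1 = phi_1(-h_eta) - phi_2(-h_eta).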
@torch.no_grad()
def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
@ -1756,7 +1779,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
# Predictor
if sigmas[i + 1] == 0:
# Denoising step
x = denoised
x_pred = denoised
else:
tau_t = tau_func(sigmas[i + 1])
curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1]
@ -1777,7 +1800,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
if tau_t > 0 and s_noise > 0:
noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise
x_pred = x_pred + noise
return x
return x_pred
@torch.no_grad()

View File

@ -6,7 +6,9 @@ class LatentFormat:
latent_dimensions = 2
latent_rgb_factors = None
latent_rgb_factors_bias = None
latent_rgb_factors_reshape = None
taesd_decoder_name = None
spacial_downscale_ratio = 8
def process_in(self, latent):
return latent * self.scale_factor
@ -178,6 +180,55 @@ class Flux(SD3):
def process_out(self, latent):
return (latent / self.scale_factor) + self.shift_factor
class Flux2(LatentFormat):
latent_channels = 128
spacial_downscale_ratio = 16
def __init__(self):
self.latent_rgb_factors =[
[0.0058, 0.0113, 0.0073],
[0.0495, 0.0443, 0.0836],
[-0.0099, 0.0096, 0.0644],
[0.2144, 0.3009, 0.3652],
[0.0166, -0.0039, -0.0054],
[0.0157, 0.0103, -0.0160],
[-0.0398, 0.0902, -0.0235],
[-0.0052, 0.0095, 0.0109],
[-0.3527, -0.2712, -0.1666],
[-0.0301, -0.0356, -0.0180],
[-0.0107, 0.0078, 0.0013],
[0.0746, 0.0090, -0.0941],
[0.0156, 0.0169, 0.0070],
[-0.0034, -0.0040, -0.0114],
[0.0032, 0.0181, 0.0080],
[-0.0939, -0.0008, 0.0186],
[0.0018, 0.0043, 0.0104],
[0.0284, 0.0056, -0.0127],
[-0.0024, -0.0022, -0.0030],
[0.1207, -0.0026, 0.0065],
[0.0128, 0.0101, 0.0142],
[0.0137, -0.0072, -0.0007],
[0.0095, 0.0092, -0.0059],
[0.0000, -0.0077, -0.0049],
[-0.0465, -0.0204, -0.0312],
[0.0095, 0.0012, -0.0066],
[0.0290, -0.0034, 0.0025],
[0.0220, 0.0169, -0.0048],
[-0.0332, -0.0457, -0.0468],
[-0.0085, 0.0389, 0.0609],
[-0.0076, 0.0003, -0.0043],
[-0.0111, -0.0460, -0.0614],
]
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
def process_in(self, latent):
return latent
def process_out(self, latent):
return latent
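# Shape sketch for latent_rgb_factors_reshape: (B, 128, H, W) -> (B, 32, 2, 2, H, W) -> permute ->
# (B, 32, H, 2, W, 2) -> (B, 32, 2H, 2W); a depth-to-space step that unpacks the 128 packed latent
# channels into 32 channels at twice the spatial resolution before the RGB preview projection.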
class Mochi(LatentFormat):
latent_channels = 12
latent_dimensions = 3
@ -358,6 +409,11 @@ class LTXV(LatentFormat):
self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
class LTXAV(LTXV):
def __init__(self):
self.latent_rgb_factors = None
self.latent_rgb_factors_bias = None
class HunyuanVideo(LatentFormat):
latent_channels = 16
latent_dimensions = 3
@ -382,6 +438,7 @@ class HunyuanVideo(LatentFormat):
]
latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
taesd_decoder_name = "taehv"
class Cosmos1CV8x8x8(LatentFormat):
latent_channels = 16
@ -445,7 +502,7 @@ class Wan21(LatentFormat):
]).view(1, self.latent_channels, 1, 1, 1)
self.taesd_decoder_name = None #TODO
self.taesd_decoder_name = "lighttaew2_1"
def process_in(self, latent):
latents_mean = self.latents_mean.to(latent.device, latent.dtype)
@ -516,6 +573,7 @@ class Wan22(Wan21):
def __init__(self):
self.scale_factor = 1.0
self.taesd_decoder_name = "lighttaew2_2"
self.latents_mean = torch.tensor([
-0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
-0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
@ -611,6 +669,67 @@ class HunyuanImage21Refiner(LatentFormat):
latent_dimensions = 3
scale_factor = 1.03682
def process_in(self, latent):
out = latent * self.scale_factor
out = torch.cat((out[:, :, :1], out), dim=2)
out = out.permute(0, 2, 1, 3, 4)
b, f_times_2, c, h, w = out.shape
out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
out = out.permute(0, 2, 1, 3, 4).contiguous()
return out
def process_out(self, latent):
z = latent / self.scale_factor
z = z.permute(0, 2, 1, 3, 4)
b, f, c, h, w = z.shape
z = z.reshape(b, f, 2, c // 2, h, w)
z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
z = z.permute(0, 2, 1, 3, 4)
z = z[:, :, 1:]
return z
class HunyuanVideo15(LatentFormat):
latent_rgb_factors = [
[ 0.0568, -0.0521, -0.0131],
[ 0.0014, 0.0735, 0.0326],
[ 0.0186, 0.0531, -0.0138],
[-0.0031, 0.0051, 0.0288],
[ 0.0110, 0.0556, 0.0432],
[-0.0041, -0.0023, -0.0485],
[ 0.0530, 0.0413, 0.0253],
[ 0.0283, 0.0251, 0.0339],
[ 0.0277, -0.0372, -0.0093],
[ 0.0393, 0.0944, 0.1131],
[ 0.0020, 0.0251, 0.0037],
[-0.0017, 0.0012, 0.0234],
[ 0.0468, 0.0436, 0.0203],
[ 0.0354, 0.0439, -0.0233],
[ 0.0090, 0.0123, 0.0346],
[ 0.0382, 0.0029, 0.0217],
[ 0.0261, -0.0300, 0.0030],
[-0.0088, -0.0220, -0.0283],
[-0.0272, -0.0121, -0.0363],
[-0.0664, -0.0622, 0.0144],
[ 0.0414, 0.0479, 0.0529],
[ 0.0355, 0.0612, -0.0247],
[ 0.0147, 0.0264, 0.0174],
[ 0.0438, 0.0038, 0.0542],
[ 0.0431, -0.0573, -0.0033],
[-0.0162, -0.0211, -0.0406],
[-0.0487, -0.0295, -0.0393],
[ 0.0005, -0.0109, 0.0253],
[ 0.0296, 0.0591, 0.0353],
[ 0.0119, 0.0181, -0.0306],
[-0.0085, -0.0362, 0.0229],
[ 0.0005, -0.0106, 0.0242]
]
latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644]
latent_channels = 32
latent_dimensions = 3
scale_factor = 1.03682
taesd_decoder_name = "lighttaehy1_5"
class Hunyuan3Dv2(LatentFormat):
latent_channels = 64
latent_dimensions = 1
@ -632,6 +751,7 @@ class ACEAudio(LatentFormat):
class ChromaRadiance(LatentFormat):
latent_channels = 3
spacial_downscale_ratio = 1
def __init__(self):
self.latent_rgb_factors = [

View File

@ -23,8 +23,6 @@ class MusicDCAE(torch.nn.Module):
else:
self.source_sample_rate = source_sample_rate
# self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
self.transform = transforms.Compose([
transforms.Normalize(0.5, 0.5),
])
@ -37,10 +35,6 @@ class MusicDCAE(torch.nn.Module):
self.scale_factor = 0.1786
self.shift_factor = -1.9091
def load_audio(self, audio_path):
audio, sr = torchaudio.load(audio_path)
return audio, sr
def forward_mel(self, audios):
mels = []
for i in range(len(audios)):
@ -73,10 +67,8 @@ class MusicDCAE(torch.nn.Module):
latent = self.dcae.encoder(mel.unsqueeze(0))
latents.append(latent)
latents = torch.cat(latents, dim=0)
# latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
latents = (latents - self.shift_factor) * self.scale_factor
return latents
# return latents, latent_lengths
@torch.no_grad()
def decode(self, latents, audio_lengths=None, sr=None):
@ -91,9 +83,7 @@ class MusicDCAE(torch.nn.Module):
wav = self.vocoder.decode(mels[0]).squeeze(1)
if sr is not None:
# resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
wav = torchaudio.functional.resample(wav, 44100, sr)
# wav = resampler(wav)
else:
sr = 44100
pred_wavs.append(wav)
@ -101,7 +91,6 @@ class MusicDCAE(torch.nn.Module):
if audio_lengths is not None:
pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
return torch.stack(pred_wavs)
# return sr, pred_wavs
def forward(self, audios, audio_lengths=None, sr=None):
latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
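A small sketch of the latent normalization visible in encode above; the inverse used before decoding is an assumption derived from the forward form, not shown in this hunk:
def normalize_dcae_latent(latent, scale_factor=0.1786, shift_factor=-1.9091):
    return (latent - shift_factor) * scale_factor

def denormalize_dcae_latent(latent, scale_factor=0.1786, shift_factor=-1.9091):
    # assumed inverse of the normalization above
    return latent / scale_factor + shift_factor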

comfy/ldm/anima/model.py (new file, 202 lines)

@ -0,0 +1,202 @@
from comfy.ldm.cosmos.predict2 import MiniTrainDIT
import torch
from torch import nn
import torch.nn.functional as F
def rotate_half(x):
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1):
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
x_embed = (x * cos) + (rotate_half(x) * sin)
return x_embed
class RotaryEmbedding(nn.Module):
def __init__(self, head_dim):
super().__init__()
self.rope_theta = 10000
inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
@torch.no_grad()
def forward(self, x, position_ids):
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False): # Force float32
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
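A standalone check (a sketch, not part of the model) that the rotate_half-style rotary embedding built above is a pure per-pair rotation, so it preserves vector norms; the frequencies follow the same theta=10000 schedule:
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

head_dim, seq = 64, 10
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)   # (seq, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                    # (seq, head_dim)
cos, sin = emb.cos(), emb.sin()

x = torch.randn(seq, head_dim)
x_rot = x * cos + rotate_half(x) * sin                     # same form as apply_rotary_pos_emb
assert torch.allclose(x_rot.norm(dim=-1), x.norm(dim=-1), atol=1e-5)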
class Attention(nn.Module):
def __init__(self, query_dim, context_dim, n_heads, head_dim, device=None, dtype=None, operations=None):
super().__init__()
inner_dim = head_dim * n_heads
self.n_heads = n_heads
self.head_dim = head_dim
self.query_dim = query_dim
self.context_dim = context_dim
self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype)
self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
self.o_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype)
def forward(self, x, mask=None, context=None, position_embeddings=None, position_embeddings_context=None):
context = x if context is None else context
input_shape = x.shape[:-1]
q_shape = (*input_shape, self.n_heads, self.head_dim)
context_shape = context.shape[:-1]
kv_shape = (*context_shape, self.n_heads, self.head_dim)
query_states = self.q_norm(self.q_proj(x).view(q_shape)).transpose(1, 2)
key_states = self.k_norm(self.k_proj(context).view(kv_shape)).transpose(1, 2)
value_states = self.v_proj(context).view(kv_shape).transpose(1, 2)
if position_embeddings is not None:
assert position_embeddings_context is not None
cos, sin = position_embeddings
query_states = apply_rotary_pos_emb(query_states, cos, sin)
cos, sin = position_embeddings_context
key_states = apply_rotary_pos_emb(key_states, cos, sin)
attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=mask)
attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output
def init_weights(self):
torch.nn.init.zeros_(self.o_proj.weight)
class TransformerBlock(nn.Module):
def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0, use_self_attn=False, layer_norm=False, device=None, dtype=None, operations=None):
super().__init__()
self.use_self_attn = use_self_attn
if self.use_self_attn:
self.norm_self_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
self.self_attn = Attention(
query_dim=model_dim,
context_dim=model_dim,
n_heads=num_heads,
head_dim=model_dim//num_heads,
device=device,
dtype=dtype,
operations=operations,
)
self.norm_cross_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
self.cross_attn = Attention(
query_dim=model_dim,
context_dim=source_dim,
n_heads=num_heads,
head_dim=model_dim//num_heads,
device=device,
dtype=dtype,
operations=operations,
)
self.norm_mlp = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
self.mlp = nn.Sequential(
operations.Linear(model_dim, int(model_dim * mlp_ratio), device=device, dtype=dtype),
nn.GELU(),
operations.Linear(int(model_dim * mlp_ratio), model_dim, device=device, dtype=dtype)
)
def forward(self, x, context, target_attention_mask=None, source_attention_mask=None, position_embeddings=None, position_embeddings_context=None):
if self.use_self_attn:
normed = self.norm_self_attn(x)
attn_out = self.self_attn(normed, mask=target_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings)
x = x + attn_out
normed = self.norm_cross_attn(x)
attn_out = self.cross_attn(normed, mask=source_attention_mask, context=context, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
x = x + attn_out
x = x + self.mlp(self.norm_mlp(x))
return x
def init_weights(self):
torch.nn.init.zeros_(self.mlp[2].weight)
self.cross_attn.init_weights()
class LLMAdapter(nn.Module):
def __init__(
self,
source_dim=1024,
target_dim=1024,
model_dim=1024,
num_layers=6,
num_heads=16,
use_self_attn=True,
layer_norm=False,
device=None,
dtype=None,
operations=None,
):
super().__init__()
self.embed = operations.Embedding(32128, target_dim, device=device, dtype=dtype)
if model_dim != target_dim:
self.in_proj = operations.Linear(target_dim, model_dim, device=device, dtype=dtype)
else:
self.in_proj = nn.Identity()
self.rotary_emb = RotaryEmbedding(model_dim//num_heads)
self.blocks = nn.ModuleList([
TransformerBlock(source_dim, model_dim, num_heads=num_heads, use_self_attn=use_self_attn, layer_norm=layer_norm, device=device, dtype=dtype, operations=operations) for _ in range(num_layers)
])
self.out_proj = operations.Linear(model_dim, target_dim, device=device, dtype=dtype)
self.norm = operations.RMSNorm(target_dim, eps=1e-6, device=device, dtype=dtype)
def forward(self, source_hidden_states, target_input_ids, target_attention_mask=None, source_attention_mask=None):
if target_attention_mask is not None:
target_attention_mask = target_attention_mask.to(torch.bool)
if target_attention_mask.ndim == 2:
target_attention_mask = target_attention_mask.unsqueeze(1).unsqueeze(1)
if source_attention_mask is not None:
source_attention_mask = source_attention_mask.to(torch.bool)
if source_attention_mask.ndim == 2:
source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1)
x = self.in_proj(self.embed(target_input_ids))
context = source_hidden_states
position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0)
position_embeddings = self.rotary_emb(x, position_ids)
position_embeddings_context = self.rotary_emb(x, position_ids_context)
for block in self.blocks:
x = block(x, context, target_attention_mask=target_attention_mask, source_attention_mask=source_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
return self.norm(self.out_proj(x))
class Anima(MiniTrainDIT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations"))
def preprocess_text_embeds(self, text_embeds, text_ids):
if text_ids is not None:
return self.llm_adapter(text_embeds, text_ids)
else:
return text_embeds
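A hedged usage sketch for the LLMAdapter above: the shapes are read off its forward signature, comfy.ops.disable_weight_init stands in for `operations` (assuming it provides the Linear/RMSNorm/Embedding ops used elsewhere in this diff), and the inputs are placeholders rather than real encoder outputs:
import torch
import comfy.ops
from comfy.ldm.anima.model import LLMAdapter

adapter = LLMAdapter(source_dim=1024, target_dim=1024, model_dim=1024,
                     num_layers=6, num_heads=16,
                     operations=comfy.ops.disable_weight_init)

source = torch.randn(1, 77, 1024)                 # LLM hidden states (B, S, source_dim)
target_ids = torch.randint(0, 32128, (1, 64))     # T5-style token ids (B, T)
out = adapter(source, target_ids)                 # -> (B, T, target_dim); weights are
                                                  # uninitialized here, so this is a shape check only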


@ -1,15 +1,15 @@
import torch
from torch import Tensor, nn
from comfy.ldm.flux.math import attention
from comfy.ldm.flux.layers import (
MLPEmbedder,
RMSNorm,
QKNorm,
SelfAttention,
ModulationOut,
)
# TODO: remove this in a few months
SingleStreamBlock = None
DoubleStreamBlock = None
class ChromaModulationOut(ModulationOut):
@ -48,124 +48,6 @@ class Approximator(nn.Module):
return x
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}):
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
# prepare image for attention
img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
img_qkv = self.img_attn.qkv(img_modulated)
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
# prepare txt for attention
txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
txt_qkv = self.txt_attn.qkv(txt_modulated)
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
# run actual attention
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2),
pe=pe, mask=attn_mask, transformer_options=transformer_options)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
# calculate the img blocks
img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
# calculate the txt blocks
txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
if txt.dtype == torch.float16:
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
return img, txt
class SingleStreamBlock(nn.Module):
"""
A DiT block with parallel linear layers as described in
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float = 4.0,
qk_scale: float = None,
dtype=None,
device=None,
operations=None
):
super().__init__()
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
# qkv and mlp_in
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
# proj and mlp_out
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.mlp_act = nn.GELU(approximate="tanh")
def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor:
mod = vec
x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x.addcmul_(mod.gate, output)
if x.dtype == torch.float16:
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
return x
class LastLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
super().__init__()


@ -11,12 +11,12 @@ import comfy.ldm.common_dit
from comfy.ldm.flux.layers import (
EmbedND,
timestep_embedding,
DoubleStreamBlock,
SingleStreamBlock,
)
from .layers import (
DoubleStreamBlock,
LastLayer,
SingleStreamBlock,
Approximator,
ChromaModulationOut,
)
@ -40,7 +40,8 @@ class ChromaParams:
out_dim: int
hidden_dim: int
n_layers: int
txt_ids_dims: list
vec_in_dim: int
@ -90,6 +91,7 @@ class Chroma(nn.Module):
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
modulation=False,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
@ -98,7 +100,7 @@ class Chroma(nn.Module):
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations)
for _ in range(params.depth_single_blocks)
]
)
@ -178,7 +180,10 @@ class Chroma(nn.Module):
pe = self.pe_embedder(ids)
blocks_replace = patches_replace.get("dit", {})
transformer_options["total_blocks"] = len(self.double_blocks)
transformer_options["block_type"] = "double"
for i, block in enumerate(self.double_blocks):
transformer_options["block_index"] = i
if i not in self.skip_mmdit:
double_mod = (
self.get_modulations(mod_vectors, "double_img", idx=i),
@ -221,7 +226,10 @@ class Chroma(nn.Module):
img = torch.cat((txt, img), 1)
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if i not in self.skip_dit:
single_mod = self.get_modulations(mod_vectors, "single", idx=i)
if ("single_block", i) in blocks_replace:


@ -10,12 +10,10 @@ from torch import Tensor, nn
from einops import repeat
import comfy.ldm.common_dit
from comfy.ldm.flux.layers import EmbedND
from comfy.ldm.flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock
from comfy.ldm.chroma.model import Chroma, ChromaParams
from comfy.ldm.chroma.layers import (
DoubleStreamBlock,
SingleStreamBlock,
Approximator,
)
from .layers import (
@ -39,7 +37,7 @@ class ChromaRadianceParams(ChromaParams):
nerf_final_head_type: str
# None means use the same dtype as the model.
nerf_embedder_dtype: Optional[torch.dtype]
use_x0: bool
class ChromaRadiance(Chroma):
"""
@ -89,7 +87,6 @@ class ChromaRadiance(Chroma):
dtype=dtype, device=device, operations=operations
)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
@ -97,6 +94,7 @@ class ChromaRadiance(Chroma):
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
modulation=False,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
@ -109,6 +107,7 @@ class ChromaRadiance(Chroma):
self.hidden_size,
self.num_heads,
mlp_ratio=params.mlp_ratio,
modulation=False,
dtype=dtype, device=device, operations=operations,
)
for _ in range(params.depth_single_blocks)
@ -160,6 +159,9 @@ class ChromaRadiance(Chroma):
self.skip_dit = []
self.lite = False
if params.use_x0:
self.register_buffer("__x0__", torch.tensor([]))
@property
def _nerf_final_layer(self) -> nn.Module:
if self.params.nerf_final_head_type == "linear":
@ -189,15 +191,15 @@ class ChromaRadiance(Chroma):
nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P]
# Reshape for per-patch processing
nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
# Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
# the tile size.
img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params)
img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
else:
# Reshape for per-patch processing
nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
# Get DCT-encoded pixel embeddings [pixel-dct]
img_dct = self.nerf_image_embedder(nerf_pixels)
@ -240,17 +242,8 @@ class ChromaRadiance(Chroma):
end = min(i + tile_size, num_patches)
# Slice the current tile from the input tensors
nerf_hidden_tile = nerf_hidden[:, i:end, :]
nerf_pixels_tile = nerf_pixels[:, i:end, :]
# Get the actual number of patches in this tile (can be smaller for the last tile)
num_patches_tile = nerf_hidden_tile.shape[1]
# Reshape the tile for per-patch processing
# [B, NumPatches_tile, D] -> [B * NumPatches_tile, D]
nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size)
# [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C]
nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2)
nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
nerf_pixels_tile = nerf_pixels[i * batch:end * batch]
# get DCT-encoded pixel embeddings [pixel-dct]
img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
@ -277,7 +270,7 @@ class ChromaRadiance(Chroma):
bad_keys = tuple(
k
for k, v in overrides.items()
if type(v) != type(getattr(params, k)) and (v is not None or k not in nullable_keys)
if not isinstance(v, type(getattr(params, k))) and (v is not None or k not in nullable_keys)
)
if bad_keys:
e = f"Invalid value(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}"
@ -286,6 +279,12 @@ class ChromaRadiance(Chroma):
params_dict |= overrides
return params.__class__(**params_dict)
def _apply_x0_residual(self, predicted, noisy, timesteps):
# eps is non-zero during training to prevent division by zero; 0.0 at inference
eps = 0.0
return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps)
def _forward(
self,
x: Tensor,
@ -326,4 +325,11 @@ class ChromaRadiance(Chroma):
transformer_options,
attn_mask=kwargs.get("attention_mask", None),
)
return self.forward_nerf(img, img_out, params)[:, :, :h, :w]
out = self.forward_nerf(img, img_out, params)[:, :, :h, :w]
# x0-prediction variant: convert to a v-prediction-style residual and return that instead
if hasattr(self, "__x0__"):
out = self._apply_x0_residual(out, img, timestep)
return out
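The new x0 path converts a clean-image prediction into the residual the rest of the pipeline expects via (noisy - predicted) / t. A tiny standalone check of that algebra, under the assumption that the noisy input was formed as noisy = clean + t * v:
import torch

t = torch.tensor([0.5])
clean = torch.randn(1, 3, 8, 8)
v = torch.randn(1, 3, 8, 8)
noisy = clean + t.view(-1, 1, 1, 1) * v

recovered = (noisy - clean) / t.view(-1, 1, 1, 1)  # same form as _apply_x0_residual
assert torch.allclose(recovered, v, atol=1e-6)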


@ -48,15 +48,44 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
return embedding
class MLPEmbedder(nn.Module):
def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
super().__init__()
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
self.silu = nn.SiLU()
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
def forward(self, x: Tensor) -> Tensor:
return self.out_layer(self.silu(self.in_layer(x)))
class YakMLP(nn.Module):
def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=True, dtype=dtype, device=device)
self.act_fn = nn.SiLU()
def forward(self, x: Tensor) -> Tensor:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None):
if yak_mlp:
return YakMLP(hidden_size, mlp_hidden_dim, dtype=dtype, device=device, operations=operations)
if mlp_silu_act:
return nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
SiLUActivation(),
operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
)
else:
return nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
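build_mlp above selects between a plain GELU MLP, a SiLU-gated MLP (SiLUActivation, defined further down, splits a doubled projection and gates one half with the other), and YakMLP with separate gate/up/down projections. A minimal sketch of the two gated variants using plain tensors; sizes are illustrative:
import torch
import torch.nn.functional as F

hidden, inner = 8, 32
x = torch.randn(2, hidden)

# mlp_silu_act path: one projection to 2 * inner, then silu(x1) * x2 gating
w_in = torch.randn(2 * inner, hidden)
x1, x2 = (x @ w_in.t()).chunk(2, dim=-1)
gated = F.silu(x1) * x2                               # (2, inner)

# yak_mlp path: separate gate and up projections, same gating pattern
w_gate, w_up = torch.randn(inner, hidden), torch.randn(inner, hidden)
yak = F.silu(x @ w_gate.t()) * (x @ w_up.t())         # (2, inner)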
class RMSNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
@ -80,14 +109,14 @@ class QKNorm(torch.nn.Module):
class SelfAttention(nn.Module):
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
@dataclass
@ -98,11 +127,11 @@ class ModulationOut:
class Modulation(nn.Module):
def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
super().__init__()
self.is_double = double
self.multiplier = 6 if double else 3
self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
def forward(self, vec: Tensor) -> tuple:
if vec.ndim == 2:
@ -129,77 +158,107 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
return tensor
class SiLUActivation(nn.Module):
def __init__(self):
super().__init__()
self.gate_fn = nn.SiLU()
def forward(self, x: Tensor) -> Tensor:
x1, x2 = x.chunk(2, dim=-1)
return self.gate_fn(x1) * x2
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.modulation = modulation
if self.modulation:
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
if self.modulation:
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
if self.modulation:
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
else:
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
# prepare image for attention
img_modulated = self.img_norm1(img)
img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
img_qkv = self.img_attn.qkv(img_modulated)
del img_modulated
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del img_qkv
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
# prepare txt for attention
txt_modulated = self.txt_norm1(txt)
txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
txt_qkv = self.txt_attn.qkv(txt_modulated)
del txt_modulated
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del txt_qkv
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
if self.flipped_img_txt:
q = torch.cat((img_q, txt_q), dim=2)
del img_q, txt_q
k = torch.cat((img_k, txt_k), dim=2)
del img_k, txt_k
v = torch.cat((img_v, txt_v), dim=2)
del img_v, txt_v
# run actual attention
attn = attention(torch.cat((img_q, txt_q), dim=2),
torch.cat((img_k, txt_k), dim=2),
torch.cat((img_v, txt_v), dim=2),
attn = attention(q, k, v,
pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
else:
q = torch.cat((txt_q, img_q), dim=2)
del txt_q, img_q
k = torch.cat((txt_k, img_k), dim=2)
del txt_k, img_k
v = torch.cat((txt_v, img_v), dim=2)
del txt_v, img_v
# run actual attention
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2),
attn = attention(q, k, v,
pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
# calculate the img blocks
img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
del img_attn
img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
# calculate the txt blocks
txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
del txt_attn
txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
if txt.dtype == torch.float16:
@ -220,6 +279,10 @@ class SingleStreamBlock(nn.Module):
num_heads: int,
mlp_ratio: float = 4.0,
qk_scale: float = None,
modulation=True,
mlp_silu_act=False,
bias=True,
yak_mlp=False,
dtype=None,
device=None,
operations=None
@ -231,30 +294,55 @@ class SingleStreamBlock(nn.Module):
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.mlp_hidden_dim_first = self.mlp_hidden_dim
self.yak_mlp = yak_mlp
if mlp_silu_act:
self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
self.mlp_act = SiLUActivation()
else:
self.mlp_act = nn.GELU(approximate="tanh")
if self.yak_mlp:
self.mlp_hidden_dim_first *= 2
self.mlp_act = nn.SiLU()
# qkv and mlp_in
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
# proj and mlp_out
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.mlp_act = nn.GELU(approximate="tanh")
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
if modulation:
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
else:
self.modulation = None
def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor:
mod, _ = self.modulation(vec)
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
if self.modulation:
mod, _ = self.modulation(vec)
else:
mod = vec
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del qkv
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
if self.yak_mlp:
mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
else:
mlp = self.mlp_act(mlp)
output = self.linear2(torch.cat((attn, mlp), 2))
x += apply_mod(output, mod.gate, None, modulation_dims)
if x.dtype == torch.float16:
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
@ -262,11 +350,11 @@ class SingleStreamBlock(nn.Module):
class LastLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
if vec.ndim == 2:


@ -4,23 +4,16 @@ from torch import Tensor
from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
import logging
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
q_shape = q.shape
k_shape = k.shape
if pe is not None:
q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
q, k = apply_rope(q, k, pe)
heads = q.shape[1]
x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
return x
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
assert dim % 2 == 0
if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
@ -35,10 +28,20 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
return out.to(dtype=torch.float32, device=pos.device)
def apply_rope1(x: Tensor, freqs_cis: Tensor):
x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
x_out = freqs_cis[..., 0] * x_[..., 0] + freqs_cis[..., 1] * x_[..., 1]
return x_out.reshape(*x.shape).type_as(x)
def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
try:
import comfy.quant_ops
apply_rope = comfy.quant_ops.ck.apply_rope
apply_rope1 = comfy.quant_ops.ck.apply_rope1
except:
logging.warning("No comfy kitchen, using old apply_rope functions.")
def apply_rope1(x: Tensor, freqs_cis: Tensor):
x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
x_out = freqs_cis[..., 0] * x_[..., 0]
x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
return x_out.reshape(*x.shape).type_as(x)
def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
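The fallback apply_rope1 above applies the per-pair 2x2 rotation matrices that rope() emits ([[cos, -sin], [sin, cos]] per frequency). A standalone sketch checking it against a complex-multiplication reference, with a hand-built freqs_cis in the same layout:
import torch

def apply_rope1(x, freqs_cis):
    x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
    x_out = freqs_cis[..., 0] * x_[..., 0]
    x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
    return x_out.reshape(*x.shape).type_as(x)

b, h, n, d = 1, 2, 4, 8
x = torch.randn(b, h, n, d)
theta = torch.rand(b, 1, n, d // 2) * 6.28
freqs_cis = torch.stack([
    torch.stack([theta.cos(), -theta.sin()], dim=-1),  # row [cos, -sin]
    torch.stack([theta.sin(), theta.cos()], dim=-1),   # row [sin,  cos]
], dim=-2)                                             # (b, 1, n, d//2, 2, 2)

ref = torch.view_as_complex(x.reshape(b, h, n, d // 2, 2).float().contiguous())
ref = torch.view_as_real(ref * torch.polar(torch.ones_like(theta), theta)).reshape(b, h, n, d)
assert torch.allclose(apply_rope1(x, freqs_cis), ref, atol=1e-5)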


@ -15,6 +15,8 @@ from .layers import (
MLPEmbedder,
SingleStreamBlock,
timestep_embedding,
Modulation,
RMSNorm
)
@dataclass
@ -33,6 +35,14 @@ class FluxParams:
patch_size: int
qkv_bias: bool
guidance_embed: bool
txt_ids_dims: list
global_modulation: bool = False
mlp_silu_act: bool = False
ops_bias: bool = True
default_ref_method: str = "offset"
ref_index_scale: float = 1.0
yak_mlp: bool = False
txt_norm: bool = False
class Flux(nn.Module):
@ -58,13 +68,22 @@ class Flux(nn.Module):
self.hidden_size = params.hidden_size
self.num_heads = params.num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
if params.vec_in_dim is not None:
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
else:
self.vector_in = None
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
)
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
if params.txt_norm:
self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
else:
self.txt_norm = None
self.double_blocks = nn.ModuleList(
[
@ -73,6 +92,10 @@ class Flux(nn.Module):
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
modulation=params.global_modulation is False,
mlp_silu_act=params.mlp_silu_act,
proj_bias=params.ops_bias,
yak_mlp=params.yak_mlp,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
@ -81,13 +104,30 @@ class Flux(nn.Module):
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations)
for _ in range(params.depth_single_blocks)
]
)
if final_layer:
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
if params.global_modulation:
self.double_stream_modulation_img = Modulation(
self.hidden_size,
double=True,
bias=False,
dtype=dtype, device=device, operations=operations
)
self.double_stream_modulation_txt = Modulation(
self.hidden_size,
double=True,
bias=False,
dtype=dtype, device=device, operations=operations
)
self.single_stream_modulation = Modulation(
self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
)
def forward_orig(
self,
@ -103,9 +143,6 @@ class Flux(nn.Module):
attn_mask: Tensor = None,
) -> Tensor:
if y is None:
y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
patches = transformer_options.get("patches", {})
patches_replace = transformer_options.get("patches_replace", {})
if img.ndim != 3 or txt.ndim != 3:
@ -118,9 +155,19 @@ class Flux(nn.Module):
if guidance is not None:
vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
if self.vector_in is not None:
if y is None:
y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
if self.txt_norm is not None:
txt = self.txt_norm(txt)
txt = self.txt_in(txt)
vec_orig = vec
if self.params.global_modulation:
vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
if "post_input" in patches:
for p in patches["post_input"]:
out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
@ -136,7 +183,10 @@ class Flux(nn.Module):
pe = None
blocks_replace = patches_replace.get("dit", {})
transformer_options["total_blocks"] = len(self.double_blocks)
transformer_options["block_type"] = "double"
for i, block in enumerate(self.double_blocks):
transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
@ -177,7 +227,13 @@ class Flux(nn.Module):
img = torch.cat((txt, img), 1)
if self.params.global_modulation:
vec, _ = self.single_stream_modulation(vec_orig)
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
@ -207,10 +263,10 @@ class Flux(nn.Module):
img = img[:, txt.shape[1] :, ...]
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
img = self.final_layer(img, vec_orig) # (N, T, patch_size ** 2 * out_channels)
return img
def process_img(self, x, index=0, h_offset=0, w_offset=0):
def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}):
bs, c, h, w = x.shape
patch_size = self.patch_size
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
@ -222,10 +278,22 @@ class Flux(nn.Module):
h_offset = ((h_offset + (patch_size // 2)) // patch_size)
w_offset = ((w_offset + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
steps_h = h_len
steps_w = w_len
rope_options = transformer_options.get("rope_options", None)
if rope_options is not None:
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
index += rope_options.get("shift_t", 0.0)
h_offset += rope_options.get("shift_y", 0.0)
w_offset += rope_options.get("shift_x", 0.0)
img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
img_ids[:, :, 0] = img_ids[:, :, 1] + index
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
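process_img now reads an optional rope_options dict from transformer_options: the positional grid can be stretched (scale_x/scale_y change the linspace endpoints while the token count, steps_h/steps_w, stays fixed) or shifted. A hedged usage sketch; the key names are read off the code above and the values are illustrative, not recommended settings:
transformer_options = {
    "rope_options": {
        "scale_y": 1.5,   # stretch the row position range (token count unchanged)
        "scale_x": 1.5,   # stretch the column position range
        "shift_t": 0.0,   # offset added to the index (temporal) id
        "shift_y": 0.0,   # offset added to row positions
        "shift_x": 0.0,   # offset added to column positions
    }
}
# passed through model.forward(..., transformer_options=transformer_options)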
def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
@ -241,16 +309,16 @@ class Flux(nn.Module):
h_len = ((h_orig + (patch_size // 2)) // patch_size)
w_len = ((w_orig + (patch_size // 2)) // patch_size)
img, img_ids = self.process_img(x)
img, img_ids = self.process_img(x, transformer_options=transformer_options)
img_tokens = img.shape[1]
if ref_latents is not None:
h = 0
w = 0
index = 0
ref_latents_method = kwargs.get("ref_latents_method", "offset")
ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
for ref in ref_latents:
if ref_latents_method == "index":
index += 1
index += self.params.ref_index_scale
h_offset = 0
w_offset = 0
elif ref_latents_method == "uxo":
@ -274,7 +342,12 @@ class Flux(nn.Module):
img = torch.cat([img, kontext], dim=1)
img_ids = torch.cat([img_ids, kontext_ids], dim=1)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
if len(self.params.txt_ids_dims) > 0:
for i in self.params.txt_ids_dims:
txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
out = out[:, :img_tokens]
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig]


@ -6,7 +6,6 @@ import comfy.ldm.flux.layers
import comfy.ldm.modules.diffusionmodules.mmdit
from comfy.ldm.modules.attention import optimized_attention
from dataclasses import dataclass
from einops import repeat
@ -42,6 +41,9 @@ class HunyuanVideoParams:
guidance_embed: bool
byt5: bool
meanflow: bool
use_cond_type_embedding: bool
vision_in_dim: int
meanflow_sum: bool
class SelfAttentionRef(nn.Module):
@ -157,7 +159,10 @@ class TokenRefiner(nn.Module):
t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
# m = mask.float().unsqueeze(-1)
# c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: this works when x.shape[1] matches the token count but might break otherwise
c = x.sum(dim=1) / x.shape[1]
if x.dtype == torch.float16:
c = x.float().sum(dim=1) / x.shape[1]
else:
c = x.sum(dim=1) / x.shape[1]
c = t + self.c_embedder(c.to(x.dtype))
x = self.input_embedder(x)
@ -196,11 +201,15 @@ class HunyuanVideo(nn.Module):
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.dtype = dtype
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
params = HunyuanVideoParams(**kwargs)
self.params = params
self.patch_size = params.patch_size
self.in_channels = params.in_channels
self.out_channels = params.out_channels
self.use_cond_type_embedding = params.use_cond_type_embedding
self.vision_in_dim = params.vision_in_dim
if params.hidden_size % params.num_heads != 0:
raise ValueError(
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@ -266,6 +275,18 @@ class HunyuanVideo(nn.Module):
if final_layer:
self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
# HunyuanVideo 1.5 specific modules
if self.vision_in_dim is not None:
from comfy.ldm.wan.model import MLPProj
self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
else:
self.vision_in = None
if self.use_cond_type_embedding:
# 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
else:
self.cond_type_embedding = None
def forward_orig(
self,
img: Tensor,
@ -276,6 +297,7 @@ class HunyuanVideo(nn.Module):
timesteps: Tensor,
y: Tensor = None,
txt_byt5=None,
clip_fea=None,
guidance: Tensor = None,
guiding_frame_index=None,
ref_latent=None,
@ -296,7 +318,7 @@ class HunyuanVideo(nn.Module):
timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype)
vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype))
vec = (vec + vec_r) / 2
vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2
if ref_latent is not None:
ref_latent_ids = self.img_ids(ref_latent)
@ -331,12 +353,31 @@ class HunyuanVideo(nn.Module):
txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
if self.cond_type_embedding is not None:
self.cond_type_embedding.to(txt.device)
cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
txt = txt + cond_emb.to(txt.dtype)
if self.byt5_in is not None and txt_byt5 is not None:
txt_byt5 = self.byt5_in(txt_byt5)
if self.cond_type_embedding is not None:
cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
else:
txt = torch.cat((txt, txt_byt5), dim=1)
txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
txt = torch.cat((txt, txt_byt5), dim=1)
txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)
if clip_fea is not None:
txt_vision_states = self.vision_in(clip_fea)
if self.cond_type_embedding is not None:
cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
txt_vision_states = txt_vision_states + cond_emb
txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
ids = torch.cat((img_ids, txt_ids), dim=1)
pe = self.pe_embedder(ids)
@ -349,7 +390,10 @@ class HunyuanVideo(nn.Module):
attn_mask = None
blocks_replace = patches_replace.get("dit", {})
transformer_options["total_blocks"] = len(self.double_blocks)
transformer_options["block_type"] = "double"
for i, block in enumerate(self.double_blocks):
transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
@ -371,7 +415,10 @@ class HunyuanVideo(nn.Module):
img = torch.cat((img, txt), 1)
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
@ -430,14 +477,14 @@ class HunyuanVideo(nn.Module):
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
return repeat(img_ids, "h w c -> b (h w) c", b=bs)
def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,
self,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
bs = x.shape[0]
if len(self.patch_size) == 3:
img_ids = self.img_ids(x)
@ -445,5 +492,5 @@ class HunyuanVideo(nn.Module):
else:
img_ids = self.img_ids_2d(x)
txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
return out


@ -0,0 +1,122 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d
from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm
import comfy.model_management
import comfy.model_patcher
class SRResidualCausalBlock3D(nn.Module):
def __init__(self, channels: int):
super().__init__()
self.block = nn.Sequential(
VideoConv3d(channels, channels, kernel_size=3),
nn.SiLU(inplace=True),
VideoConv3d(channels, channels, kernel_size=3),
nn.SiLU(inplace=True),
VideoConv3d(channels, channels, kernel_size=3),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x + self.block(x)
class SRModel3DV2(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
hidden_channels: int = 64,
num_blocks: int = 6,
global_residual: bool = False,
):
super().__init__()
self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)])
self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
self.global_residual = bool(global_residual)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
y = self.in_conv(x)
for blk in self.blocks:
y = blk(y)
y = self.out_conv(y)
if self.global_residual and (y.shape == residual.shape):
y = y + residual
return y
class Upsampler(nn.Module):
def __init__(
self,
z_channels: int,
out_channels: int,
block_out_channels: tuple[int, ...],
num_res_blocks: int = 2,
):
super().__init__()
self.num_res_blocks = num_res_blocks
self.block_out_channels = block_out_channels
self.z_channels = z_channels
ch = block_out_channels[0]
self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
self.up = nn.ModuleList()
for i, tgt in enumerate(block_out_channels):
stage = nn.Module()
stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
out_channels=tgt,
temb_channels=0,
conv_shortcut=False,
conv_op=VideoConv3d, norm_op=RMS_norm)
for j in range(num_res_blocks + 1)])
ch = tgt
self.up.append(stage)
self.norm_out = RMS_norm(ch)
self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
def forward(self, z):
"""
Args:
z: (B, C, T, H, W)
target_shape: (H, W)
"""
# z to block_in
repeats = self.block_out_channels[0] // (self.z_channels)
x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
# upsampling
for stage in self.up:
for blk in stage.block:
x = blk(x)
out = self.conv_out(F.silu(self.norm_out(x)))
return out
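A small sketch of the channel-repeat skip used at the top of Upsampler.forward above: the input latent is repeated along channels to match conv_in's output width so it can be added as a residual (sizes are illustrative):
import torch

z = torch.randn(1, 32, 3, 8, 8)          # (B, z_channels, T, H, W)
block_ch = 128                           # block_out_channels[0], illustrative
repeats = block_ch // z.shape[1]         # 4
skip = z.repeat_interleave(repeats=repeats, dim=1)
print(skip.shape)                        # torch.Size([1, 128, 3, 8, 8])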
UPSAMPLERS = {
"720p": SRModel3DV2,
"1080p": Upsampler,
}
class HunyuanVideo15SRModel():
def __init__(self, model_type, config):
self.load_device = comfy.model_management.vae_device()
offload_device = comfy.model_management.vae_offload_device()
self.dtype = comfy.model_management.vae_dtype(self.load_device)
self.model_class = UPSAMPLERS.get(model_type)
self.model = self.model_class(**config).eval()
self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
def load_sd(self, sd):
return self.model.load_state_dict(sd, strict=True)
def get_sd(self):
return self.model.state_dict()
def resample_latent(self, latent):
comfy.model_management.load_model_gpu(self.patcher)
return self.model(latent.to(self.load_device))


@ -1,11 +1,13 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d
from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed
import comfy.ops
import comfy.ldm.models.autoencoder
import comfy.model_management
ops = comfy.ops.disable_weight_init
class RMS_norm(nn.Module):
def __init__(self, dim):
super().__init__()
@ -14,23 +16,25 @@ class RMS_norm(nn.Module):
self.gamma = nn.Parameter(torch.empty(shape))
def forward(self, x):
return F.normalize(x, dim=1) * self.scale * self.gamma
return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)
class DnSmpl(nn.Module):
def __init__(self, ic, oc, tds=True):
def __init__(self, ic, oc, tds, refiner_vae, op):
super().__init__()
fct = 2 * 2 * 2 if tds else 1 * 2 * 2
assert oc % fct == 0
self.conv = VideoConv3d(ic, oc // fct, kernel_size=3)
self.conv = op(ic, oc // fct, kernel_size=3, stride=1, padding=1)
self.refiner_vae = refiner_vae
self.tds = tds
self.gs = fct * ic // oc
def forward(self, x):
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
r1 = 2 if self.tds else 1
h = self.conv(x)
h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
if self.tds and self.refiner_vae and conv_carry_in is None:
if self.tds:
hf = h[:, :, :1, :, :]
b, c, f, ht, wd = hf.shape
hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
@ -38,14 +42,7 @@ class DnSmpl(nn.Module):
hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
hf = torch.cat([hf, hf], dim=1)
hn = h[:, :, 1:, :, :]
b, c, frms, ht, wd = hn.shape
nf = frms // r1
hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
h = torch.cat([hf, hn], dim=2)
h = h[:, :, 1:, :, :]
xf = x[:, :, :1, :, :]
b, ci, f, ht, wd = xf.shape
@ -53,49 +50,49 @@ class DnSmpl(nn.Module):
xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
B, C, T, H, W = xf.shape
xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)
xn = x[:, :, 1:, :, :]
b, ci, frms, ht, wd = xn.shape
nf = frms // r1
xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
B, C, T, H, W = xn.shape
xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
sc = torch.cat([xf, xn], dim=2)
else:
b, c, frms, ht, wd = h.shape
nf = frms // r1
h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
x = x[:, :, 1:, :, :]
b, ci, frms, ht, wd = x.shape
nf = frms // r1
sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
B, C, T, H, W = sc.shape
sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
if h.shape[2] == 0:
return hf + xf
return h + sc
b, c, frms, ht, wd = h.shape
nf = frms // r1
h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
b, ci, frms, ht, wd = x.shape
nf = frms // r1
x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
B, C, T, H, W = x.shape
x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
if self.tds and self.refiner_vae and conv_carry_in is None:
h = torch.cat([hf, h], dim=2)
x = torch.cat([xf, x], dim=2)
return h + x
class UpSmpl(nn.Module):
def __init__(self, ic, oc, tus=True):
def __init__(self, ic, oc, tus, refiner_vae, op):
super().__init__()
fct = 2 * 2 * 2 if tus else 1 * 2 * 2
self.conv = VideoConv3d(ic, oc * fct, kernel_size=3)
self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
self.refiner_vae = refiner_vae
self.tus = tus
self.rp = fct * oc // ic
def forward(self, x):
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
r1 = 2 if self.tus else 1
h = self.conv(x)
h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
if self.tus:
if self.tus and self.refiner_vae and conv_carry_in is None:
hf = h[:, :, :1, :, :]
b, c, f, ht, wd = hf.shape
nc = c // (2 * 2)
@ -104,14 +101,7 @@ class UpSmpl(nn.Module):
hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
hf = hf[:, : hf.shape[1] // 2]
hn = h[:, :, 1:, :, :]
b, c, frms, ht, wd = hn.shape
nc = c // (r1 * 2 * 2)
hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
h = torch.cat([hf, hn], dim=2)
h = h[:, :, 1:, :, :]
xf = x[:, :, :1, :, :]
b, ci, f, ht, wd = xf.shape
@ -122,109 +112,147 @@ class UpSmpl(nn.Module):
xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
xn = x[:, :, 1:, :, :]
xn = xn.repeat_interleave(repeats=self.rp, dim=1)
b, c, frms, ht, wd = xn.shape
nc = c // (r1 * 2 * 2)
xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
sc = torch.cat([xf, xn], dim=2)
else:
b, c, frms, ht, wd = h.shape
nc = c // (r1 * 2 * 2)
h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
x = x[:, :, 1:, :, :]
sc = x.repeat_interleave(repeats=self.rp, dim=1)
b, c, frms, ht, wd = sc.shape
nc = c // (r1 * 2 * 2)
sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
b, c, frms, ht, wd = h.shape
nc = c // (r1 * 2 * 2)
h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
return h + sc
x = x.repeat_interleave(repeats=self.rp, dim=1)
b, c, frms, ht, wd = x.shape
nc = c // (r1 * 2 * 2)
x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
if self.tus and self.refiner_vae and conv_carry_in is None:
h = torch.cat([hf, h], dim=2)
x = torch.cat([xf, x], dim=2)
return h + x
class Encoder(nn.Module):
def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_):
ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_):
super().__init__()
self.z_channels = z_channels
self.block_out_channels = block_out_channels
self.num_res_blocks = num_res_blocks
self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1)
self.ffactor_temporal = ffactor_temporal
self.refiner_vae = refiner_vae
if self.refiner_vae:
conv_op = CarriedConv3d
norm_op = RMS_norm
else:
conv_op = ops.Conv3d
norm_op = Normalize
self.conv_in = conv_op(in_channels, block_out_channels[0], 3, 1, 1)
self.down = nn.ModuleList()
ch = block_out_channels[0]
depth = (ffactor_spatial >> 1).bit_length()
depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length()
depth_temporal = ((ffactor_spatial // self.ffactor_temporal) >> 1).bit_length()
for i, tgt in enumerate(block_out_channels):
stage = nn.Module()
stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
out_channels=tgt,
temb_channels=0,
conv_op=VideoConv3d, norm_op=RMS_norm)
conv_op=conv_op, norm_op=norm_op)
for j in range(num_res_blocks)])
ch = tgt
if i < depth:
nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch
stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal)
stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
ch = nxt
self.down.append(stage)
self.mid = nn.Module()
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
self.norm_out = RMS_norm(ch)
self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1)
self.norm_out = norm_op(ch)
self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer()
def forward(self, x):
x = self.conv_in(x)
if not self.refiner_vae and x.shape[2] == 1:
x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
for stage in self.down:
for blk in stage.block:
x = blk(x)
if hasattr(stage, 'downsample'):
x = stage.downsample(x)
if self.refiner_vae:
xl = [x[:, :, :1, :, :]]
if x.shape[2] > self.ffactor_temporal:
xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
x = xl
else:
x = [x]
out = []
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
conv_carry_in = None
for i, x1 in enumerate(x):
conv_carry_out = []
if i == len(x) - 1:
conv_carry_out = None
x1 = [ x1 ]
x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
for stage in self.down:
for blk in stage.block:
x1 = blk(x1, None, conv_carry_in, conv_carry_out)
if hasattr(stage, 'downsample'):
x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
out.append(x1)
conv_carry_in = conv_carry_out
out = torch_cat_if_needed(out, dim=2)
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
del out
b, c, t, h, w = x.shape
grp = c // (self.z_channels << 1)
skip = x.view(b, c // grp, grp, t, h, w).mean(2)
out = self.conv_out(F.silu(self.norm_out(x))) + skip
out = self.regul(out)[0]
out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip
if self.refiner_vae:
out = self.regul(out)[0]
out = torch.cat((out[:, :, :1], out), dim=2)
out = out.permute(0, 2, 1, 3, 4)
b, f_times_2, c, h, w = out.shape
out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
out = out.permute(0, 2, 1, 3, 4).contiguous()
return out
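The chunked forward above threads a convolution carry between temporal chunks so causal state from the previous chunk replaces explicit padding. A minimal 1-D sketch of that pattern (illustrative only, using a plain Conv1d instead of the conv_carry_causal_3d helper):
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv1d(4, 4, kernel_size=3)          # causal along the last (time) axis
x = torch.randn(1, 4, 17)

full = conv(F.pad(x, (2, 0)))                  # process everything at once, left padding only

outs, carry = [], None
for chunk in torch.split(x, 6, dim=-1):
    inp = F.pad(chunk, (2, 0)) if carry is None else torch.cat([carry, chunk], dim=-1)
    outs.append(conv(inp))
    carry = chunk[..., -2:]                    # keep the last kernel_size - 1 frames as the next carry
chunked = torch.cat(outs, dim=-1)

assert torch.allclose(full, chunked, atol=1e-5)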
class Decoder(nn.Module):
def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_):
ffactor_spatial, ffactor_temporal, upsample_match_channel=True, refiner_vae=True, **_):
super().__init__()
block_out_channels = block_out_channels[::-1]
self.z_channels = z_channels
self.block_out_channels = block_out_channels
self.num_res_blocks = num_res_blocks
self.refiner_vae = refiner_vae
if self.refiner_vae:
conv_op = CarriedConv3d
norm_op = RMS_norm
else:
conv_op = ops.Conv3d
norm_op = Normalize
ch = block_out_channels[0]
self.conv_in = VideoConv3d(z_channels, ch, 3)
self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
self.mid = nn.Module()
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
self.up = nn.ModuleList()
depth = (ffactor_spatial >> 1).bit_length()
@ -235,33 +263,51 @@ class Decoder(nn.Module):
stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
out_channels=tgt,
temb_channels=0,
conv_op=VideoConv3d, norm_op=RMS_norm)
conv_op=conv_op, norm_op=norm_op)
for j in range(num_res_blocks + 1)])
ch = tgt
if i < depth:
nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch
stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal)
stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
ch = nxt
self.up.append(stage)
self.norm_out = RMS_norm(ch)
self.conv_out = VideoConv3d(ch, out_channels, 3)
self.norm_out = norm_op(ch)
self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
def forward(self, z):
z = z.permute(0, 2, 1, 3, 4)
b, f, c, h, w = z.shape
z = z.reshape(b, f, 2, c // 2, h, w)
z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
z = z.permute(0, 2, 1, 3, 4)
z = z[:, :, 1:]
x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
for stage in self.up:
for blk in stage.block:
x = blk(x)
if hasattr(stage, 'upsample'):
x = stage.upsample(x)
if self.refiner_vae:
x = torch.split(x, 2, dim=2)
else:
x = [ x ]
out = []
conv_carry_in = None
for i, x1 in enumerate(x):
conv_carry_out = []
if i == len(x) - 1:
conv_carry_out = None
for stage in self.up:
for blk in stage.block:
x1 = blk(x1, None, conv_carry_in, conv_carry_out)
if hasattr(stage, 'upsample'):
x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
x1 = [ F.silu(self.norm_out(x1)) ]
x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
out.append(x1)
conv_carry_in = conv_carry_out
del x
out = torch_cat_if_needed(out, dim=2)
if not self.refiner_vae:
if z.shape[-3] == 1:
out = out[:, :, -1:]
return out
return self.conv_out(F.silu(self.norm_out(x)))

View File

@ -0,0 +1,413 @@
import torch
from torch import nn
import math
import comfy.ldm.common_dit
from comfy.ldm.modules.attention import optimized_attention
from comfy.ldm.flux.math import apply_rope1
from comfy.ldm.flux.layers import EmbedND
def attention(q, k, v, heads, transformer_options={}):
return optimized_attention(
q.transpose(1, 2),
k.transpose(1, 2),
v.transpose(1, 2),
heads=heads,
skip_reshape=True,
transformer_options=transformer_options
)
def apply_scale_shift_norm(norm, x, scale, shift):
return torch.addcmul(shift, norm(x), scale + 1.0)
def apply_gate_sum(x, out, gate):
return torch.addcmul(x, gate, out)
def get_shift_scale_gate(params):
shift, scale, gate = torch.chunk(params, 3, dim=-1)
return tuple(x.unsqueeze(1) for x in (shift, scale, gate))
def get_freqs(dim, max_period=10000.0):
return torch.exp(-math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim)
class TimeEmbeddings(nn.Module):
def __init__(self, model_dim, time_dim, max_period=10000.0, operation_settings=None):
super().__init__()
assert model_dim % 2 == 0
self.model_dim = model_dim
self.max_period = max_period
self.register_buffer("freqs", get_freqs(model_dim // 2, max_period), persistent=False)
operations = operation_settings.get("operations")
self.in_layer = operations.Linear(model_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.activation = nn.SiLU()
self.out_layer = operations.Linear(time_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, timestep, dtype):
args = torch.outer(timestep, self.freqs.to(device=timestep.device))
time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype)
time_embed = self.out_layer(self.activation(self.in_layer(time_embed)))
return time_embed
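The embedding above is the standard sinusoidal scheme: geometrically spaced frequencies (get_freqs) multiplied by the timestep, then cos/sin halves concatenated before the two-layer MLP. A compact sketch of the tensor it produces, with hypothetical sizes:
import math
import torch

model_dim, max_period = 8, 10000.0
freqs = torch.exp(-math.log(max_period) * torch.arange(model_dim // 2, dtype=torch.float32) / (model_dim // 2))
timestep = torch.tensor([0.0, 250.0, 999.0])
args = torch.outer(timestep, freqs)
emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)  # (3, model_dim), fed to in_layer/out_layer above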
class TextEmbeddings(nn.Module):
def __init__(self, text_dim, model_dim, operation_settings=None):
super().__init__()
operations = operation_settings.get("operations")
self.in_layer = operations.Linear(text_dim, model_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.norm = operations.LayerNorm(model_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, text_embed):
text_embed = self.in_layer(text_embed)
return self.norm(text_embed).type_as(text_embed)
class VisualEmbeddings(nn.Module):
def __init__(self, visual_dim, model_dim, patch_size, operation_settings=None):
super().__init__()
self.patch_size = patch_size
operations = operation_settings.get("operations")
self.in_layer = operations.Linear(visual_dim, model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, x):
x = x.movedim(1, -1) # B C T H W -> B T H W C
B, T, H, W, dim = x.shape
pt, ph, pw = self.patch_size
x = x.view(
B,
T // pt, pt,
H // ph, ph,
W // pw, pw,
dim,
).permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7)
return self.in_layer(x)
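The patchify above is pure view/permute bookkeeping; a small sketch of the resulting shapes with hypothetical sizes (the real visual_embed_dim may also include conditioning channels):
import torch

B, C, T, H, W = 1, 16, 4, 8, 8
pt, ph, pw = 1, 2, 2
x = torch.randn(B, C, T, H, W).movedim(1, -1)            # B T H W C
x = x.view(B, T // pt, pt, H // ph, ph, W // pw, pw, C)
x = x.permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7)      # B, T/pt, H/ph, W/pw, pt*ph*pw*C
assert x.shape == (B, T // pt, H // ph, W // pw, pt * ph * pw * C)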
class Modulation(nn.Module):
def __init__(self, time_dim, model_dim, num_params, operation_settings=None):
super().__init__()
self.activation = nn.SiLU()
self.out_layer = operation_settings.get("operations").Linear(time_dim, num_params * model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, x):
return self.out_layer(self.activation(x))
class SelfAttention(nn.Module):
def __init__(self, num_channels, head_dim, operation_settings=None):
super().__init__()
assert num_channels % head_dim == 0
self.num_heads = num_channels // head_dim
self.head_dim = head_dim
operations = operation_settings.get("operations")
self.to_query = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.to_key = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.to_value = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.query_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.key_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.out_layer = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.num_chunks = 2
def _compute_qk(self, x, freqs, proj_fn, norm_fn):
result = proj_fn(x).view(*x.shape[:-1], self.num_heads, -1)
return apply_rope1(norm_fn(result), freqs)
def _forward(self, x, freqs, transformer_options={}):
q = self._compute_qk(x, freqs, self.to_query, self.query_norm)
k = self._compute_qk(x, freqs, self.to_key, self.key_norm)
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
return self.out_layer(out)
def _forward_chunked(self, x, freqs, transformer_options={}):
def process_chunks(proj_fn, norm_fn):
x_chunks = torch.chunk(x, self.num_chunks, dim=1)
freqs_chunks = torch.chunk(freqs, self.num_chunks, dim=1)
chunks = []
for x_chunk, freqs_chunk in zip(x_chunks, freqs_chunks):
chunks.append(self._compute_qk(x_chunk, freqs_chunk, proj_fn, norm_fn))
return torch.cat(chunks, dim=1)
q = process_chunks(self.to_query, self.query_norm)
k = process_chunks(self.to_key, self.key_norm)
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
return self.out_layer(out)
def forward(self, x, freqs, transformer_options={}):
if x.shape[1] > 8192:
return self._forward_chunked(x, freqs, transformer_options=transformer_options)
else:
return self._forward(x, freqs, transformer_options=transformer_options)
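The chunked path only splits the Q/K projections and RoPE along the sequence dimension; the attention itself still sees the full sequence, so the result is exact because Linear acts independently per token (the FeedForward below chunks for the same reason). A quick sketch of that equivalence:
import torch
import torch.nn as nn

proj = nn.Linear(64, 64)
x = torch.randn(2, 10_000, 64)   # hypothetical long sequence
full = proj(x)
chunked = torch.cat([proj(c) for c in torch.chunk(x, 2, dim=1)], dim=1)
assert torch.allclose(full, chunked, atol=1e-6)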
class CrossAttention(SelfAttention):
def get_qkv(self, x, context):
q = self.to_query(x).view(*x.shape[:-1], self.num_heads, -1)
k = self.to_key(context).view(*context.shape[:-1], self.num_heads, -1)
v = self.to_value(context).view(*context.shape[:-1], self.num_heads, -1)
return q, k, v
def forward(self, x, context, transformer_options={}):
q, k, v = self.get_qkv(x, context)
out = attention(self.query_norm(q), self.key_norm(k), v, self.num_heads, transformer_options=transformer_options)
return self.out_layer(out)
class FeedForward(nn.Module):
def __init__(self, dim, ff_dim, operation_settings=None):
super().__init__()
operations = operation_settings.get("operations")
self.in_layer = operations.Linear(dim, ff_dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.activation = nn.GELU()
self.out_layer = operations.Linear(ff_dim, dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.num_chunks = 4
def _forward(self, x):
return self.out_layer(self.activation(self.in_layer(x)))
def _forward_chunked(self, x):
chunks = torch.chunk(x, self.num_chunks, dim=1)
output_chunks = []
for chunk in chunks:
output_chunks.append(self._forward(chunk))
return torch.cat(output_chunks, dim=1)
def forward(self, x):
if x.shape[1] > 8192:
return self._forward_chunked(x)
else:
return self._forward(x)
class OutLayer(nn.Module):
def __init__(self, model_dim, time_dim, visual_dim, patch_size, operation_settings=None):
super().__init__()
self.patch_size = patch_size
self.modulation = Modulation(time_dim, model_dim, 2, operation_settings=operation_settings)
operations = operation_settings.get("operations")
self.norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.out_layer = operations.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, visual_embed, time_embed):
B, T, H, W, _ = visual_embed.shape
shift, scale = torch.chunk(self.modulation(time_embed), 2, dim=-1)
scale = scale[:, None, None, None, :]
shift = shift[:, None, None, None, :]
visual_embed = apply_scale_shift_norm(self.norm, visual_embed, scale, shift)
x = self.out_layer(visual_embed)
out_dim = x.shape[-1] // (self.patch_size[0] * self.patch_size[1] * self.patch_size[2])
x = x.view(
B, T, H, W,
out_dim,
self.patch_size[0], self.patch_size[1], self.patch_size[2]
)
return x.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(2, 3).flatten(3, 4).flatten(4, 5)
class TransformerEncoderBlock(nn.Module):
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
super().__init__()
self.text_modulation = Modulation(time_dim, model_dim, 6, operation_settings=operation_settings)
operations = operation_settings.get("operations")
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
def forward(self, x, time_embed, freqs, transformer_options={}):
self_attn_params, ff_params = torch.chunk(self.text_modulation(time_embed), 2, dim=-1)
shift, scale, gate = get_shift_scale_gate(self_attn_params)
out = apply_scale_shift_norm(self.self_attention_norm, x, scale, shift)
out = self.self_attention(out, freqs, transformer_options=transformer_options)
x = apply_gate_sum(x, out, gate)
shift, scale, gate = get_shift_scale_gate(ff_params)
out = apply_scale_shift_norm(self.feed_forward_norm, x, scale, shift)
out = self.feed_forward(out)
x = apply_gate_sum(x, out, gate)
return x
class TransformerDecoderBlock(nn.Module):
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
super().__init__()
self.visual_modulation = Modulation(time_dim, model_dim, 9, operation_settings=operation_settings)
operations = operation_settings.get("operations")
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
self.cross_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.cross_attention = CrossAttention(model_dim, head_dim, operation_settings=operation_settings)
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
def forward(self, visual_embed, text_embed, time_embed, freqs, transformer_options={}):
self_attn_params, cross_attn_params, ff_params = torch.chunk(self.visual_modulation(time_embed), 3, dim=-1)
# self attention
shift, scale, gate = get_shift_scale_gate(self_attn_params)
visual_out = apply_scale_shift_norm(self.self_attention_norm, visual_embed, scale, shift)
visual_out = self.self_attention(visual_out, freqs, transformer_options=transformer_options)
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
# cross attention
shift, scale, gate = get_shift_scale_gate(cross_attn_params)
visual_out = apply_scale_shift_norm(self.cross_attention_norm, visual_embed, scale, shift)
visual_out = self.cross_attention(visual_out, text_embed, transformer_options=transformer_options)
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
# feed forward
shift, scale, gate = get_shift_scale_gate(ff_params)
visual_out = apply_scale_shift_norm(self.feed_forward_norm, visual_embed, scale, shift)
visual_out = self.feed_forward(visual_out)
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
return visual_embed
class Kandinsky5(nn.Module):
def __init__(
self,
in_visual_dim=16, out_visual_dim=16, in_text_dim=3584, in_text_dim2=768, time_dim=512,
model_dim=1792, ff_dim=7168, visual_embed_dim=132, patch_size=(1, 2, 2), num_text_blocks=2, num_visual_blocks=32,
axes_dims=(16, 24, 24), rope_scale_factor=(1.0, 2.0, 2.0),
dtype=None, device=None, operations=None, **kwargs
):
super().__init__()
head_dim = sum(axes_dims)
self.rope_scale_factor = rope_scale_factor
self.in_visual_dim = in_visual_dim
self.model_dim = model_dim
self.patch_size = patch_size
self.visual_embed_dim = visual_embed_dim
self.dtype = dtype
self.device = device
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
self.time_embeddings = TimeEmbeddings(model_dim, time_dim, operation_settings=operation_settings)
self.text_embeddings = TextEmbeddings(in_text_dim, model_dim, operation_settings=operation_settings)
self.pooled_text_embeddings = TextEmbeddings(in_text_dim2, time_dim, operation_settings=operation_settings)
self.visual_embeddings = VisualEmbeddings(visual_embed_dim, model_dim, patch_size, operation_settings=operation_settings)
self.text_transformer_blocks = nn.ModuleList(
[TransformerEncoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_text_blocks)]
)
self.visual_transformer_blocks = nn.ModuleList(
[TransformerDecoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_visual_blocks)]
)
self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size, operation_settings=operation_settings)
self.rope_embedder_3d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=axes_dims)
self.rope_embedder_1d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=[head_dim])
def rope_encode_1d(self, seq_len, seq_start=0, steps=None, device=None, dtype=None, transformer_options={}):
steps = seq_len if steps is None else steps
seq_ids = torch.linspace(seq_start, seq_start + (seq_len - 1), steps=steps, device=device, dtype=dtype)
seq_ids = seq_ids.reshape(-1, 1).unsqueeze(0) # Shape: (1, steps, 1)
freqs = self.rope_embedder_1d(seq_ids).movedim(1, 2)
return freqs
def rope_encode_3d(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}):
patch_size = self.patch_size
t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
if steps_t is None:
steps_t = t_len
if steps_h is None:
steps_h = h_len
if steps_w is None:
steps_w = w_len
h_start = 0
w_start = 0
rope_options = transformer_options.get("rope_options", None)
if rope_options is not None:
t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
t_start += rope_options.get("shift_t", 0.0)
h_start += rope_options.get("shift_y", 0.0)
w_start += rope_options.get("shift_x", 0.0)
else:
rope_scale_factor = self.rope_scale_factor
if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions
if h * w >= 14080:
rope_scale_factor = (1.0, 3.16, 3.16)
t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0
h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0
w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0
img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
freqs = self.rope_embedder_3d(img_ids).movedim(1, 2)
return freqs
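The scaling above keeps the first position anchored and compresses the stride between positional ids rather than truncating the range; a small sketch for the height axis under the default scale factor of 2.0 (sizes are illustrative):
import torch

h_len, scale = 9, 2.0                       # 9 patches along height, scale from rope_scale_factor
scaled_len = (h_len - 1.0) / scale + 1.0    # 5.0
ids = torch.linspace(0, scaled_len - 1, steps=h_len)
# tensor([0.0, 0.5, 1.0, ..., 4.0]) -> same starting point, half the stride between ids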
def forward_orig(self, x, timestep, context, y, freqs, freqs_text, transformer_options={}, **kwargs):
patches_replace = transformer_options.get("patches_replace", {})
context = self.text_embeddings(context)
time_embed = self.time_embeddings(timestep, x.dtype) + self.pooled_text_embeddings(y)
for block in self.text_transformer_blocks:
context = block(context, time_embed, freqs_text, transformer_options=transformer_options)
visual_embed = self.visual_embeddings(x)
visual_shape = visual_embed.shape[:-1]
visual_embed = visual_embed.flatten(1, -2)
blocks_replace = patches_replace.get("dit", {})
transformer_options["total_blocks"] = len(self.visual_transformer_blocks)
transformer_options["block_type"] = "double"
for i, block in enumerate(self.visual_transformer_blocks):
transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
return block(x=args["x"], context=args["context"], time_embed=args["time_embed"], freqs=args["freqs"], transformer_options=args.get("transformer_options"))
visual_embed = blocks_replace[("double_block", i)]({"x": visual_embed, "context": context, "time_embed": time_embed, "freqs": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})["x"]
else:
visual_embed = block(visual_embed, context, time_embed, freqs=freqs, transformer_options=transformer_options)
visual_embed = visual_embed.reshape(*visual_shape, -1)
return self.out_layer(visual_embed, time_embed)
def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
original_dims = x.ndim
if original_dims == 4:
x = x.unsqueeze(2)
bs, c, t_len, h, w = x.shape
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
if time_dim_replace is not None:
time_dim_replace = comfy.ldm.common_dit.pad_to_patch_size(time_dim_replace, self.patch_size)
x[:, :time_dim_replace.shape[1], :time_dim_replace.shape[2]] = time_dim_replace
freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options)
out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
if original_dims == 4:
out = out.squeeze(2)
return out
def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,
self,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
).execute(x, timestep, context, y, time_dim_replace=time_dim_replace, transformer_options=transformer_options, **kwargs)

View File

@ -0,0 +1,871 @@
from typing import Tuple
import torch
import torch.nn as nn
from comfy.ldm.lightricks.model import (
CrossAttention,
FeedForward,
AdaLayerNormSingle,
PixArtAlphaTextProjection,
LTXVModel,
)
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
import comfy.ldm.common_dit
class CompressedTimestep:
"""Store video timestep embeddings in compressed form using per-frame indexing."""
__slots__ = ('data', 'batch_size', 'num_frames', 'patches_per_frame', 'feature_dim')
def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
"""
tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
"""
self.batch_size, num_tokens, self.feature_dim = tensor.shape
# Check if compression is valid (num_tokens must be divisible by patches_per_frame)
if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
self.patches_per_frame = patches_per_frame
self.num_frames = num_tokens // patches_per_frame
# Reshape to [batch, frames, patches_per_frame, feature_dim] and store one value per frame
# All patches in a frame are identical, so we only keep the first one
reshaped = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)
self.data = reshaped[:, :, 0, :].contiguous() # [batch, frames, feature_dim]
else:
# Not divisible or too small - store directly without compression
self.patches_per_frame = 1
self.num_frames = num_tokens
self.data = tensor
def expand(self):
"""Expand back to original tensor."""
if self.patches_per_frame == 1:
return self.data
# [batch, frames, feature_dim] -> [batch, frames, patches_per_frame, feature_dim] -> [batch, tokens, feature_dim]
expanded = self.data.unsqueeze(2).expand(self.batch_size, self.num_frames, self.patches_per_frame, self.feature_dim)
return expanded.reshape(self.batch_size, -1, self.feature_dim)
def expand_for_computation(self, scale_shift_table: torch.Tensor, batch_size: int, indices: slice = slice(None, None)):
"""Compute ada values on compressed per-frame data, then expand spatially."""
num_ada_params = scale_shift_table.shape[0]
# No compression - compute directly
if self.patches_per_frame == 1:
num_tokens = self.data.shape[1]
dim_per_param = self.feature_dim // num_ada_params
reshaped = self.data.reshape(batch_size, num_tokens, num_ada_params, dim_per_param)[:, :, indices, :]
table_values = scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=self.data.device, dtype=self.data.dtype)
ada_values = (table_values + reshaped).unbind(dim=2)
return ada_values
# Compressed: compute on per-frame data then expand spatially
# Reshape: [batch, frames, feature_dim] -> [batch, frames, num_ada_params, dim_per_param]
frame_reshaped = self.data.reshape(batch_size, self.num_frames, num_ada_params, -1)[:, :, indices, :]
table_values = scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(
device=self.data.device, dtype=self.data.dtype
)
frame_ada = (table_values + frame_reshaped).unbind(dim=2)
# Expand each ada parameter spatially: [batch, frames, dim] -> [batch, frames, patches, dim] -> [batch, tokens, dim]
return tuple(
frame_val.unsqueeze(2).expand(batch_size, self.num_frames, self.patches_per_frame, -1)
.reshape(batch_size, -1, frame_val.shape[-1])
for frame_val in frame_ada
)
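A minimal sketch of the invariant this class relies on (sizes are illustrative, and it assumes the CompressedTimestep class above is in scope): when every spatial patch in a frame shares the same timestep embedding, keeping one row per frame and re-expanding reproduces the original tensor exactly:
import torch

batch, frames, patches_per_frame, dim = 1, 4, 6, 8
per_frame = torch.randn(batch, frames, 1, dim)
full = per_frame.expand(batch, frames, patches_per_frame, dim).reshape(batch, -1, dim)

ct = CompressedTimestep(full, patches_per_frame)
assert ct.data.shape == (batch, frames, dim)   # one row kept per frame
assert torch.equal(ct.expand(), full)          # lossless round trip for frame-constant input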
class BasicAVTransformerBlock(nn.Module):
def __init__(
self,
v_dim,
a_dim,
v_heads,
a_heads,
vd_head,
ad_head,
v_context_dim=None,
a_context_dim=None,
attn_precision=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
self.attn_precision = attn_precision
self.attn1 = CrossAttention(
query_dim=v_dim,
heads=v_heads,
dim_head=vd_head,
context_dim=None,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.audio_attn1 = CrossAttention(
query_dim=a_dim,
heads=a_heads,
dim_head=ad_head,
context_dim=None,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.attn2 = CrossAttention(
query_dim=v_dim,
context_dim=v_context_dim,
heads=v_heads,
dim_head=vd_head,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.audio_attn2 = CrossAttention(
query_dim=a_dim,
context_dim=a_context_dim,
heads=a_heads,
dim_head=ad_head,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
# Q: Video, K,V: Audio
self.audio_to_video_attn = CrossAttention(
query_dim=v_dim,
context_dim=a_dim,
heads=a_heads,
dim_head=ad_head,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
# Q: Audio, K,V: Video
self.video_to_audio_attn = CrossAttention(
query_dim=a_dim,
context_dim=v_dim,
heads=a_heads,
dim_head=ad_head,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.ff = FeedForward(
v_dim, dim_out=v_dim, glu=True, dtype=dtype, device=device, operations=operations
)
self.audio_ff = FeedForward(
a_dim, dim_out=a_dim, glu=True, dtype=dtype, device=device, operations=operations
)
self.scale_shift_table = nn.Parameter(torch.empty(6, v_dim, device=device, dtype=dtype))
self.audio_scale_shift_table = nn.Parameter(
torch.empty(6, a_dim, device=device, dtype=dtype)
)
self.scale_shift_table_a2v_ca_audio = nn.Parameter(
torch.empty(5, a_dim, device=device, dtype=dtype)
)
self.scale_shift_table_a2v_ca_video = nn.Parameter(
torch.empty(5, v_dim, device=device, dtype=dtype)
)
def get_ada_values(
self, scale_shift_table: torch.Tensor, batch_size: int, timestep: torch.Tensor, indices: slice = slice(None, None)
):
if isinstance(timestep, CompressedTimestep):
return timestep.expand_for_computation(scale_shift_table, batch_size, indices)
num_ada_params = scale_shift_table.shape[0]
ada_values = (
scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=timestep.device, dtype=timestep.dtype)
+ timestep.reshape(batch_size, timestep.shape[1], num_ada_params, -1)[:, :, indices, :]
).unbind(dim=2)
return ada_values
def get_av_ca_ada_values(
self,
scale_shift_table: torch.Tensor,
batch_size: int,
scale_shift_timestep: torch.Tensor,
gate_timestep: torch.Tensor,
num_scale_shift_values: int = 4,
):
scale_shift_ada_values = self.get_ada_values(
scale_shift_table[:num_scale_shift_values, :],
batch_size,
scale_shift_timestep,
)
gate_ada_values = self.get_ada_values(
scale_shift_table[num_scale_shift_values:, :],
batch_size,
gate_timestep,
)
return (*scale_shift_ada_values, *gate_ada_values)
def forward(
self, x: Tuple[torch.Tensor, torch.Tensor], v_context=None, a_context=None, attention_mask=None, v_timestep=None, a_timestep=None,
v_pe=None, a_pe=None, v_cross_pe=None, a_cross_pe=None, v_cross_scale_shift_timestep=None, a_cross_scale_shift_timestep=None,
v_cross_gate_timestep=None, a_cross_gate_timestep=None, transformer_options=None,
) -> Tuple[torch.Tensor, torch.Tensor]:
run_vx = transformer_options.get("run_vx", True)
run_ax = transformer_options.get("run_ax", True)
vx, ax = x
run_ax = run_ax and ax.numel() > 0
run_a2v = run_vx and transformer_options.get("a2v_cross_attn", True) and ax.numel() > 0
run_v2a = run_ax and transformer_options.get("v2a_cross_attn", True)
# video
if run_vx:
# video self-attention
vshift_msa, vscale_msa = (self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 2)))
norm_vx = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_msa) + vshift_msa
del vshift_msa, vscale_msa
attn1_out = self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options)
del norm_vx
# video cross-attention
vgate_msa = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(2, 3))[0]
vx.addcmul_(attn1_out, vgate_msa)
del vgate_msa, attn1_out
vx.add_(self.attn2(comfy.ldm.common_dit.rms_norm(vx), context=v_context, mask=attention_mask, transformer_options=transformer_options))
# audio
if run_ax:
# audio self-attention
ashift_msa, ascale_msa = (self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 2)))
norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa
del ashift_msa, ascale_msa
attn1_out = self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
del norm_ax
# audio cross-attention
agate_msa = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(2, 3))[0]
ax.addcmul_(attn1_out, agate_msa)
del agate_msa, attn1_out
ax.add_(self.audio_attn2(comfy.ldm.common_dit.rms_norm(ax), context=a_context, mask=attention_mask, transformer_options=transformer_options))
# video - audio cross attention.
if run_a2v or run_v2a:
vx_norm3 = comfy.ldm.common_dit.rms_norm(vx)
ax_norm3 = comfy.ldm.common_dit.rms_norm(ax)
# audio to video cross attention
if run_a2v:
scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v = self.get_ada_values(
self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[:2]
scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v = self.get_ada_values(
self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[:2]
vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_a2v_v) + shift_ca_video_hidden_states_a2v_v
ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v) + shift_ca_audio_hidden_states_a2v
del scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v, scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v
a2v_out = self.audio_to_video_attn(vx_scaled, context=ax_scaled, pe=v_cross_pe, k_pe=a_cross_pe, transformer_options=transformer_options)
del vx_scaled, ax_scaled
gate_out_a2v = self.get_ada_values(self.scale_shift_table_a2v_ca_video[4:, :], vx.shape[0], v_cross_gate_timestep)[0]
vx.addcmul_(a2v_out, gate_out_a2v)
del gate_out_a2v, a2v_out
# video to audio cross attention
if run_v2a:
scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a = self.get_ada_values(
self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[2:4]
scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a = self.get_ada_values(
self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[2:4]
ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a) + shift_ca_audio_hidden_states_v2a
vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_v2a) + shift_ca_video_hidden_states_v2a
del scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a, scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a
v2a_out = self.video_to_audio_attn(ax_scaled, context=vx_scaled, pe=a_cross_pe, k_pe=v_cross_pe, transformer_options=transformer_options)
del ax_scaled, vx_scaled
gate_out_v2a = self.get_ada_values(self.scale_shift_table_a2v_ca_audio[4:, :], ax.shape[0], a_cross_gate_timestep)[0]
ax.addcmul_(v2a_out, gate_out_v2a)
del gate_out_v2a, v2a_out
del vx_norm3, ax_norm3
# video feedforward
if run_vx:
vshift_mlp, vscale_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, 5))
vx_scaled = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_mlp) + vshift_mlp
del vshift_mlp, vscale_mlp
ff_out = self.ff(vx_scaled)
del vx_scaled
vgate_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(5, 6))[0]
vx.addcmul_(ff_out, vgate_mlp)
del vgate_mlp, ff_out
# audio feedforward
if run_ax:
ashift_mlp, ascale_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, 5))
ax_scaled = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_mlp) + ashift_mlp
del ashift_mlp, ascale_mlp
ff_out = self.audio_ff(ax_scaled)
del ax_scaled
agate_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(5, 6))[0]
ax.addcmul_(ff_out, agate_mlp)
del agate_mlp, ff_out
return vx, ax
class LTXAVModel(LTXVModel):
"""LTXAV model for audio-video generation."""
def __init__(
self,
in_channels=128,
audio_in_channels=128,
cross_attention_dim=4096,
audio_cross_attention_dim=2048,
attention_head_dim=128,
audio_attention_head_dim=64,
num_attention_heads=32,
audio_num_attention_heads=32,
caption_channels=3840,
num_layers=48,
positional_embedding_theta=10000.0,
positional_embedding_max_pos=[20, 2048, 2048],
audio_positional_embedding_max_pos=[20],
causal_temporal_positioning=False,
vae_scale_factors=(8, 32, 32),
use_middle_indices_grid=False,
timestep_scale_multiplier=1000.0,
av_ca_timestep_scale_multiplier=1.0,
dtype=None,
device=None,
operations=None,
**kwargs,
):
# Store audio-specific parameters
self.audio_in_channels = audio_in_channels
self.audio_cross_attention_dim = audio_cross_attention_dim
self.audio_attention_head_dim = audio_attention_head_dim
self.audio_num_attention_heads = audio_num_attention_heads
self.audio_positional_embedding_max_pos = audio_positional_embedding_max_pos
# Calculate audio dimensions
self.audio_inner_dim = audio_num_attention_heads * audio_attention_head_dim
self.audio_out_channels = audio_in_channels
# Audio-specific constants
self.num_audio_channels = 8
self.audio_frequency_bins = 16
self.av_ca_timestep_scale_multiplier = av_ca_timestep_scale_multiplier
super().__init__(
in_channels=in_channels,
cross_attention_dim=cross_attention_dim,
attention_head_dim=attention_head_dim,
num_attention_heads=num_attention_heads,
caption_channels=caption_channels,
num_layers=num_layers,
positional_embedding_theta=positional_embedding_theta,
positional_embedding_max_pos=positional_embedding_max_pos,
causal_temporal_positioning=causal_temporal_positioning,
vae_scale_factors=vae_scale_factors,
use_middle_indices_grid=use_middle_indices_grid,
timestep_scale_multiplier=timestep_scale_multiplier,
dtype=dtype,
device=device,
operations=operations,
**kwargs,
)
def _init_model_components(self, device, dtype, **kwargs):
"""Initialize LTXAV-specific components."""
# Audio-specific projections
self.audio_patchify_proj = self.operations.Linear(
self.audio_in_channels, self.audio_inner_dim, bias=True, dtype=dtype, device=device
)
# Audio-specific AdaLN
self.audio_adaln_single = AdaLayerNormSingle(
self.audio_inner_dim,
use_additional_conditions=False,
dtype=dtype,
device=device,
operations=self.operations,
)
num_scale_shift_values = 4
self.av_ca_video_scale_shift_adaln_single = AdaLayerNormSingle(
self.inner_dim,
use_additional_conditions=False,
embedding_coefficient=num_scale_shift_values,
dtype=dtype,
device=device,
operations=self.operations,
)
self.av_ca_a2v_gate_adaln_single = AdaLayerNormSingle(
self.inner_dim,
use_additional_conditions=False,
embedding_coefficient=1,
dtype=dtype,
device=device,
operations=self.operations,
)
self.av_ca_audio_scale_shift_adaln_single = AdaLayerNormSingle(
self.audio_inner_dim,
use_additional_conditions=False,
embedding_coefficient=num_scale_shift_values,
dtype=dtype,
device=device,
operations=self.operations,
)
self.av_ca_v2a_gate_adaln_single = AdaLayerNormSingle(
self.audio_inner_dim,
use_additional_conditions=False,
embedding_coefficient=1,
dtype=dtype,
device=device,
operations=self.operations,
)
# Audio caption projection
self.audio_caption_projection = PixArtAlphaTextProjection(
in_features=self.caption_channels,
hidden_size=self.audio_inner_dim,
dtype=dtype,
device=device,
operations=self.operations,
)
def _init_transformer_blocks(self, device, dtype, **kwargs):
"""Initialize transformer blocks for LTXAV."""
self.transformer_blocks = nn.ModuleList(
[
BasicAVTransformerBlock(
v_dim=self.inner_dim,
a_dim=self.audio_inner_dim,
v_heads=self.num_attention_heads,
a_heads=self.audio_num_attention_heads,
vd_head=self.attention_head_dim,
ad_head=self.audio_attention_head_dim,
v_context_dim=self.cross_attention_dim,
a_context_dim=self.audio_cross_attention_dim,
dtype=dtype,
device=device,
operations=self.operations,
)
for _ in range(self.num_layers)
]
)
def _init_output_components(self, device, dtype):
"""Initialize output components for LTXAV."""
# Video output components
super()._init_output_components(device, dtype)
# Audio output components
self.audio_scale_shift_table = nn.Parameter(
torch.empty(2, self.audio_inner_dim, dtype=dtype, device=device)
)
self.audio_norm_out = self.operations.LayerNorm(
self.audio_inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device
)
self.audio_proj_out = self.operations.Linear(
self.audio_inner_dim, self.audio_out_channels, dtype=dtype, device=device
)
self.a_patchifier = AudioPatchifier(1, start_end=True)
def separate_audio_and_video_latents(self, x, audio_length):
"""Separate audio and video latents from combined input."""
# vx = x[:, : self.in_channels]
# ax = x[:, self.in_channels :]
#
# ax = ax.reshape(ax.shape[0], -1)
# ax = ax[:, : audio_length * self.num_audio_channels * self.audio_frequency_bins]
#
# ax = ax.reshape(
# ax.shape[0], self.num_audio_channels, audio_length, self.audio_frequency_bins
# )
vx = x[0]
ax = x[1] if len(x) > 1 else torch.zeros(
(vx.shape[0], self.num_audio_channels, 0, self.audio_frequency_bins),
device=vx.device, dtype=vx.dtype
)
return vx, ax
def recombine_audio_and_video_latents(self, vx, ax, target_shape=None):
"""Recombine audio and video latents for output."""
if ax.numel() == 0:
return vx
else:
return [vx, ax]
# if ax.device != vx.device or ax.dtype != vx.dtype:
# logging.warning("Audio and video latents are on different devices or dtypes.")
# ax = ax.to(device=vx.device, dtype=vx.dtype)
# logging.warning(f"Audio audio latent moved to device: {ax.device}, dtype: {ax.dtype}")
#
# ax = ax.reshape(ax.shape[0], -1)
# # pad to f x h x w of the video latents
# divisor = vx.shape[-1] * vx.shape[-2] * vx.shape[-3]
# if target_shape is None:
# repetitions = math.ceil(ax.shape[-1] / divisor)
# else:
# repetitions = target_shape[1] - vx.shape[1]
# padded_len = repetitions * divisor
# ax = F.pad(ax, (0, padded_len - ax.shape[-1]))
# ax = ax.reshape(ax.shape[0], -1, vx.shape[-3], vx.shape[-2], vx.shape[-1])
# return torch.cat([vx, ax], dim=1)
def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
"""Process input for LTXAV - separate audio and video, then patchify."""
audio_length = kwargs.get("audio_length", 0)
# Separate audio and video latents
vx, ax = self.separate_audio_and_video_latents(x, audio_length)
has_spatial_mask = False
if denoise_mask is not None:
# check if any frame has spatial variation (inpainting)
for frame_idx in range(denoise_mask.shape[2]):
frame_mask = denoise_mask[0, 0, frame_idx]
if frame_mask.numel() > 0 and frame_mask.min() != frame_mask.max():
has_spatial_mask = True
break
[vx, v_pixel_coords, additional_args] = super()._process_input(
vx, keyframe_idxs, denoise_mask, **kwargs
)
additional_args["has_spatial_mask"] = has_spatial_mask
ax, a_latent_coords = self.a_patchifier.patchify(ax)
ax = self.audio_patchify_proj(ax)
# additional_args.update({"av_orig_shape": list(x.shape)})
return [vx, ax], [v_pixel_coords, a_latent_coords], additional_args
def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
"""Prepare timestep embeddings."""
# TODO: some code reuse is needed here.
grid_mask = kwargs.get("grid_mask", None)
if grid_mask is not None:
timestep = timestep[:, grid_mask]
timestep_scaled = timestep * self.timestep_scale_multiplier
v_timestep, v_embedded_timestep = self.adaln_single(
timestep_scaled.flatten(),
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
# Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
# Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
orig_shape = kwargs.get("orig_shape")
has_spatial_mask = kwargs.get("has_spatial_mask", None)
v_patches_per_frame = None
if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
# orig_shape[3] = height, orig_shape[4] = width (in latent space)
v_patches_per_frame = orig_shape[3] * orig_shape[4]
# Reshape to [batch_size, num_tokens, dim] and compress for storage
v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame)
v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame)
# Prepare audio timestep
a_timestep = kwargs.get("a_timestep")
if a_timestep is not None:
a_timestep_scaled = a_timestep * self.timestep_scale_multiplier
a_timestep_flat = a_timestep_scaled.flatten()
timestep_flat = timestep_scaled.flatten()
av_ca_factor = self.av_ca_timestep_scale_multiplier / self.timestep_scale_multiplier
# Cross-attention timesteps - compress these too
av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
a_timestep_flat,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
timestep_flat,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
timestep_flat * av_ca_factor,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
a_timestep_flat * av_ca_factor,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
# Compress cross-attention timesteps (only video side, audio is too small to benefit)
# v_patches_per_frame is None for spatial masks, set for temporal masks or no mask
cross_av_timestep_ss = [
av_ca_audio_scale_shift_timestep.view(batch_size, -1, av_ca_audio_scale_shift_timestep.shape[-1]),
CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame), # video - compressed if possible
CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame), # video - compressed if possible
av_ca_v2a_gate_noise_timestep.view(batch_size, -1, av_ca_v2a_gate_noise_timestep.shape[-1]),
]
a_timestep, a_embedded_timestep = self.audio_adaln_single(
a_timestep_flat,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
# Audio timesteps
a_timestep = a_timestep.view(batch_size, -1, a_timestep.shape[-1])
a_embedded_timestep = a_embedded_timestep.view(batch_size, -1, a_embedded_timestep.shape[-1])
else:
a_timestep = timestep_scaled
a_embedded_timestep = kwargs.get("embedded_timestep")
cross_av_timestep_ss = []
return [v_timestep, a_timestep, cross_av_timestep_ss], [
v_embedded_timestep,
a_embedded_timestep,
]
def _prepare_context(self, context, batch_size, x, attention_mask=None):
vx = x[0]
ax = x[1]
v_context, a_context = torch.split(
context, int(context.shape[-1] / 2), len(context.shape) - 1
)
v_context, attention_mask = super()._prepare_context(
v_context, batch_size, vx, attention_mask
)
if self.audio_caption_projection is not None:
a_context = self.audio_caption_projection(a_context)
a_context = a_context.view(batch_size, -1, ax.shape[-1])
return [v_context, a_context], attention_mask
def _prepare_positional_embeddings(self, pixel_coords, frame_rate, x_dtype):
v_pixel_coords = pixel_coords[0]
v_pe = super()._prepare_positional_embeddings(v_pixel_coords, frame_rate, x_dtype)
a_latent_coords = pixel_coords[1]
a_pe = self._precompute_freqs_cis(
a_latent_coords,
dim=self.audio_inner_dim,
out_dtype=x_dtype,
max_pos=self.audio_positional_embedding_max_pos,
use_middle_indices_grid=self.use_middle_indices_grid,
num_attention_heads=self.audio_num_attention_heads,
)
# calculate positional embeddings for the middle of the token duration, to use in av cross attention layers.
max_pos = max(
self.positional_embedding_max_pos[0], self.audio_positional_embedding_max_pos[0]
)
v_pixel_coords = v_pixel_coords.to(torch.float32)
v_pixel_coords[:, 0] = v_pixel_coords[:, 0] * (1.0 / frame_rate)
av_cross_video_freq_cis = self._precompute_freqs_cis(
v_pixel_coords[:, 0:1, :],
dim=self.audio_cross_attention_dim,
out_dtype=x_dtype,
max_pos=[max_pos],
use_middle_indices_grid=True,
num_attention_heads=self.audio_num_attention_heads,
)
av_cross_audio_freq_cis = self._precompute_freqs_cis(
a_latent_coords[:, 0:1, :],
dim=self.audio_cross_attention_dim,
out_dtype=x_dtype,
max_pos=[max_pos],
use_middle_indices_grid=True,
num_attention_heads=self.audio_num_attention_heads,
)
return [(v_pe, av_cross_video_freq_cis), (a_pe, av_cross_audio_freq_cis)]
def _process_transformer_blocks(
self, x, context, attention_mask, timestep, pe, transformer_options={}, **kwargs
):
vx = x[0]
ax = x[1]
v_context = context[0]
a_context = context[1]
v_timestep = timestep[0]
a_timestep = timestep[1]
v_pe, av_cross_video_freq_cis = pe[0]
a_pe, av_cross_audio_freq_cis = pe[1]
(
av_ca_audio_scale_shift_timestep,
av_ca_video_scale_shift_timestep,
av_ca_a2v_gate_noise_timestep,
av_ca_v2a_gate_noise_timestep,
) = timestep[2]
"""Process transformer blocks for LTXAV."""
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
# Process transformer blocks
for i, block in enumerate(self.transformer_blocks):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(
args["img"],
v_context=args["v_context"],
a_context=args["a_context"],
attention_mask=args["attention_mask"],
v_timestep=args["v_timestep"],
a_timestep=args["a_timestep"],
v_pe=args["v_pe"],
a_pe=args["a_pe"],
v_cross_pe=args["v_cross_pe"],
a_cross_pe=args["a_cross_pe"],
v_cross_scale_shift_timestep=args["v_cross_scale_shift_timestep"],
a_cross_scale_shift_timestep=args["a_cross_scale_shift_timestep"],
v_cross_gate_timestep=args["v_cross_gate_timestep"],
a_cross_gate_timestep=args["a_cross_gate_timestep"],
transformer_options=args["transformer_options"],
)
return out
out = blocks_replace[("double_block", i)](
{
"img": (vx, ax),
"v_context": v_context,
"a_context": a_context,
"attention_mask": attention_mask,
"v_timestep": v_timestep,
"a_timestep": a_timestep,
"v_pe": v_pe,
"a_pe": a_pe,
"v_cross_pe": av_cross_video_freq_cis,
"a_cross_pe": av_cross_audio_freq_cis,
"v_cross_scale_shift_timestep": av_ca_video_scale_shift_timestep,
"a_cross_scale_shift_timestep": av_ca_audio_scale_shift_timestep,
"v_cross_gate_timestep": av_ca_a2v_gate_noise_timestep,
"a_cross_gate_timestep": av_ca_v2a_gate_noise_timestep,
"transformer_options": transformer_options,
},
{"original_block": block_wrap},
)
vx, ax = out["img"]
else:
vx, ax = block(
(vx, ax),
v_context=v_context,
a_context=a_context,
attention_mask=attention_mask,
v_timestep=v_timestep,
a_timestep=a_timestep,
v_pe=v_pe,
a_pe=a_pe,
v_cross_pe=av_cross_video_freq_cis,
a_cross_pe=av_cross_audio_freq_cis,
v_cross_scale_shift_timestep=av_ca_video_scale_shift_timestep,
a_cross_scale_shift_timestep=av_ca_audio_scale_shift_timestep,
v_cross_gate_timestep=av_ca_a2v_gate_noise_timestep,
a_cross_gate_timestep=av_ca_v2a_gate_noise_timestep,
transformer_options=transformer_options,
)
return [vx, ax]
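# Illustrative sketch (hypothetical names): how a caller could hook block i through
# transformer_options using the ("double_block", i) protocol above. The replacement
# receives the packed args plus {"original_block": block_wrap} and must return a dict
# whose "img" entry is the new (video, audio) tuple.
def _example_block_patch(args, extra):
    out = extra["original_block"](args)           # run the unmodified block first
    vx, ax = out["img"]
    out["img"] = (vx, ax)                         # post-process either stream here
    return out
# Registration (sketch):
# transformer_options.setdefault("patches_replace", {}).setdefault("dit", {})[
#     ("double_block", 0)
# ] = _example_block_patch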
def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
vx = x[0]
ax = x[1]
v_embedded_timestep = embedded_timestep[0]
a_embedded_timestep = embedded_timestep[1]
# Expand compressed video timestep if needed
if isinstance(v_embedded_timestep, CompressedTimestep):
v_embedded_timestep = v_embedded_timestep.expand()
vx = super()._process_output(vx, v_embedded_timestep, keyframe_idxs, **kwargs)
# Process audio output
a_scale_shift_values = (
self.audio_scale_shift_table[None, None].to(device=a_embedded_timestep.device, dtype=a_embedded_timestep.dtype)
+ a_embedded_timestep[:, :, None]
)
a_shift, a_scale = a_scale_shift_values[:, :, 0], a_scale_shift_values[:, :, 1]
ax = self.audio_norm_out(ax)
ax = ax * (1 + a_scale) + a_shift
ax = self.audio_proj_out(ax)
# Unpatchify audio
ax = self.a_patchifier.unpatchify(
ax, channels=self.num_audio_channels, freq=self.audio_frequency_bins
)
# Recombine audio and video
original_shape = kwargs.get("av_orig_shape")
return self.recombine_audio_and_video_latents(vx, ax, original_shape)
def forward(
self,
x,
timestep,
context,
attention_mask=None,
frame_rate=25,
transformer_options={},
keyframe_idxs=None,
**kwargs,
):
"""
Forward pass for LTXAV model.
Args:
x: Combined audio-video input tensor
timestep: Tuple of (video_timestep, audio_timestep) or single timestep
context: Context tensor (e.g., text embeddings)
attention_mask: Attention mask tensor
frame_rate: Frame rate for temporal processing
transformer_options: Additional options for transformer blocks
keyframe_idxs: Keyframe indices for temporal processing
**kwargs: Additional keyword arguments including audio_length
Returns:
Combined audio-video output tensor
"""
# Handle timestep format
if isinstance(timestep, (tuple, list)) and len(timestep) == 2:
v_timestep, a_timestep = timestep
kwargs["a_timestep"] = a_timestep
timestep = v_timestep
else:
kwargs["a_timestep"] = timestep
# Call parent forward method
return super().forward(
x,
timestep,
context,
attention_mask,
frame_rate,
transformer_options,
keyframe_idxs,
**kwargs,
)

View File

@ -0,0 +1,305 @@
import math
from typing import Optional
import comfy.ldm.common_dit
import torch
from comfy.ldm.lightricks.model import (
CrossAttention,
FeedForward,
generate_freq_grid_np,
interleaved_freqs_cis,
split_freqs_cis,
)
from torch import nn
class BasicTransformerBlock1D(nn.Module):
r"""
A basic Transformer block.
Parameters:
dim (`int`): The number of channels in the input and output.
n_heads (`int`): The number of heads to use for multi-head self-attention.
d_head (`int`): The number of channels in each attention head.
context_dim (`int`, *optional*): Accepted for interface parity with `BasicTransformerBlock`; this 1D block has no cross-attention, so it is unused.
attn_precision (*optional*): Accepted for interface parity; not forwarded to the attention layer here.
dtype, device, operations (*optional*): Standard construction arguments for the underlying linear and attention layers.
"""
def __init__(
self,
dim,
n_heads,
d_head,
context_dim=None,
attn_precision=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
# Define 2 blocks (self-attention and feed-forward). Each block has its own normalization layer.
# 1. Self-Attn
self.attn1 = CrossAttention(
query_dim=dim,
heads=n_heads,
dim_head=d_head,
context_dim=None,
dtype=dtype,
device=device,
operations=operations,
)
# 2. Feed-forward
self.ff = FeedForward(
dim,
dim_out=dim,
glu=True,
dtype=dtype,
device=device,
operations=operations,
)
def forward(self, hidden_states, attention_mask=None, pe=None) -> torch.FloatTensor:
# Notice that normalization is always applied before the real computation in the following blocks.
# 1. Normalization Before Self-Attention
norm_hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
norm_hidden_states = norm_hidden_states.squeeze(1)
# 2. Self-Attention
attn_output = self.attn1(norm_hidden_states, mask=attention_mask, pe=pe)
hidden_states = attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
# 3. Normalization before Feed-Forward
norm_hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
# 4. Feed-forward
ff_output = self.ff(norm_hidden_states)
hidden_states = ff_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
return hidden_states
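# Illustrative sketch (not the class above): the block follows a standard pre-norm
# residual layout, normalizing before each sub-layer and adding the result back onto
# the untouched hidden states; self-attention first, then the feed-forward network.
def _prenorm_block_sketch(x, attn, ff):
    x = x + attn(comfy.ldm.common_dit.rms_norm(x))   # 1. norm -> attention -> residual
    x = x + ff(comfy.ldm.common_dit.rms_norm(x))     # 2. norm -> feed-forward -> residual
    return x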
class Embeddings1DConnector(nn.Module):
_supports_gradient_checkpointing = True
def __init__(
self,
in_channels=128,
cross_attention_dim=2048,
attention_head_dim=128,
num_attention_heads=30,
num_layers=2,
positional_embedding_theta=10000.0,
positional_embedding_max_pos=[4096],
causal_temporal_positioning=False,
num_learnable_registers: Optional[int] = 128,
dtype=None,
device=None,
operations=None,
split_rope=False,
double_precision_rope=False,
**kwargs,
):
super().__init__()
self.dtype = dtype
self.out_channels = in_channels
self.num_attention_heads = num_attention_heads
self.inner_dim = num_attention_heads * attention_head_dim
self.causal_temporal_positioning = causal_temporal_positioning
self.positional_embedding_theta = positional_embedding_theta
self.positional_embedding_max_pos = positional_embedding_max_pos
self.split_rope = split_rope
self.double_precision_rope = double_precision_rope
self.transformer_1d_blocks = nn.ModuleList(
[
BasicTransformerBlock1D(
self.inner_dim,
num_attention_heads,
attention_head_dim,
context_dim=cross_attention_dim,
dtype=dtype,
device=device,
operations=operations,
)
for _ in range(num_layers)
]
)
inner_dim = num_attention_heads * attention_head_dim
self.num_learnable_registers = num_learnable_registers
if self.num_learnable_registers:
self.learnable_registers = nn.Parameter(
torch.rand(
self.num_learnable_registers, inner_dim, dtype=dtype, device=device
)
* 2.0
- 1.0
)
def get_fractional_positions(self, indices_grid):
fractional_positions = torch.stack(
[
indices_grid[:, i] / self.positional_embedding_max_pos[i]
for i in range(1)
],
dim=-1,
)
return fractional_positions
def precompute_freqs(self, indices_grid, spacing):
source_dtype = indices_grid.dtype
dtype = (
torch.float32
if source_dtype in (torch.bfloat16, torch.float16)
else source_dtype
)
fractional_positions = self.get_fractional_positions(indices_grid)
indices = (
generate_freq_grid_np(
self.positional_embedding_theta,
indices_grid.shape[1],
self.inner_dim,
)
if self.double_precision_rope
else self.generate_freq_grid(spacing, dtype, fractional_positions.device)
).to(device=fractional_positions.device)
if spacing == "exp_2":
freqs = (
(indices * fractional_positions.unsqueeze(-1))
.transpose(-1, -2)
.flatten(2)
)
else:
freqs = (
(indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
.transpose(-1, -2)
.flatten(2)
)
return freqs
def generate_freq_grid(self, spacing, dtype, device):
dim = self.inner_dim
theta = self.positional_embedding_theta
n_pos_dims = 1
n_elem = 2 * n_pos_dims  # 2 (cos and sin) per positional dimension
start = 1
end = theta
if spacing == "exp":
indices = theta ** (torch.arange(0, dim, n_elem, device="cpu", dtype=torch.float32) / (dim - n_elem))
indices = indices.to(dtype=dtype, device=device)
elif spacing == "exp_2":
indices = 1.0 / theta ** (torch.arange(0, dim, n_elem, device=device) / dim)
indices = indices.to(dtype=dtype)
elif spacing == "linear":
indices = torch.linspace(
start, end, dim // n_elem, device=device, dtype=dtype
)
elif spacing == "sqrt":
indices = torch.linspace(
start**2, end**2, dim // n_elem, device=device, dtype=dtype
).sqrt()
indices = indices * math.pi / 2
return indices
def precompute_freqs_cis(self, indices_grid, spacing="exp"):
dim = self.inner_dim
n_elem = 2 # 2 because of cos and sin
freqs = self.precompute_freqs(indices_grid, spacing)
if self.split_rope:
expected_freqs = dim // 2
current_freqs = freqs.shape[-1]
pad_size = expected_freqs - current_freqs
cos_freq, sin_freq = split_freqs_cis(
freqs, pad_size, self.num_attention_heads
)
else:
cos_freq, sin_freq = interleaved_freqs_cis(freqs, dim % n_elem)
return cos_freq.to(self.dtype), sin_freq.to(self.dtype), self.split_rope
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
):
"""
The [`Embeddings1DConnector`] forward method.
Args:
    hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, inner_dim)`):
        Input token embeddings. When learnable registers are enabled, the sequence is padded
        with tiled register tokens up to at least 1024 positions before the 1D blocks run.
    attention_mask (`torch.Tensor`, *optional*):
        Mask over the key tokens. When registers are appended it is replaced by an all-zero
        (fully attendable) mask of the padded length.
Returns:
    A tuple of the processed `hidden_states` and the (possibly replaced) `attention_mask`.
"""
# 1. Input
if self.num_learnable_registers:
num_registers_duplications = math.ceil(
max(1024, hidden_states.shape[1]) / self.num_learnable_registers
)
learnable_registers = torch.tile(
self.learnable_registers.to(hidden_states), (num_registers_duplications, 1)
)
hidden_states = torch.cat((hidden_states, learnable_registers[hidden_states.shape[1]:].unsqueeze(0).repeat(hidden_states.shape[0], 1, 1)), dim=1)
if attention_mask is not None:
attention_mask = torch.zeros([1, 1, 1, hidden_states.shape[1]], dtype=attention_mask.dtype, device=attention_mask.device)
indices_grid = torch.arange(
hidden_states.shape[1], dtype=torch.float32, device=hidden_states.device
)
indices_grid = indices_grid[None, None, :]
freqs_cis = self.precompute_freqs_cis(indices_grid)
# 2. Blocks
for block_idx, block in enumerate(self.transformer_1d_blocks):
hidden_states = block(
hidden_states, attention_mask=attention_mask, pe=freqs_cis
)
# 3. Output
# if self.output_scale is not None:
# hidden_states = hidden_states / self.output_scale
hidden_states = comfy.ldm.common_dit.rms_norm(hidden_states)
return hidden_states, attention_mask
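# Worked example (assumed sizes): with num_learnable_registers = 128 and T = 900 input
# tokens, the registers are tiled ceil(max(1024, 900) / 128) = 8 times and rows 900..1023
# of the tiled registers (124 tokens) are appended, so the connector always runs on a
# sequence padded up to a multiple of 128 that is at least 1024 tokens long.
def _register_padding_example():
    T, R = 900, 128
    dups = math.ceil(max(1024, T) / R)
    assert dups == 8 and dups * R - T == 124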

View File

@ -0,0 +1,292 @@
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
def _rational_for_scale(scale: float) -> Tuple[int, int]:
mapping = {0.75: (3, 4), 1.5: (3, 2), 2.0: (2, 1), 4.0: (4, 1)}
if float(scale) not in mapping:
raise ValueError(
f"Unsupported spatial_scale {scale}. Choose from {list(mapping.keys())}"
)
return mapping[float(scale)]
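# Usage sketch: 1.5 maps to the rational pair (3, 2), i.e. PixelShuffle upsampling by 3
# followed by an anti-aliased stride-2 blur downsample; 0.75 maps to (3, 4), a net downscale.
def _rational_scale_examples():
    assert _rational_for_scale(1.5) == (3, 2)
    assert _rational_for_scale(0.75) == (3, 4)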
class PixelShuffleND(nn.Module):
def __init__(self, dims, upscale_factors=(2, 2, 2)):
super().__init__()
assert dims in [1, 2, 3], "dims must be 1, 2, or 3"
self.dims = dims
self.upscale_factors = upscale_factors
def forward(self, x):
if self.dims == 3:
return rearrange(
x,
"b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
p1=self.upscale_factors[0],
p2=self.upscale_factors[1],
p3=self.upscale_factors[2],
)
elif self.dims == 2:
return rearrange(
x,
"b (c p1 p2) h w -> b c (h p1) (w p2)",
p1=self.upscale_factors[0],
p2=self.upscale_factors[1],
)
elif self.dims == 1:
return rearrange(
x,
"b (c p1) f h w -> b c (f p1) h w",
p1=self.upscale_factors[0],
)
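# Shape sketch (illustrative values): for dims=2 with upscale_factors=(2, 2), an input of
# shape (B, 4*C, H, W) is rearranged to (B, C, 2H, 2W); this is a pure reshuffle with no
# learned parameters.
def _pixel_shuffle_shape_example():
    shuffle = PixelShuffleND(2, upscale_factors=(2, 2))
    x = torch.randn(1, 4 * 16, 8, 8)
    assert shuffle(x).shape == (1, 16, 16, 16)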
class BlurDownsample(nn.Module):
"""
Anti-aliased spatial downsampling by integer stride using a fixed separable binomial kernel.
Applies only on H,W. Works for dims=2 or dims=3 (per-frame).
"""
def __init__(self, dims: int, stride: int):
super().__init__()
assert dims in (2, 3)
assert stride >= 1 and isinstance(stride, int)
self.dims = dims
self.stride = stride
# 5x5 separable binomial kernel [1,4,6,4,1] (outer product), normalized
k = torch.tensor([1.0, 4.0, 6.0, 4.0, 1.0])
k2d = k[:, None] @ k[None, :]
k2d = (k2d / k2d.sum()).float() # shape (5,5)
self.register_buffer("kernel", k2d[None, None, :, :]) # (1,1,5,5)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.stride == 1:
return x
def _apply_2d(x2d: torch.Tensor) -> torch.Tensor:
# x2d: (B, C, H, W)
B, C, H, W = x2d.shape
weight = self.kernel.expand(C, 1, 5, 5) # depthwise
x2d = F.conv2d(
x2d, weight=weight, bias=None, stride=self.stride, padding=2, groups=C
)
return x2d
if self.dims == 2:
return _apply_2d(x)
else:
# dims == 3: apply per-frame on H,W
b, c, f, h, w = x.shape
x = rearrange(x, "b c f h w -> (b f) c h w")
x = _apply_2d(x)
h2, w2 = x.shape[-2:]
x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f, h=h2, w=w2)
return x
class SpatialRationalResampler(nn.Module):
"""
Fully-learned rational spatial scaling: up by 'num' via PixelShuffle, then anti-aliased
downsample by 'den' using fixed blur + stride. Operates on H,W only.
For dims==3, work per-frame for spatial scaling (temporal axis untouched).
"""
def __init__(self, mid_channels: int, scale: float):
super().__init__()
self.scale = float(scale)
self.num, self.den = _rational_for_scale(self.scale)
self.conv = nn.Conv2d(
mid_channels, (self.num**2) * mid_channels, kernel_size=3, padding=1
)
self.pixel_shuffle = PixelShuffleND(2, upscale_factors=(self.num, self.num))
self.blur_down = BlurDownsample(dims=2, stride=self.den)
def forward(self, x: torch.Tensor) -> torch.Tensor:
b, c, f, h, w = x.shape
x = rearrange(x, "b c f h w -> (b f) c h w")
x = self.conv(x)
x = self.pixel_shuffle(x)
x = self.blur_down(x)
x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
return x
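# Shape sketch (illustrative values): with scale=1.5 (num=3, den=2) the H and W of a
# (B, C, F, H, W) latent end up scaled by 3/2 per frame, the frame count is untouched,
# and the channel count is preserved by the learned conv placed before the shuffle.
def _spatial_resampler_shape_example():
    resampler = SpatialRationalResampler(mid_channels=8, scale=1.5)
    x = torch.randn(1, 8, 2, 16, 16)
    assert resampler(x).shape == (1, 8, 2, 24, 24)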
class ResBlock(nn.Module):
def __init__(
self, channels: int, mid_channels: Optional[int] = None, dims: int = 3
):
super().__init__()
if mid_channels is None:
mid_channels = channels
Conv = nn.Conv2d if dims == 2 else nn.Conv3d
self.conv1 = Conv(channels, mid_channels, kernel_size=3, padding=1)
self.norm1 = nn.GroupNorm(32, mid_channels)
self.conv2 = Conv(mid_channels, channels, kernel_size=3, padding=1)
self.norm2 = nn.GroupNorm(32, channels)
self.activation = nn.SiLU()
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
x = self.conv1(x)
x = self.norm1(x)
x = self.activation(x)
x = self.conv2(x)
x = self.norm2(x)
x = self.activation(x + residual)
return x
class LatentUpsampler(nn.Module):
"""
Model to spatially upsample VAE latents.
Args:
in_channels (`int`): Number of channels in the input latent
mid_channels (`int`): Number of channels in the middle layers
num_blocks_per_stage (`int`): Number of ResBlocks to use in each stage (pre/post upsampling)
dims (`int`): Number of dimensions for convolutions (2 or 3)
spatial_upsample (`bool`): Whether to spatially upsample the latent
temporal_upsample (`bool`): Whether to temporally upsample the latent
spatial_scale (`float`): Spatial scale factor used by the rational resampler (the plain PixelShuffle path is fixed at 2x)
rational_resampler (`bool`): Use the rational up/down resampler instead of plain PixelShuffle
"""
def __init__(
self,
in_channels: int = 128,
mid_channels: int = 512,
num_blocks_per_stage: int = 4,
dims: int = 3,
spatial_upsample: bool = True,
temporal_upsample: bool = False,
spatial_scale: float = 2.0,
rational_resampler: bool = False,
):
super().__init__()
self.in_channels = in_channels
self.mid_channels = mid_channels
self.num_blocks_per_stage = num_blocks_per_stage
self.dims = dims
self.spatial_upsample = spatial_upsample
self.temporal_upsample = temporal_upsample
self.spatial_scale = float(spatial_scale)
self.rational_resampler = rational_resampler
Conv = nn.Conv2d if dims == 2 else nn.Conv3d
self.initial_conv = Conv(in_channels, mid_channels, kernel_size=3, padding=1)
self.initial_norm = nn.GroupNorm(32, mid_channels)
self.initial_activation = nn.SiLU()
self.res_blocks = nn.ModuleList(
[ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
)
if spatial_upsample and temporal_upsample:
self.upsampler = nn.Sequential(
nn.Conv3d(mid_channels, 8 * mid_channels, kernel_size=3, padding=1),
PixelShuffleND(3),
)
elif spatial_upsample:
if rational_resampler:
self.upsampler = SpatialRationalResampler(
mid_channels=mid_channels, scale=self.spatial_scale
)
else:
self.upsampler = nn.Sequential(
nn.Conv2d(mid_channels, 4 * mid_channels, kernel_size=3, padding=1),
PixelShuffleND(2),
)
elif temporal_upsample:
self.upsampler = nn.Sequential(
nn.Conv3d(mid_channels, 2 * mid_channels, kernel_size=3, padding=1),
PixelShuffleND(1),
)
else:
raise ValueError(
"Either spatial_upsample or temporal_upsample must be True"
)
self.post_upsample_res_blocks = nn.ModuleList(
[ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
)
self.final_conv = Conv(mid_channels, in_channels, kernel_size=3, padding=1)
def forward(self, latent: torch.Tensor) -> torch.Tensor:
b, c, f, h, w = latent.shape
if self.dims == 2:
x = rearrange(latent, "b c f h w -> (b f) c h w")
x = self.initial_conv(x)
x = self.initial_norm(x)
x = self.initial_activation(x)
for block in self.res_blocks:
x = block(x)
x = self.upsampler(x)
for block in self.post_upsample_res_blocks:
x = block(x)
x = self.final_conv(x)
x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
else:
x = self.initial_conv(latent)
x = self.initial_norm(x)
x = self.initial_activation(x)
for block in self.res_blocks:
x = block(x)
if self.temporal_upsample:
x = self.upsampler(x)
x = x[:, :, 1:, :, :]
else:
if isinstance(self.upsampler, SpatialRationalResampler):
x = self.upsampler(x)
else:
x = rearrange(x, "b c f h w -> (b f) c h w")
x = self.upsampler(x)
x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
for block in self.post_upsample_res_blocks:
x = block(x)
x = self.final_conv(x)
return x
@classmethod
def from_config(cls, config):
return cls(
in_channels=config.get("in_channels", 4),
mid_channels=config.get("mid_channels", 128),
num_blocks_per_stage=config.get("num_blocks_per_stage", 4),
dims=config.get("dims", 2),
spatial_upsample=config.get("spatial_upsample", True),
temporal_upsample=config.get("temporal_upsample", False),
spatial_scale=config.get("spatial_scale", 2.0),
rational_resampler=config.get("rational_resampler", False),
)
def config(self):
return {
"_class_name": "LatentUpsampler",
"in_channels": self.in_channels,
"mid_channels": self.mid_channels,
"num_blocks_per_stage": self.num_blocks_per_stage,
"dims": self.dims,
"spatial_upsample": self.spatial_upsample,
"temporal_upsample": self.temporal_upsample,
"spatial_scale": self.spatial_scale,
"rational_resampler": self.rational_resampler,
}
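# Usage sketch (hypothetical config values): build a 2x spatial upsampler over 3D latents
# via from_config and check the expected output shape; weights here are randomly
# initialized, so this only illustrates wiring and shapes, not a trained model.
def _latent_upsampler_shape_example():
    model = LatentUpsampler.from_config({
        "in_channels": 8,
        "mid_channels": 32,
        "num_blocks_per_stage": 1,
        "dims": 3,
        "spatial_upsample": True,
        "temporal_upsample": False,
    })
    latent = torch.randn(1, 8, 3, 16, 16)          # (batch, channels, frames, H, W)
    assert model(latent).shape == (1, 8, 3, 32, 32)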

View File

@ -1,14 +1,47 @@
from abc import ABC, abstractmethod
from enum import Enum
import functools
import math
from typing import Dict, Optional, Tuple
from einops import rearrange
import numpy as np
import torch
from torch import nn
import comfy.patcher_extension
import comfy.ldm.modules.attention
import comfy.ldm.common_dit
from einops import rearrange
import math
from typing import Dict, Optional, Tuple
from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
def _log_base(x, base):
return np.log(x) / np.log(base)
class LTXRopeType(str, Enum):
INTERLEAVED = "interleaved"
SPLIT = "split"
KEY = "rope_type"
@classmethod
def from_dict(cls, kwargs, default=None):
if default is None:
default = cls.INTERLEAVED
return cls(kwargs.get(cls.KEY, default))
class LTXFrequenciesPrecision(str, Enum):
FLOAT32 = "float32"
FLOAT64 = "float64"
KEY = "frequencies_precision"
@classmethod
def from_dict(cls, kwargs, default=None):
if default is None:
default = cls.FLOAT32
return cls(kwargs.get(cls.KEY, default))
def get_timestep_embedding(
timesteps: torch.Tensor,
@ -40,9 +73,7 @@ def get_timestep_embedding(
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(
start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
)
exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
exponent = exponent / (half_dim - downscale_freq_shift)
emb = torch.exp(exponent)
@ -74,7 +105,9 @@ class TimestepEmbedding(nn.Module):
post_act_fn: Optional[str] = None,
cond_proj_dim=None,
sample_proj_bias=True,
dtype=None, device=None, operations=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
@ -91,7 +124,9 @@ class TimestepEmbedding(nn.Module):
time_embed_dim_out = out_dim
else:
time_embed_dim_out = time_embed_dim
self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias, dtype=dtype, device=device)
self.linear_2 = operations.Linear(
time_embed_dim, time_embed_dim_out, sample_proj_bias, dtype=dtype, device=device
)
if post_act_fn is None:
self.post_act = None
@ -140,12 +175,22 @@ class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
"""
def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
def __init__(
self,
embedding_dim,
size_emb_dim,
use_additional_conditions: bool = False,
dtype=None,
device=None,
operations=None,
):
super().__init__()
self.outdim = size_emb_dim
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, device=device, operations=operations)
self.timestep_embedder = TimestepEmbedding(
in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, device=device, operations=operations
)
def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
timesteps_proj = self.time_proj(timestep)
@ -164,15 +209,22 @@ class AdaLayerNormSingle(nn.Module):
use_additional_conditions (`bool`): To use additional conditions for normalization or not.
"""
def __init__(self, embedding_dim: int, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
def __init__(
self, embedding_dim: int, embedding_coefficient: int = 6, use_additional_conditions: bool = False, dtype=None, device=None, operations=None
):
super().__init__()
self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions, dtype=dtype, device=device, operations=operations
embedding_dim,
size_emb_dim=embedding_dim // 3,
use_additional_conditions=use_additional_conditions,
dtype=dtype,
device=device,
operations=operations,
)
self.silu = nn.SiLU()
self.linear = operations.Linear(embedding_dim, 6 * embedding_dim, bias=True, dtype=dtype, device=device)
self.linear = operations.Linear(embedding_dim, embedding_coefficient * embedding_dim, bias=True, dtype=dtype, device=device)
def forward(
self,
@ -186,6 +238,7 @@ class AdaLayerNormSingle(nn.Module):
embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
return self.linear(self.silu(embedded_timestep)), embedded_timestep
class PixArtAlphaTextProjection(nn.Module):
"""
Projects caption embeddings. Also handles dropout for classifier-free guidance.
@ -193,18 +246,24 @@ class PixArtAlphaTextProjection(nn.Module):
Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
"""
def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh", dtype=None, device=None, operations=None):
def __init__(
self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh", dtype=None, device=None, operations=None
):
super().__init__()
if out_features is None:
out_features = hidden_size
self.linear_1 = operations.Linear(in_features=in_features, out_features=hidden_size, bias=True, dtype=dtype, device=device)
self.linear_1 = operations.Linear(
in_features=in_features, out_features=hidden_size, bias=True, dtype=dtype, device=device
)
if act_fn == "gelu_tanh":
self.act_1 = nn.GELU(approximate="tanh")
elif act_fn == "silu":
self.act_1 = nn.SiLU()
else:
raise ValueError(f"Unknown activation function: {act_fn}")
self.linear_2 = operations.Linear(in_features=hidden_size, out_features=out_features, bias=True, dtype=dtype, device=device)
self.linear_2 = operations.Linear(
in_features=hidden_size, out_features=out_features, bias=True, dtype=dtype, device=device
)
def forward(self, caption):
hidden_states = self.linear_1(caption)
@ -223,25 +282,28 @@ class GELU_approx(nn.Module):
class FeedForward(nn.Module):
def __init__(self, dim, dim_out, mult=4, glu=False, dropout=0., dtype=None, device=None, operations=None):
def __init__(self, dim, dim_out, mult=4, glu=False, dropout=0.0, dtype=None, device=None, operations=None):
super().__init__()
inner_dim = int(dim * mult)
project_in = GELU_approx(dim, inner_dim, dtype=dtype, device=device, operations=operations)
self.net = nn.Sequential(
project_in,
nn.Dropout(dropout),
operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
project_in, nn.Dropout(dropout), operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
)
def forward(self, x):
return self.net(x)
def apply_rotary_emb(input_tensor, freqs_cis):
cos_freqs, sin_freqs = freqs_cis[0], freqs_cis[1]
split_pe = freqs_cis[2] if len(freqs_cis) > 2 else False
return (
apply_split_rotary_emb(input_tensor, cos_freqs, sin_freqs)
if split_pe else
apply_interleaved_rotary_emb(input_tensor, cos_freqs, sin_freqs)
)
def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and pick the best/fastest one
cos_freqs = freqs_cis[0]
sin_freqs = freqs_cis[1]
def apply_interleaved_rotary_emb(input_tensor, cos_freqs, sin_freqs): # TODO: remove duplicate funcs and pick the best/fastest one
t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2)
t1, t2 = t_dup.unbind(dim=-1)
t_dup = torch.stack((-t2, t1), dim=-1)
@ -251,9 +313,37 @@ def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and
return out
def apply_split_rotary_emb(input_tensor, cos, sin):
needs_reshape = False
if input_tensor.ndim != 4 and cos.ndim == 4:
B, H, T, _ = cos.shape
input_tensor = input_tensor.reshape(B, T, H, -1).swapaxes(1, 2)
needs_reshape = True
split_input = rearrange(input_tensor, "... (d r) -> ... d r", d=2)
first_half_input = split_input[..., :1, :]
second_half_input = split_input[..., 1:, :]
output = split_input * cos.unsqueeze(-2)
first_half_output = output[..., :1, :]
second_half_output = output[..., 1:, :]
first_half_output.addcmul_(-sin.unsqueeze(-2), second_half_input)
second_half_output.addcmul_(sin.unsqueeze(-2), first_half_input)
output = rearrange(output, "... d r -> ... (d r)")
return output.swapaxes(1, 2).reshape(B, T, -1) if needs_reshape else output
class CrossAttention(nn.Module):
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None):
def __init__(
self,
query_dim,
context_dim=None,
heads=8,
dim_head=64,
dropout=0.0,
attn_precision=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
inner_dim = dim_head * heads
context_dim = query_dim if context_dim is None else context_dim
@ -269,9 +359,11 @@ class CrossAttention(nn.Module):
self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
self.to_v = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))
self.to_out = nn.Sequential(
operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout)
)
def forward(self, x, context=None, mask=None, pe=None, transformer_options={}):
def forward(self, x, context=None, mask=None, pe=None, k_pe=None, transformer_options={}):
q = self.to_q(x)
context = x if context is None else context
k = self.to_k(context)
@ -282,7 +374,7 @@ class CrossAttention(nn.Module):
if pe is not None:
q = apply_rotary_emb(q, pe)
k = apply_rotary_emb(k, pe)
k = apply_rotary_emb(k, pe if k_pe is None else k_pe)
if mask is None:
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
@ -292,146 +384,495 @@ class CrossAttention(nn.Module):
class BasicTransformerBlock(nn.Module):
def __init__(self, dim, n_heads, d_head, context_dim=None, attn_precision=None, dtype=None, device=None, operations=None):
def __init__(
self, dim, n_heads, d_head, context_dim=None, attn_precision=None, dtype=None, device=None, operations=None
):
super().__init__()
self.attn_precision = attn_precision
self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, context_dim=None, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)
self.attn1 = CrossAttention(
query_dim=dim,
heads=n_heads,
dim_head=d_head,
context_dim=None,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.ff = FeedForward(dim, dim_out=dim, glu=True, dtype=dtype, device=device, operations=operations)
self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)
self.attn2 = CrossAttention(
query_dim=dim,
context_dim=context_dim,
heads=n_heads,
dim_head=d_head,
attn_precision=self.attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
self.scale_shift_table = nn.Parameter(torch.empty(6, dim, device=device, dtype=dtype))
def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}):
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2)
x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa
attn1_input = comfy.ldm.common_dit.rms_norm(x)
attn1_input = torch.addcmul(attn1_input, attn1_input, scale_msa).add_(shift_msa)
attn1_input = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options)
x.addcmul_(attn1_input, gate_msa)
del attn1_input
x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options)
y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp
x += self.ff(y) * gate_mlp
y = comfy.ldm.common_dit.rms_norm(x)
y = torch.addcmul(y, y, scale_mlp).add_(shift_mlp)
x.addcmul_(self.ff(y), gate_mlp)
return x
def get_fractional_positions(indices_grid, max_pos):
n_pos_dims = indices_grid.shape[1]
assert n_pos_dims == len(max_pos), f'Number of position dimensions ({n_pos_dims}) must match max_pos length ({len(max_pos)})'
fractional_positions = torch.stack(
[
indices_grid[:, i] / max_pos[i]
for i in range(3)
],
dim=-1,
[indices_grid[:, i] / max_pos[i] for i in range(n_pos_dims)],
axis=-1,
)
return fractional_positions
def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[20, 2048, 2048]):
dtype = torch.float32 #self.dtype
fractional_positions = get_fractional_positions(indices_grid, max_pos)
@functools.lru_cache(maxsize=5)
def generate_freq_grid_np(positional_embedding_theta, positional_embedding_max_pos_count, inner_dim, _=None):  # trailing arg is a device placeholder so both freq-grid generators share a call signature
theta = positional_embedding_theta
start = 1
end = theta
device = fractional_positions.device
n_elem = 2 * positional_embedding_max_pos_count
pow_indices = np.power(
theta,
np.linspace(
_log_base(start, theta),
_log_base(end, theta),
inner_dim // n_elem,
dtype=np.float64,
),
)
return torch.tensor(pow_indices * math.pi / 2, dtype=torch.float32)
def generate_freq_grid_pytorch(positional_embedding_theta, positional_embedding_max_pos_count, inner_dim, device):
theta = positional_embedding_theta
start = 1
end = theta
n_elem = 2 * positional_embedding_max_pos_count
indices = theta ** (
torch.linspace(
math.log(start, theta),
math.log(end, theta),
dim // 6,
inner_dim // n_elem,
device=device,
dtype=dtype,
dtype=torch.float32,
)
)
indices = indices.to(dtype=dtype)
indices = indices.to(dtype=torch.float32)
indices = indices * math.pi / 2
return indices
def generate_freqs(indices, indices_grid, max_pos, use_middle_indices_grid):
if use_middle_indices_grid:
assert len(indices_grid.shape) == 4 and indices_grid.shape[-1] == 2
indices_grid_start, indices_grid_end = indices_grid[..., 0], indices_grid[..., 1]
indices_grid = (indices_grid_start + indices_grid_end) / 2.0
elif len(indices_grid.shape) == 4:
indices_grid = indices_grid[..., 0]
# Get fractional positions and compute frequency indices
fractional_positions = get_fractional_positions(indices_grid, max_pos)
indices = indices.to(device=fractional_positions.device)
freqs = (
(indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
.transpose(-1, -2)
.flatten(2)
)
return freqs
def interleaved_freqs_cis(freqs, pad_size):
cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
if dim % 6 != 0:
cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
if pad_size != 0:
cos_padding = torch.ones_like(cos_freq[:, :, : pad_size])
sin_padding = torch.zeros_like(cos_freq[:, :, : pad_size])
cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
return cos_freq.to(out_dtype), sin_freq.to(out_dtype)
return cos_freq, sin_freq
def split_freqs_cis(freqs, pad_size, num_attention_heads):
cos_freq = freqs.cos()
sin_freq = freqs.sin()
class LTXVModel(torch.nn.Module):
def __init__(self,
in_channels=128,
cross_attention_dim=2048,
attention_head_dim=64,
num_attention_heads=32,
if pad_size != 0:
cos_padding = torch.ones_like(cos_freq[:, :, :pad_size])
sin_padding = torch.zeros_like(sin_freq[:, :, :pad_size])
caption_channels=4096,
num_layers=28,
cos_freq = torch.concatenate([cos_padding, cos_freq], axis=-1)
sin_freq = torch.concatenate([sin_padding, sin_freq], axis=-1)
# Reshape freqs to be compatible with multi-head attention
B, T, half_HD = cos_freq.shape
positional_embedding_theta=10000.0,
positional_embedding_max_pos=[20, 2048, 2048],
causal_temporal_positioning=False,
vae_scale_factors=(8, 32, 32),
dtype=None, device=None, operations=None, **kwargs):
cos_freq = cos_freq.reshape(B, T, num_attention_heads, half_HD // num_attention_heads)
sin_freq = sin_freq.reshape(B, T, num_attention_heads, half_HD // num_attention_heads)
cos_freq = torch.swapaxes(cos_freq, 1, 2) # (B,H,T,D//2)
sin_freq = torch.swapaxes(sin_freq, 1, 2) # (B,H,T,D//2)
return cos_freq, sin_freq
class LTXBaseModel(torch.nn.Module, ABC):
"""
Abstract base class for LTX models (Lightricks Transformer models).
This class defines the common interface and shared functionality for all LTX models,
including LTXV (video) and LTXAV (audio-video) variants.
"""
def __init__(
self,
in_channels: int,
cross_attention_dim: int,
attention_head_dim: int,
num_attention_heads: int,
caption_channels: int,
num_layers: int,
positional_embedding_theta: float = 10000.0,
positional_embedding_max_pos: list = [20, 2048, 2048],
causal_temporal_positioning: bool = False,
vae_scale_factors: tuple = (8, 32, 32),
use_middle_indices_grid=False,
timestep_scale_multiplier=1000.0,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.generator = None
self.vae_scale_factors = vae_scale_factors
self.use_middle_indices_grid = use_middle_indices_grid
self.dtype = dtype
self.out_channels = in_channels
self.inner_dim = num_attention_heads * attention_head_dim
self.in_channels = in_channels
self.cross_attention_dim = cross_attention_dim
self.attention_head_dim = attention_head_dim
self.num_attention_heads = num_attention_heads
self.caption_channels = caption_channels
self.num_layers = num_layers
self.positional_embedding_theta = positional_embedding_theta
self.positional_embedding_max_pos = positional_embedding_max_pos
self.split_positional_embedding = LTXRopeType.from_dict(kwargs)
self.freq_grid_generator = (
generate_freq_grid_np if LTXFrequenciesPrecision.from_dict(kwargs) == LTXFrequenciesPrecision.FLOAT64
else generate_freq_grid_pytorch
)
self.causal_temporal_positioning = causal_temporal_positioning
self.operations = operations
self.timestep_scale_multiplier = timestep_scale_multiplier
self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
# Common dimensions
self.inner_dim = num_attention_heads * attention_head_dim
self.out_channels = in_channels
# Initialize common components
self._init_common_components(device, dtype)
# Initialize model-specific components
self._init_model_components(device, dtype, **kwargs)
# Initialize transformer blocks
self._init_transformer_blocks(device, dtype, **kwargs)
# Initialize output components
self._init_output_components(device, dtype)
def _init_common_components(self, device, dtype):
"""Initialize components common to all LTX models
- patchify_proj: Linear projection for patchifying input
- adaln_single: AdaLN layer for timestep embedding
- caption_projection: Linear projection for caption embedding
"""
self.patchify_proj = self.operations.Linear(
self.in_channels, self.inner_dim, bias=True, dtype=dtype, device=device
)
self.adaln_single = AdaLayerNormSingle(
self.inner_dim, use_additional_conditions=False, dtype=dtype, device=device, operations=operations
self.inner_dim, use_additional_conditions=False, dtype=dtype, device=device, operations=self.operations
)
# self.adaln_single.linear = operations.Linear(self.inner_dim, 4 * self.inner_dim, bias=True, dtype=dtype, device=device)
self.caption_projection = PixArtAlphaTextProjection(
in_features=caption_channels, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations
in_features=self.caption_channels,
hidden_size=self.inner_dim,
dtype=dtype,
device=device,
operations=self.operations,
)
@abstractmethod
def _init_model_components(self, device, dtype, **kwargs):
"""Initialize model-specific components. Must be implemented by subclasses."""
pass
@abstractmethod
def _init_transformer_blocks(self, device, dtype, **kwargs):
"""Initialize transformer blocks. Must be implemented by subclasses."""
pass
@abstractmethod
def _init_output_components(self, device, dtype):
"""Initialize output components. Must be implemented by subclasses."""
pass
@abstractmethod
def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
"""Process input data. Must be implemented by subclasses."""
pass
@abstractmethod
def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, **kwargs):
"""Process transformer blocks. Must be implemented by subclasses."""
pass
@abstractmethod
def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
"""Process output data. Must be implemented by subclasses."""
pass
def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
"""Prepare timestep embeddings."""
grid_mask = kwargs.get("grid_mask", None)
if grid_mask is not None:
timestep = timestep[:, grid_mask]
timestep = timestep * self.timestep_scale_multiplier
timestep, embedded_timestep = self.adaln_single(
timestep.flatten(),
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
# Second dimension is 1 or number of tokens (if timestep_per_token)
timestep = timestep.view(batch_size, -1, timestep.shape[-1])
embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.shape[-1])
return timestep, embedded_timestep
def _prepare_context(self, context, batch_size, x, attention_mask=None):
"""Prepare context for transformer blocks."""
if self.caption_projection is not None:
context = self.caption_projection(context)
context = context.view(batch_size, -1, x.shape[-1])
return context, attention_mask
def _precompute_freqs_cis(
self,
indices_grid,
dim,
out_dtype,
theta=10000.0,
max_pos=[20, 2048, 2048],
use_middle_indices_grid=False,
num_attention_heads=32,
):
split_mode = self.split_positional_embedding == LTXRopeType.SPLIT
indices = self.freq_grid_generator(theta, indices_grid.shape[1], dim, indices_grid.device)
freqs = generate_freqs(indices, indices_grid, max_pos, use_middle_indices_grid)
if split_mode:
expected_freqs = dim // 2
current_freqs = freqs.shape[-1]
pad_size = expected_freqs - current_freqs
cos_freq, sin_freq = split_freqs_cis(freqs, pad_size, num_attention_heads)
else:
# 2 (cos and sin) per positional dimension: 3 dims for (t, x, y), or 1 for temporal-only grids
n_elem = 2 * indices_grid.shape[1]
cos_freq, sin_freq = interleaved_freqs_cis(freqs, dim % n_elem)
return cos_freq.to(out_dtype), sin_freq.to(out_dtype), split_mode
def _prepare_positional_embeddings(self, pixel_coords, frame_rate, x_dtype):
"""Prepare positional embeddings."""
fractional_coords = pixel_coords.to(torch.float32)
fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
pe = self._precompute_freqs_cis(
fractional_coords,
dim=self.inner_dim,
out_dtype=x_dtype,
max_pos=self.positional_embedding_max_pos,
use_middle_indices_grid=self.use_middle_indices_grid,
num_attention_heads=self.num_attention_heads,
)
return pe
def _prepare_attention_mask(self, attention_mask, x_dtype):
"""Prepare attention mask."""
if attention_mask is not None and not torch.is_floating_point(attention_mask):
attention_mask = (attention_mask - 1).to(x_dtype).reshape(
(attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
) * torch.finfo(x_dtype).max
return attention_mask
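# Worked example (standalone sketch): a 0/1 key mask becomes an additive bias, so kept
# positions (1) map to 0.0 and masked positions (0) map to -finfo.max, which the attention
# softmax then treats as "discard".
def _mask_bias_example():
    mask = torch.tensor([[1, 1, 0, 0]])
    bias = (mask - 1).to(torch.float32).reshape((1, 1, -1, mask.shape[-1])) * torch.finfo(torch.float32).max
    assert bias.shape == (1, 1, 1, 4) and bias[0, 0, 0, 0] == 0.0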
def forward(
self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, denoise_mask=None, **kwargs
):
"""
Forward pass for LTX models.
Args:
x: Input tensor
timestep: Timestep tensor
context: Context tensor (e.g., text embeddings)
attention_mask: Attention mask tensor
frame_rate: Frame rate for temporal processing
transformer_options: Additional options for transformer blocks
keyframe_idxs: Keyframe indices for temporal processing
denoise_mask: Optional denoise mask; tokens whose mask values are negative are dropped from the token grid
**kwargs: Additional keyword arguments
Returns:
Processed output tensor
"""
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,
self,
comfy.patcher_extension.get_all_wrappers(
comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options
),
).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, denoise_mask=denoise_mask, **kwargs)
def _forward(
self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, denoise_mask=None, **kwargs
):
"""
Internal forward pass for LTX models.
Args:
x: Input tensor
timestep: Timestep tensor
context: Context tensor (e.g., text embeddings)
attention_mask: Attention mask tensor
frame_rate: Frame rate for temporal processing
transformer_options: Additional options for transformer blocks
keyframe_idxs: Keyframe indices for temporal processing
**kwargs: Additional keyword arguments
Returns:
Processed output tensor
"""
if isinstance(x, list):
input_dtype = x[0].dtype
batch_size = x[0].shape[0]
else:
input_dtype = x.dtype
batch_size = x.shape[0]
# Process input
merged_args = {**transformer_options, **kwargs}
x, pixel_coords, additional_args = self._process_input(x, keyframe_idxs, denoise_mask, **merged_args)
merged_args.update(additional_args)
# Prepare timestep and context
timestep, embedded_timestep = self._prepare_timestep(timestep, batch_size, input_dtype, **merged_args)
context, attention_mask = self._prepare_context(context, batch_size, x, attention_mask)
# Prepare attention mask and positional embeddings
attention_mask = self._prepare_attention_mask(attention_mask, input_dtype)
pe = self._prepare_positional_embeddings(pixel_coords, frame_rate, input_dtype)
# Process transformer blocks
x = self._process_transformer_blocks(
x, context, attention_mask, timestep, pe, transformer_options=transformer_options, **merged_args
)
# Process output
x = self._process_output(x, embedded_timestep, keyframe_idxs, **merged_args)
return x
class LTXVModel(LTXBaseModel):
"""LTXV model for video generation."""
def __init__(
self,
in_channels=128,
cross_attention_dim=2048,
attention_head_dim=64,
num_attention_heads=32,
caption_channels=4096,
num_layers=28,
positional_embedding_theta=10000.0,
positional_embedding_max_pos=[20, 2048, 2048],
causal_temporal_positioning=False,
vae_scale_factors=(8, 32, 32),
use_middle_indices_grid=False,
timestep_scale_multiplier=1000.0,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__(
in_channels=in_channels,
cross_attention_dim=cross_attention_dim,
attention_head_dim=attention_head_dim,
num_attention_heads=num_attention_heads,
caption_channels=caption_channels,
num_layers=num_layers,
positional_embedding_theta=positional_embedding_theta,
positional_embedding_max_pos=positional_embedding_max_pos,
causal_temporal_positioning=causal_temporal_positioning,
vae_scale_factors=vae_scale_factors,
use_middle_indices_grid=use_middle_indices_grid,
timestep_scale_multiplier=timestep_scale_multiplier,
dtype=dtype,
device=device,
operations=operations,
**kwargs,
)
def _init_model_components(self, device, dtype, **kwargs):
"""Initialize LTXV-specific components."""
# No additional components needed for LTXV beyond base class
pass
def _init_transformer_blocks(self, device, dtype, **kwargs):
"""Initialize transformer blocks for LTXV."""
self.transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
self.inner_dim,
num_attention_heads,
attention_head_dim,
context_dim=cross_attention_dim,
# attn_precision=attn_precision,
dtype=dtype, device=device, operations=operations
self.num_attention_heads,
self.attention_head_dim,
context_dim=self.cross_attention_dim,
dtype=dtype,
device=device,
operations=self.operations,
)
for d in range(num_layers)
for _ in range(self.num_layers)
]
)
def _init_output_components(self, device, dtype):
"""Initialize output components for LTXV."""
self.scale_shift_table = nn.Parameter(torch.empty(2, self.inner_dim, dtype=dtype, device=device))
self.norm_out = operations.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.proj_out = operations.Linear(self.inner_dim, self.out_channels, dtype=dtype, device=device)
self.patchifier = SymmetricPatchifier(1)
def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,
self,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, **kwargs)
def _forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
patches_replace = transformer_options.get("patches_replace", {})
orig_shape = list(x.shape)
self.norm_out = self.operations.LayerNorm(
self.inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device
)
self.proj_out = self.operations.Linear(self.inner_dim, self.out_channels, dtype=dtype, device=device)
self.patchifier = SymmetricPatchifier(1, start_end=True)
def _process_input(self, x, keyframe_idxs, denoise_mask, **kwargs):
"""Process input for LTXV."""
additional_args = {"orig_shape": list(x.shape)}
x, latent_coords = self.patchifier.patchify(x)
pixel_coords = latent_to_pixel_coords(
latent_coords=latent_coords,
@ -439,44 +880,30 @@ class LTXVModel(torch.nn.Module):
causal_fix=self.causal_temporal_positioning,
)
grid_mask = None
if keyframe_idxs is not None:
pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
additional_args.update({ "orig_patchified_shape": list(x.shape)})
denoise_mask = self.patchifier.patchify(denoise_mask)[0]
grid_mask = ~torch.any(denoise_mask < 0, dim=-1)[0]
additional_args.update({"grid_mask": grid_mask})
x = x[:, grid_mask, :]
pixel_coords = pixel_coords[:, :, grid_mask, ...]
fractional_coords = pixel_coords.to(torch.float32)
fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
kf_grid_mask = grid_mask[-keyframe_idxs.shape[2]:]
keyframe_idxs = keyframe_idxs[..., kf_grid_mask, :]
pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
x = self.patchify_proj(x)
timestep = timestep * 1000.0
if attention_mask is not None and not torch.is_floating_point(attention_mask):
attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
batch_size = x.shape[0]
timestep, embedded_timestep = self.adaln_single(
timestep.flatten(),
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=x.dtype,
)
# Second dimension is 1 or number of tokens (if timestep_per_token)
timestep = timestep.view(batch_size, -1, timestep.shape[-1])
embedded_timestep = embedded_timestep.view(
batch_size, -1, embedded_timestep.shape[-1]
)
# 2. Blocks
if self.caption_projection is not None:
batch_size = x.shape[0]
context = self.caption_projection(context)
context = context.view(
batch_size, -1, x.shape[-1]
)
return x, pixel_coords, additional_args
def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, transformer_options={}, **kwargs):
"""Process transformer blocks for LTXV."""
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.transformer_blocks):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"], transformer_options=args["transformer_options"])
@ -494,16 +921,28 @@ class LTXVModel(torch.nn.Module):
transformer_options=transformer_options,
)
# 3. Output
return x
def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
"""Process output for LTXV."""
# Apply scale-shift modulation
scale_shift_values = (
self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + embedded_timestep[:, :, None]
)
shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
x = self.norm_out(x)
# Modulation
x = x * (1 + scale) + shift
x = self.proj_out(x)
if keyframe_idxs is not None:
grid_mask = kwargs["grid_mask"]
orig_patchified_shape = kwargs["orig_patchified_shape"]
full_x = torch.zeros(orig_patchified_shape, dtype=x.dtype, device=x.device)
full_x[:, grid_mask, :] = x
x = full_x
# Unpatchify to restore original dimensions
orig_shape = kwargs["orig_shape"]
x = self.patchifier.unpatchify(
latents=x,
output_height=orig_shape[3],

View File

@ -21,20 +21,23 @@ def latent_to_pixel_coords(
Returns:
Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
"""
shape = [1] * latent_coords.ndim
shape[1] = -1
pixel_coords = (
latent_coords
* torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
* torch.tensor(scale_factors, device=latent_coords.device).view(*shape)
)
if causal_fix:
# Fix temporal scale for first frame to 1 due to causality
pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
pixel_coords[:, 0, ...] = (pixel_coords[:, 0, ...] + 1 - scale_factors[0]).clamp(min=0)
return pixel_coords
class Patchifier(ABC):
def __init__(self, patch_size: int):
def __init__(self, patch_size: int, start_end: bool=False):
super().__init__()
self._patch_size = (1, patch_size, patch_size)
self.start_end = start_end
@abstractmethod
def patchify(
@ -71,11 +74,23 @@ class Patchifier(ABC):
torch.arange(0, latent_width, self._patch_size[2], device=device),
indexing="ij",
)
latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
latent_coords = rearrange(
latent_coords, "b c f h w -> b c (f h w)", b=batch_size
latent_sample_coords_start = torch.stack(latent_sample_coords, dim=0)
delta = torch.tensor(self._patch_size, device=latent_sample_coords_start.device, dtype=latent_sample_coords_start.dtype)[:, None, None, None]
latent_sample_coords_end = latent_sample_coords_start + delta
latent_sample_coords_start = latent_sample_coords_start.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
latent_sample_coords_start = rearrange(
latent_sample_coords_start, "b c f h w -> b c (f h w)", b=batch_size
)
if self.start_end:
latent_sample_coords_end = latent_sample_coords_end.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
latent_sample_coords_end = rearrange(
latent_sample_coords_end, "b c f h w -> b c (f h w)", b=batch_size
)
latent_coords = torch.stack((latent_sample_coords_start, latent_sample_coords_end), dim=-1)
else:
latent_coords = latent_sample_coords_start
return latent_coords
@ -115,3 +130,61 @@ class SymmetricPatchifier(Patchifier):
q=self._patch_size[2],
)
return latents
class AudioPatchifier(Patchifier):
def __init__(self, patch_size: int,
sample_rate=16000,
hop_length=160,
audio_latent_downsample_factor=4,
is_causal=True,
start_end=False,
shift=0,
):
super().__init__(patch_size, start_end=start_end)
self.hop_length = hop_length
self.sample_rate = sample_rate
self.audio_latent_downsample_factor = audio_latent_downsample_factor
self.is_causal = is_causal
self.shift = shift
def copy_with_shift(self, shift):
return AudioPatchifier(
self.patch_size, self.sample_rate, self.hop_length, self.audio_latent_downsample_factor,
self.is_causal, self.start_end, shift
)
def _get_audio_latent_time_in_sec(self, start_latent: int, end_latent: int, dtype: torch.dtype, device: torch.device):
audio_latent_frame = torch.arange(start_latent, end_latent, dtype=dtype, device=device)
audio_mel_frame = audio_latent_frame * self.audio_latent_downsample_factor
if self.is_causal:
audio_mel_frame = (audio_mel_frame + 1 - self.audio_latent_downsample_factor).clip(min=0)
return audio_mel_frame * self.hop_length / self.sample_rate
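# Worked example (using the defaults above): with sample_rate=16000, hop_length=160 and
# audio_latent_downsample_factor=4, one latent frame spans 4 mel frames = 640 samples = 0.04 s.
# With is_causal=True the first latent is pinned to t=0:
#   latent index k -> mel frame max(4*k - 3, 0) -> seconds max(4*k - 3, 0) * 160 / 16000
# so latents 0, 1, 2, 3 start at 0.00, 0.01, 0.05, 0.09 seconds respectively.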
def patchify(self, audio_latents: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
# audio_latents: (batch, channels, time, freq)
b, _, t, _ = audio_latents.shape
audio_latents = rearrange(
audio_latents,
"b c t f -> b t (c f)",
)
audio_latents_start_timings = self._get_audio_latent_time_in_sec(self.shift, t + self.shift, torch.float32, audio_latents.device)
audio_latents_start_timings = audio_latents_start_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
if self.start_end:
audio_latents_end_timings = self._get_audio_latent_time_in_sec(self.shift + 1, t + self.shift + 1, torch.float32, audio_latents.device)
audio_latents_end_timings = audio_latents_end_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
audio_latents_timings = torch.stack([audio_latents_start_timings, audio_latents_end_timings], dim=-1)
else:
audio_latents_timings = audio_latents_start_timings
return audio_latents, audio_latents_timings
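# Illustrative sketch (values assumed): patchify folds the channel/frequency axes into one
# feature axis and returns per-token start times in seconds.
ap = AudioPatchifier(patch_size=1)                 # defaults: 16 kHz, hop 160, downsample 4
latents = torch.randn(2, 8, 25, 16)                # (batch, channels, time, freq)
tokens, timings = ap.patchify(latents)
# tokens:  (2, 25, 8 * 16) -- "b c t f -> b t (c f)"
# timings: (2, 1, 25), start time of each latent frame; with start_end=True it would be
#          (2, 1, 25, 2) carrying [start, end] pairs.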
def unpatchify(self, audio_latents: torch.Tensor, channels: int, freq: int) -> torch.Tensor:
# audio_latents: (batch, time, freq * channels)
audio_latents = rearrange(
audio_latents, "b t (c f) -> b c t f", c=channels, f=freq
)
return audio_latents


@@ -0,0 +1,279 @@
import json
from dataclasses import dataclass
import math
import torch
import torchaudio
import comfy.model_management
import comfy.model_patcher
import comfy.utils as utils
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
CausalityAxis,
CausalAudioAutoencoder,
)
from comfy.ldm.lightricks.vocoders.vocoder import Vocoder
LATENT_DOWNSAMPLE_FACTOR = 4
@dataclass(frozen=True)
class AudioVAEComponentConfig:
"""Container for model component configuration extracted from metadata."""
autoencoder: dict
vocoder: dict
@classmethod
def from_metadata(cls, metadata: dict) -> "AudioVAEComponentConfig":
assert metadata is not None and "config" in metadata, "Metadata is required for audio VAE"
raw_config = metadata["config"]
if isinstance(raw_config, str):
parsed_config = json.loads(raw_config)
else:
parsed_config = raw_config
audio_config = parsed_config.get("audio_vae")
vocoder_config = parsed_config.get("vocoder")
assert audio_config is not None, "Audio VAE config is required for audio VAE"
assert vocoder_config is not None, "Vocoder config is required for audio VAE"
return cls(autoencoder=audio_config, vocoder=vocoder_config)
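# Illustrative sketch (keys taken from the code above, field values made up): the checkpoint
# metadata carries the component configs under "config", either as a dict or a JSON string,
# with separate "audio_vae" and "vocoder" sections.
metadata = {
    "config": json.dumps({
        "audio_vae": {"sampling_rate": 16000, "mel_bins": 64},  # hypothetical fields
        "vocoder": {"upsample_factor": 256},                    # hypothetical fields
    })
}
cfg = AudioVAEComponentConfig.from_metadata(metadata)
assert cfg.autoencoder["sampling_rate"] == 16000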
class ModelDeviceManager:
"""Manages device placement and GPU residency for the composed model."""
def __init__(self, module: torch.nn.Module):
load_device = comfy.model_management.get_torch_device()
offload_device = comfy.model_management.vae_offload_device()
self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
def ensure_model_loaded(self) -> None:
comfy.model_management.free_memory(
self.patcher.model_size(),
self.patcher.load_device,
)
comfy.model_management.load_model_gpu(self.patcher)
def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(self.patcher.load_device)
@property
def load_device(self):
return self.patcher.load_device
class AudioLatentNormalizer:
"""Applies per-channel statistics in patch space and restores original layout."""
def __init__(self, patchifier: AudioPatchifier, statistics_processor: torch.nn.Module):
self.patchifier = patchifier
self.statistics = statistics_processor
def normalize(self, latents: torch.Tensor) -> torch.Tensor:
channels = latents.shape[1]
freq = latents.shape[3]
patched, _ = self.patchifier.patchify(latents)
normalized = self.statistics.normalize(patched)
return self.patchifier.unpatchify(normalized, channels=channels, freq=freq)
def denormalize(self, latents: torch.Tensor) -> torch.Tensor:
channels = latents.shape[1]
freq = latents.shape[3]
patched, _ = self.patchifier.patchify(latents)
denormalized = self.statistics.un_normalize(patched)
return self.patchifier.unpatchify(denormalized, channels=channels, freq=freq)
class AudioPreprocessor:
"""Prepares raw waveforms for the autoencoder by matching training conditions."""
def __init__(self, target_sample_rate: int, mel_bins: int, mel_hop_length: int, n_fft: int):
self.target_sample_rate = target_sample_rate
self.mel_bins = mel_bins
self.mel_hop_length = mel_hop_length
self.n_fft = n_fft
def resample(self, waveform: torch.Tensor, source_rate: int) -> torch.Tensor:
if source_rate == self.target_sample_rate:
return waveform
return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate)
def waveform_to_mel(
self, waveform: torch.Tensor, waveform_sample_rate: int, device
) -> torch.Tensor:
waveform = self.resample(waveform, waveform_sample_rate)
mel_transform = torchaudio.transforms.MelSpectrogram(
sample_rate=self.target_sample_rate,
n_fft=self.n_fft,
win_length=self.n_fft,
hop_length=self.mel_hop_length,
f_min=0.0,
f_max=self.target_sample_rate / 2.0,
n_mels=self.mel_bins,
window_fn=torch.hann_window,
center=True,
pad_mode="reflect",
power=1.0,
mel_scale="slaney",
norm="slaney",
).to(device)
mel = mel_transform(waveform)
mel = torch.log(torch.clamp(mel, min=1e-5))
return mel.permute(0, 1, 3, 2).contiguous()
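# Worked example: torchaudio's MelSpectrogram with center=True produces T // hop_length + 1
# frames, so 1 s of 16 kHz audio with hop_length=160 yields 101 mel frames. After the permute
# the log-mel tensor is (batch, channels, frames, mel_bins), i.e. time-major, which is the
# layout the autoencoder consumes.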
class AudioVAE(torch.nn.Module):
"""High-level Audio VAE wrapper exposing encode and decode entry points."""
def __init__(self, state_dict: dict, metadata: dict):
super().__init__()
component_config = AudioVAEComponentConfig.from_metadata(metadata)
vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
self.vocoder = Vocoder(config=component_config.vocoder)
self.autoencoder.load_state_dict(vae_sd, strict=False)
self.vocoder.load_state_dict(vocoder_sd, strict=False)
autoencoder_config = self.autoencoder.get_config()
self.normalizer = AudioLatentNormalizer(
AudioPatchifier(
patch_size=1,
audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
sample_rate=autoencoder_config["sampling_rate"],
hop_length=autoencoder_config["mel_hop_length"],
is_causal=autoencoder_config["is_causal"],
),
self.autoencoder.per_channel_statistics,
)
self.preprocessor = AudioPreprocessor(
target_sample_rate=autoencoder_config["sampling_rate"],
mel_bins=autoencoder_config["mel_bins"],
mel_hop_length=autoencoder_config["mel_hop_length"],
n_fft=autoencoder_config["n_fft"],
)
self.device_manager = ModelDeviceManager(self)
def encode(self, audio: dict) -> torch.Tensor:
"""Encode a waveform dictionary into normalized latent tensors."""
waveform = audio["waveform"]
waveform_sample_rate = audio["sample_rate"]
input_device = waveform.device
# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()
waveform = self.device_manager.move_to_load_device(waveform)
expected_channels = self.autoencoder.encoder.in_channels
if waveform.shape[1] != expected_channels:
if waveform.shape[1] == 1:
waveform = waveform.expand(-1, expected_channels, *waveform.shape[2:])
else:
raise ValueError(
f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}"
)
mel_spec = self.preprocessor.waveform_to_mel(
waveform, waveform_sample_rate, device=self.device_manager.load_device
)
latents = self.autoencoder.encode(mel_spec)
posterior = DiagonalGaussianDistribution(latents)
latent_mode = posterior.mode()
normalized = self.normalizer.normalize(latent_mode)
return normalized.to(input_device)
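# Usage sketch (file name and loader call are placeholders, not from this file):
#   sd, metadata = utils.load_torch_file("ltx2_audio_vae.safetensors", return_metadata=True)
#   vae = AudioVAE(sd, metadata)
#   audio = {"waveform": torch.randn(1, 1, 16000), "sample_rate": 16000}  # 1 s of mono audio
#   latents = vae.encode(audio)  # (1, latent_channels, time, mel_bins // 4), normalized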
def decode(self, latents: torch.Tensor) -> torch.Tensor:
"""Decode normalized latent tensors into an audio waveform."""
original_shape = latents.shape
# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()
latents = self.device_manager.move_to_load_device(latents)
latents = self.normalizer.denormalize(latents)
target_shape = self.target_shape_from_latents(original_shape)
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
waveform = self.run_vocoder(mel_spec)
return self.device_manager.move_to_load_device(waveform)
def target_shape_from_latents(self, latents_shape):
batch, _, time, _ = latents_shape
target_length = time * LATENT_DOWNSAMPLE_FACTOR
if self.autoencoder.causality_axis != CausalityAxis.NONE:
target_length -= LATENT_DOWNSAMPLE_FACTOR - 1
return (
batch,
self.autoencoder.decoder.out_ch,
target_length,
self.autoencoder.mel_bins,
)
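# Worked example: for latents of shape (1, C, 25, F) with LATENT_DOWNSAMPLE_FACTOR = 4, the
# decoded mel target length is 25 * 4 = 100 frames, reduced to 100 - 3 = 97 when the
# autoencoder is causal (the first latent only covers a single mel frame). The returned
# shape is then (1, out_channels, 97, mel_bins).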
def num_of_latents_from_frames(self, frames_number: int, frame_rate: int) -> int:
return math.ceil((float(frames_number) / frame_rate) * self.latents_per_second)
def run_vocoder(self, mel_spec: torch.Tensor) -> torch.Tensor:
audio_channels = self.autoencoder.decoder.out_ch
vocoder_input = mel_spec.transpose(2, 3)
if audio_channels == 1:
vocoder_input = vocoder_input.squeeze(1)
elif audio_channels != 2:
raise ValueError(f"Unsupported audio_channels: {audio_channels}")
return self.vocoder(vocoder_input)
@property
def sample_rate(self) -> int:
return int(self.autoencoder.sampling_rate)
@property
def mel_hop_length(self) -> int:
return int(self.autoencoder.mel_hop_length)
@property
def mel_bins(self) -> int:
return int(self.autoencoder.mel_bins)
@property
def latent_channels(self) -> int:
return int(self.autoencoder.decoder.z_channels)
@property
def latent_frequency_bins(self) -> int:
return int(self.mel_bins // LATENT_DOWNSAMPLE_FACTOR)
@property
def latents_per_second(self) -> float:
return self.sample_rate / self.mel_hop_length / LATENT_DOWNSAMPLE_FACTOR
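# Worked example: with the default 16 kHz sample rate and hop_length=160 there are 100 mel
# frames per second, so latents_per_second = 16000 / 160 / 4 = 25.0; num_of_latents_from_frames
# therefore needs ceil(25 * duration_in_seconds) latents to cover a clip of that duration.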
@property
def output_sample_rate(self) -> int:
output_rate = getattr(self.vocoder, "output_sample_rate", None)
if output_rate is not None:
return int(output_rate)
upsample_factor = getattr(self.vocoder, "upsample_factor", None)
if upsample_factor is None:
raise AttributeError(
"Vocoder is missing upsample_factor; cannot infer output sample rate"
)
return int(self.sample_rate * upsample_factor / self.mel_hop_length)
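# Worked example (fallback path, upsample_factor value made up): a vocoder that upsamples each
# mel frame by 256 while the mel hop is 160 outputs at 16000 * 256 / 160 = 25600 Hz; when the
# vocoder exposes output_sample_rate directly, that value is used instead.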
def memory_required(self, input_shape):
return self.device_manager.patcher.model_size()

Some files were not shown because too many files have changed in this diff.