fix(assets): seed added_at past max(existing) to survive Windows clock collisions

The per-tag microsecond stagger preserves intra-batch order, but two back-to-back write batches on the same reference (e.g. set_reference_tags for path tags, then add_tags_to_reference for user tags) call get_utc_now() independently. On Windows the system clock can return the same datetime for both calls if no OS tick elapsed between the commits — both batches end up sharing microseconds and ORDER BY added_at, tag_name falls back to the alphabetic tiebreaker, sorting user tags ahead of path tags they were meant to follow. Add _next_added_at_base(reference_id) that reads max(existing added_at) and returns max(existing + 1us, get_utc_now()), guaranteeing the new batch sorts strictly after anything previously written for that reference. Used by set_reference_tags and add_tags_to_reference; batch_insert_seed_assets stays on raw get_utc_now() since seed inserts are always the first writes for a new reference. The accompanying regression test pins get_utc_now() to a frozen value so the previously-Windows-only race becomes a platform-independent failure mode under test.
fix(assets): expand standalone bucket tag for nested category paths
2026-05-22 01:00:07 +08:00 · 2026-05-20 20:33:39 -07:00 · 2026-05-20 20:33:39 -07:00 · 2026-05-20 19:20:33 -07:00 · 2026-05-20 17:17:55 -07:00 · 2026-05-20 17:03:58 -07:00
36 changed files with 2664 additions and 256 deletions
--- a/5
+++ b/5
@ -1,2 +1,5 @@
-# Admins
 * @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
+
+/CODEOWNERS @comfyanonymous
+/.ci/ @comfyanonymous
+/.github/ @comfyanonymous
--- a/app/assets/api/routes.py
+++ b/app/assets/api/routes.py
@ -401,12 +401,16 @@ async def upload_asset(request: web.Request) -> web.Response:
        )

    if spec.tags and spec.tags[0] == "models":
+        # tag[1] may be the standalone category ("checkpoints") or the
+        # slash-joined shape ("checkpoints/flux/...") that
+        # `get_name_and_tags_from_asset_path` and cloud both emit. Match
+        # `resolve_destination_from_tags` by extracting the first segment.
+        category = spec.tags[1].split("/", 1)[0] if len(spec.tags) >= 2 else ""
        if (
            len(spec.tags) < 2
-            or spec.tags[1] not in folder_paths.folder_names_and_paths
+            or category not in folder_paths.folder_names_and_paths
        ):
            delete_temp_file_if_exists(parsed.tmp_path)
-            category = spec.tags[1] if len(spec.tags) >= 2 else ""
            return _build_error_response(
                400, "INVALID_BODY", f"unknown models category '{category}'"
            )
--- a/app/assets/database/queries/asset_reference.py
+++ b/app/assets/database/queries/asset_reference.py
@ -327,7 +327,12 @@ def list_references_page(
            select(AssetReferenceTag.asset_reference_id, Tag.name)
            .join(Tag, Tag.name == AssetReferenceTag.tag_name)
            .where(AssetReferenceTag.asset_reference_id.in_(id_list))
-            .order_by(AssetReferenceTag.tag_name.asc())
+            # Preserve insertion order so the structural first tag (the root
+            # category like "models") stays in position 0 and the path-derived
+            # sub-path tag stays in position 1, matching cloud's behavior.
+            # tag_name is a deterministic tiebreaker when multiple tags share
+            # an added_at (same-batch insert via set_reference_tags).
+            .order_by(AssetReferenceTag.added_at.asc(), AssetReferenceTag.tag_name.asc())
        )
        for ref_id, tag_name in rows.all():
            tag_map[ref_id].append(tag_name)
@ -355,7 +360,8 @@ def fetch_reference_asset_and_tags(
            build_visible_owner_clause(owner_id),
        )
        .options(noload(AssetReference.tags))
-        .order_by(Tag.name.asc())
+        # See list_references_page for the rationale behind ordering by added_at.
+        .order_by(AssetReferenceTag.added_at.asc(), Tag.name.asc())
    )

    rows = session.execute(stmt).all()
--- a/app/assets/database/queries/tags.py
+++ b/app/assets/database/queries/tags.py
@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from datetime import datetime, timedelta
 from typing import Iterable, Sequence

 import sqlalchemy as sa
@ -20,7 +21,12 @@ from app.assets.database.queries.common import (
    build_visible_owner_clause,
    iter_row_chunks,
 )
-from app.assets.helpers import escape_sql_like_string, get_utc_now, normalize_tags
+from app.assets.helpers import (
+    escape_sql_like_string,
+    expand_bucket_prefixes,
+    get_utc_now,
+    normalize_tags,
+)


@dataclass(frozen=True)
@ -44,6 +50,26 @@ class SetTagsResult:
    total: list[str]


+def _next_added_at_base(session: Session, reference_id: str) -> datetime:
+    """Pick the base `added_at` for a new batch of tags on a reference.
+
+    The result is `get_utc_now()` when the reference has no tag rows
+    yet; otherwise it is the later of `get_utc_now()` and one
+    microsecond past the newest stored `added_at`. A new batch therefore
+    never ties with (or precedes) rows already written, even if the
+    clock did not advance between back-to-back writes.
+    """
+    newest_stmt = sa.select(sa.func.max(AssetReferenceTag.added_at)).where(
+        AssetReferenceTag.asset_reference_id == reference_id
+    )
+    newest = session.execute(newest_stmt).scalar()
+    current = get_utc_now()
+    if newest is not None:
+        # Step past the stored maximum so ORDER BY added_at stays strict.
+        return max(newest + timedelta(microseconds=1), current)
+    return current
+
+
 def validate_tags_exist(session: Session, tags: list[str]) -> None:
    """Raise ValueError if any of the given tag names do not exist."""
    existing_tag_names = set(
@ -77,7 +103,13 @@ def get_reference_tags(session: Session, reference_id: str) -> list[str]:
            session.execute(
                select(AssetReferenceTag.tag_name)
                .where(AssetReferenceTag.asset_reference_id == reference_id)
-                .order_by(AssetReferenceTag.tag_name.asc())
+                # Match the response-path ordering used by
+                # list_references_page / fetch_reference_asset_and_tags so
+                # upload responses and subsequent GETs agree on tag order.
+                .order_by(
+                    AssetReferenceTag.added_at.asc(),
+                    AssetReferenceTag.tag_name.asc(),
+                )
            )
        ).all()
    ]
@ -89,7 +121,7 @@ def set_reference_tags(
    tags: Sequence[str],
    origin: str = "manual",
 ) -> SetTagsResult:
-    desired = normalize_tags(tags)
+    desired = expand_bucket_prefixes(normalize_tags(tags))

    current = set(get_reference_tags(session, reference_id))

@ -98,15 +130,22 @@ def set_reference_tags(

    if to_add:
        ensure_tags_exist(session, to_add, tag_type="user")
+        # Stagger added_at by microsecond per tag so the retrieval ORDER BY
+        # added_at preserves input order. Per-tag get_utc_now() calls can
+        # collide at microsecond resolution on fast machines, dropping the
+        # query to the tag_name alphabetical tiebreaker — same fix as in
+        # batch_insert_seed_assets. Read max(existing) so this batch sorts
+        # strictly after any prior batch on the same reference.
+        base_ts = _next_added_at_base(session, reference_id)
        session.add_all(
            [
                AssetReferenceTag(
                    asset_reference_id=reference_id,
                    tag_name=t,
                    origin=origin,
-                    added_at=get_utc_now(),
+                    added_at=base_ts + timedelta(microseconds=i),
                )
-                for t in to_add
+                for i, t in enumerate(to_add)
            ]
        )
        session.flush()
@ -136,7 +175,7 @@ def add_tags_to_reference(
        if not ref:
            raise ValueError(f"AssetReference {reference_id} not found")

-    norm = normalize_tags(tags)
+    norm = expand_bucket_prefixes(normalize_tags(tags))
    if not norm:
        total = get_reference_tags(session, reference_id=reference_id)
        return AddTagsResult(added=[], already_present=[], total_tags=total)
@ -146,10 +185,17 @@ def add_tags_to_reference(

    current = set(get_reference_tags(session, reference_id))

+    # Preserve the caller's insertion order rather than alphabetizing —
+    # the retrieval ORDER BY added_at + microsecond stagger only meaningfully
+    # preserves insertion order if "the order we insert in" actually matches
+    # the caller's intent.
    want = set(norm)
-    to_add = sorted(want - current)
+    to_add = [t for t in norm if t not in current]

    if to_add:
+        # See set_reference_tags for the rationale behind the per-tag stagger
+        # and the max(existing) seed.
+        base_ts = _next_added_at_base(session, reference_id)
        with session.begin_nested() as nested:
            try:
                session.add_all(
@ -158,9 +204,9 @@ def add_tags_to_reference(
                            asset_reference_id=reference_id,
                            tag_name=t,
                            origin=origin,
-                            added_at=get_utc_now(),
+                            added_at=base_ts + timedelta(microseconds=i),
                        )
-                        for t in to_add
+                        for i, t in enumerate(to_add)
                    ]
                )
                session.flush()
--- a/app/assets/helpers.py
+++ b/app/assets/helpers.py
@ -47,6 +47,50 @@ def normalize_tags(tags: list[str] | None) -> list[str]:
    return list(dict.fromkeys(t.strip().lower() for t in (tags or []) if (t or "").strip()))


+def _known_bucket_prefixes() -> set[str]:
+    """Return the lowercased model-category names that qualify for
+    standalone-prefix expansion.
+
+    A tag whose first slash segment is one of these names gets that
+    bucket emitted as its own token, so FE filters such as
+    ``include_tags=models,checkpoints`` still match assets stored in a
+    nested subfolder (`models/checkpoints/flux/foo`). Labels whose first
+    segment is not a registered bucket (e.g. ``my-org/team-a``) are not
+    in this set. Any failure while reading the registry yields ``set()``.
+    """
+    try:
+        import folder_paths
+
+        names: set[str] = set()
+        for category in folder_paths.folder_names_and_paths.keys():
+            if category != "custom_nodes":
+                names.add(category.lower())
+        return names
+    except Exception:
+        return set()
+
+
+def expand_bucket_prefixes(tags: list[str]) -> list[str]:
+    """Insert standalone bucket tokens after any slash-joined tag whose
+    first segment is a registered model category. Preserves caller order
+    and is idempotent (existing bucket tokens are not duplicated).
+
+    The first segment is matched case-insensitively and the bucket is
+    always emitted in lowercase, so un-normalized input such as
+    ``Checkpoints/flux`` yields ``checkpoints`` rather than a mixed-case
+    token. Already-lowercased input is returned exactly as before.
+
+    Returns a new list; the input list is never mutated.
+    """
+    if not tags:
+        return list(tags)
+    buckets = _known_bucket_prefixes()
+    if not buckets:
+        return list(tags)
+    seen = set(tags)
+    result: list[str] = []
+    for t in tags:
+        result.append(t)
+        if "/" not in t:
+            continue
+        # Lowercase once and use the same value for the registry check,
+        # the de-dup check, and the emitted token.
+        bucket = t.split("/", 1)[0].lower()
+        if bucket in buckets and bucket not in seen:
+            result.append(bucket)
+            seen.add(bucket)
+    return result
+
+
 def validate_blake3_hash(s: str) -> str:
    """Validate and normalize a blake3 hash string.

--- a/app/assets/services/bulk_ingest.py
+++ b/app/assets/services/bulk_ingest.py
@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 import uuid
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime, timedelta
 from typing import TYPE_CHECKING, Any, TypedDict

 from sqlalchemy.orm import Session
@ -13,13 +13,14 @@ from app.assets.database.queries import (
    bulk_insert_references_ignore_conflicts,
    bulk_insert_tags_and_meta,
    delete_assets_by_ids,
+    ensure_tags_exist,
    get_existing_asset_ids,
    get_reference_ids_by_ids,
    get_references_by_paths_and_asset_ids,
    get_unreferenced_unhashed_asset_ids,
    restore_references_by_paths,
 )
-from app.assets.helpers import get_utc_now
+from app.assets.helpers import expand_bucket_prefixes, get_utc_now

 if TYPE_CHECKING:
    from app.assets.services.metadata_extract import ExtractedMetadata
@ -233,13 +234,20 @@ def batch_insert_seed_assets(
            if ref_id not in inserted_ref_ids:
                continue

-            for tag in ref_data["tags"]:
+            # Stagger added_at by microsecond per tag within a reference so
+            # the retrieval ORDER BY added_at preserves the input list order
+            # (the path-derived root category stays at position 0). Without
+            # this, every tag in a bulk-insert batch shares current_time and
+            # the tag_name tiebreaker sorts them alphabetically — putting the
+            # subpath tag ahead of "models" since "c"/"d"/"l" < "m".
+            ref_tags = expand_bucket_prefixes(ref_data["tags"])
+            for tag_idx, tag in enumerate(ref_tags):
                tag_rows.append(
                    {
                        "asset_reference_id": ref_id,
                        "tag_name": tag,
                        "origin": "automatic",
-                        "added_at": current_time,
+                        "added_at": current_time + timedelta(microseconds=tag_idx),
                    }
                )

@ -261,6 +269,16 @@ def batch_insert_seed_assets(
                    }
                )

+    if tag_rows:
+        # Bucket-prefix expansion may have introduced tags the caller did
+        # not register via the upstream tag_pool (e.g. `checkpoints` for a
+        # nested `checkpoints/flux/foo` path). Pre-register the full set so
+        # the AssetReferenceTag.tag_name FK is satisfied; the underlying
+        # insert is ON CONFLICT DO NOTHING so re-registration is idempotent.
+        ensure_tags_exist(
+            session, {row["tag_name"] for row in tag_rows}, tag_type="user"
+        )
+
    bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=metadata_rows)

    return BulkInsertResult(
--- a/app/assets/services/path_utils.py
+++ b/app/assets/services/path_utils.py
@ -3,7 +3,6 @@ from pathlib import Path
 from typing import Literal

 import folder_paths
-from app.assets.helpers import normalize_tags


 _NON_MODEL_FOLDER_NAMES = frozenset({"custom_nodes"})
@ -27,27 +26,51 @@ def get_comfy_models_folders() -> list[tuple[str, list[str]]]:


 def resolve_destination_from_tags(tags: list[str]) -> tuple[str, list[str]]:
-    """Validates and maps tags -> (base_dir, subdirs_for_fs)"""
+    """Validates and maps tags -> (base_dir, subdirs_for_fs).
+
+    Accepts both the legacy one-tag-per-directory shape
+    (``["models", "diffusers", "Kolors", "text_encoder"]``) and the
+    slash-joined shape emitted by :func:`get_name_and_tags_from_asset_path`
+    (``["models", "diffusers/Kolors/text_encoder"]``). Hybrid shapes that
+    mix the two within a single call (e.g.
+    ``["models", "diffusers", "Kolors/text_encoder"]``) are also
+    accepted: each entry after ``tags[0]`` is split on ``/`` and
+    concatenated, so the two shapes — and any mix of them — resolve to
+    the same destination. The same safety checks are applied to each
+    component after expansion.
+    """
    if not tags:
        raise ValueError("tags must not be empty")
    root = tags[0].lower()
+
+    # Expand any slash-joined entries into individual path components so
+    # the rest of the function can treat both tag shapes uniformly. Each
+    # component is also stripped, so " a / b " behaves like ["a", "b"].
+    expanded: list[str] = []
+    for t in tags[1:]:
+        for part in str(t).split("/"):
+            part = part.strip()
+            if part:
+                expanded.append(part)
+
    if root == "models":
-        if len(tags) < 2:
+        if not expanded:
            raise ValueError("at least two tags required for model asset")
+        category = expanded[0]
        try:
-            bases = folder_paths.folder_names_and_paths[tags[1]][0]
+            bases = folder_paths.folder_names_and_paths[category][0]
        except KeyError:
-            raise ValueError(f"unknown model category '{tags[1]}'")
+            raise ValueError(f"unknown model category '{category}'")
        if not bases:
-            raise ValueError(f"no base path configured for category '{tags[1]}'")
+            raise ValueError(f"no base path configured for category '{category}'")
        base_dir = os.path.abspath(bases[0])
-        raw_subdirs = tags[2:]
+        raw_subdirs = expanded[1:]
    elif root == "input":
        base_dir = os.path.abspath(folder_paths.get_input_directory())
-        raw_subdirs = tags[1:]
+        raw_subdirs = expanded
    elif root == "output":
        base_dir = os.path.abspath(folder_paths.get_output_directory())
-        raw_subdirs = tags[1:]
+        raw_subdirs = expanded
    else:
        raise ValueError(f"unknown root tag '{tags[0]}'; expected 'models', 'input', or 'output'")
    _sep_chars = frozenset(("/", "\\", os.sep))
@ -160,7 +183,21 @@ def get_name_and_tags_from_asset_path(file_path: str) -> tuple[str, list[str]]:
    """Return (name, tags) derived from a filesystem path.

    - name: base filename with extension
-    - tags: [root_category] + parent folder names in order
+    - tags: [root_category] for paths with no parent subdirectories,
+      [root_category, slash_joined_subpath] otherwise. The parent subpath
+      (everything between the root category and the filename) is collapsed
+      into a single tag rather than emitted as one tag per directory, so
+      consumers can use ``tags[1]`` as a stable category identifier that
+      survives nested directory layouts (e.g. diffusers components).
+
+      The subpath is lowercased to match the canonicalization applied by
+      :func:`ensure_tags_exist`; without that, the
+      ``asset_reference_tags.tag_name`` FK to the lowercased ``tags.name``
+      would fail for any path containing uppercase letters. The root
+      category is lowercase by construction in
+      :func:`get_asset_category_and_relative_path`, so no separate cast
+      is applied here. Consumers that need to look up providers keyed on
+      original-case paths should normalize their lookup key to lowercase.

    Raises:
        ValueError: path does not belong to any known root.
@ -170,4 +207,7 @@ def get_name_and_tags_from_asset_path(file_path: str) -> tuple[str, list[str]]:
    parent_parts = [
        part for part in p.parent.parts if part not in (".", "..", p.anchor)
    ]
-    return p.name, list(dict.fromkeys(normalize_tags([root_category, *parent_parts])))
+    tags = [root_category]
+    if parent_parts:
+        tags.append("/".join(parent_parts).lower())
+    return p.name, list(dict.fromkeys(t.strip() for t in tags if t.strip()))
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent

 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")

-CACHE_RAM_AUTO_GB = -1.0
-
 cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -245,6 +243,9 @@ if comfy.options.args_parsing:
 else:
    args = parser.parse_args([])

+if args.cache_ram is not None and len(args.cache_ram) > 2:
+    parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
+
 if args.windows_standalone_build:
    args.auto_launch = True

--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -484,16 +484,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori

    return weight

-def prefetch_prepared_value(value, allocate_buffer, stream):
+def prefetch_prepared_value(value, counter, destination, stream, copy):
    if isinstance(value, torch.Tensor):
-        dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value))
-        comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
+        size = comfy.memory_management.vram_aligned_size(value)
+        offset = counter[0]
+        counter[0] += size
+        if destination is None:
+            return value
+
+        dest = destination[offset:offset + size]
+        if copy:
+            comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
        return comfy.memory_management.interpret_gathered_like([value], dest)[0]
    elif isinstance(value, weight_adapter.WeightAdapterBase):
-        return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream))
+        return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy))
    elif isinstance(value, tuple):
-        return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value)
+        return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value)
    elif isinstance(value, list):
-        return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value]
+        return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value]

    return value
--- a/comfy/memory_management.py
+++ b/comfy/memory_management.py
@ -15,7 +15,7 @@ class TensorFileSlice(NamedTuple):
    size: int


-def read_tensor_file_slice_into(tensor, destination):
+def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None):

    if isinstance(tensor, QuantizedTensor):
        if not isinstance(destination, QuantizedTensor):
@ -23,12 +23,17 @@ def read_tensor_file_slice_into(tensor, destination):
        if tensor._layout_cls != destination._layout_cls:
            return False

-        if not read_tensor_file_slice_into(tensor._qdata, destination._qdata):
+        if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream,
+                                           destination2=(destination2._qdata if destination2 is not None else None)):
            return False

        dst_orig_dtype = destination._params.orig_dtype
        destination._params.copy_from(tensor._params, non_blocking=False)
        destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
+        if destination2 is not None:
+            dst_orig_dtype = destination2._params.orig_dtype
+            destination2._params.copy_from(destination._params, non_blocking=True)
+            destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype)
        return True

    info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None)
@ -48,6 +53,17 @@ def read_tensor_file_slice_into(tensor, destination):
    if info.size == 0:
        return True

+    hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
+    if hostbuf is not None:
+        stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
+        device_ptr = destination2.data_ptr() if destination2 is not None else 0
+        hostbuf.read_file_slice(file_obj, info.offset, info.size,
+                                offset=destination.data_ptr() - hostbuf.get_raw_address(),
+                                stream=stream_ptr,
+                                device_ptr=device_ptr,
+                                device=None if destination2 is None else destination2.device.index)
+        return True
+
    buf_type = ctypes.c_ubyte * info.size
    view = memoryview(buf_type.from_address(destination.data_ptr()))

@ -151,7 +167,7 @@ def set_ram_cache_release_state(callback, headroom):
    extra_ram_release_callback = callback
    RAM_CACHE_HEADROOM = max(0, int(headroom))

-def extra_ram_release(target):
+def extra_ram_release(target, free_active=False):
    if extra_ram_release_callback is None:
        return 0
-    return extra_ram_release_callback(target)
+    return extra_ram_release_callback(target, free_active=free_active)
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -31,6 +31,7 @@ from contextlib import nullcontext
 import comfy.memory_management
 import comfy.utils
 import comfy.quant_ops
+import comfy_aimdo.host_buffer
 import comfy_aimdo.vram_buffer

 class VRAMState(Enum):
@ -495,6 +496,14 @@ except:

 current_loaded_models = []

+DIRTY_MMAPS = set()
+
+PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024
+
+#Freeing registerables on pressure does imply a GPU sync, so go big on
+#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 2048 * 1024 * 1024
+
 def module_size(module):
    module_mem = 0
    sd = module.state_dict()
@ -503,27 +512,46 @@ def module_size(module):
        module_mem += t.nbytes
    return module_mem

-def module_mmap_residency(module, free=False):
-    mmap_touched_mem = 0
-    module_mem = 0
-    bounced_mmaps = set()
-    sd = module.state_dict()
-    for k in sd:
-        t = sd[k]
-        module_mem += t.nbytes
-        storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
-        if not getattr(storage, "_comfy_tensor_mmap_touched", False):
-            continue
-        mmap_touched_mem += t.nbytes
-        if not free:
-            continue
-        storage._comfy_tensor_mmap_touched = False
-        mmap_obj = storage._comfy_tensor_mmap_refs[0]
-        if mmap_obj in bounced_mmaps:
-            continue
-        mmap_obj.bounce()
-        bounced_mmaps.add(mmap_obj)
-    return mmap_touched_mem, module_mem
+def mark_mmap_dirty(storage):
+    """Add the first entry of `storage._comfy_tensor_mmap_refs` to DIRTY_MMAPS.
+
+    Storages that do not carry that attribute (or carry None) are ignored.
+    """
+    refs = getattr(storage, "_comfy_tensor_mmap_refs", None)
+    if refs is None:
+        return
+    # presumably refs[0] is the backing mmap object — confirm against the loader
+    DIRTY_MMAPS.add(refs[0])
+
+def free_pins(size, evict_active=False):
+    """Call `partially_unload_ram` on dynamic loaded models until `size` is covered.
+
+    Walks `current_loaded_models` from the end of the list towards the
+    front and stops once the outstanding amount drops to zero or below.
+    Models whose `dynamic_pins[load_device]["active"]` flag is set are
+    skipped unless `evict_active` is true.
+
+    Returns the sum of the values returned by `partially_unload_ram`.
+    """
+    remaining = size
+    total = 0
+    for entry in reversed(current_loaded_models):
+        if remaining <= 0:
+            break
+        patcher = entry.model
+        if patcher is None or not patcher.is_dynamic():
+            continue
+        if not evict_active and patcher.model.dynamic_pins[patcher.load_device]["active"]:
+            continue
+        released = patcher.partially_unload_ram(remaining)
+        total += released
+        remaining -= released
+    return total
+
+def ensure_pin_budget(size, evict_active=False):
+    """Check that `size` plus half of RAM_CACHE_HEADROOM fits in available RAM.
+
+    Returns True immediately when it already fits. Otherwise asks
+    `free_pins` for the shortfall plus PIN_PRESSURE_HYSTERESIS and
+    returns whether at least the shortfall itself was freed.
+    """
+    reserve = comfy.memory_management.RAM_CACHE_HEADROOM / 2
+    available = psutil.virtual_memory().available
+    shortfall = size + reserve - available
+    if shortfall > 0:
+        # Over-ask by the hysteresis, but only the bare shortfall must be met.
+        freed = free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active)
+        return freed >= shortfall
+    return True
+
+def ensure_pin_registerable(size, evict_active=False):
+    """Check that `size` more can be added under MAX_PINNED_MEMORY.
+
+    Returns False when MAX_PINNED_MEMORY is zero or negative, and True
+    when TOTAL_PINNED_MEMORY + size already fits. Otherwise calls
+    `unregister_inactive_pins` on dynamic loaded models (end of
+    `current_loaded_models` first), asking for the shortfall plus
+    REGISTERABLE_PIN_HYSTERESIS. Models whose
+    `dynamic_pins[load_device]["active"]` flag is set are skipped unless
+    `evict_active` is true. The final result is True when at least the
+    original shortfall was recovered.
+    """
+    needed = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    if MAX_PINNED_MEMORY <= 0:
+        return False
+    if needed <= 0:
+        return True
+
+    outstanding = needed + REGISTERABLE_PIN_HYSTERESIS
+    for entry in reversed(current_loaded_models):
+        patcher = entry.model
+        if patcher is None or not patcher.is_dynamic():
+            continue
+        if not evict_active and patcher.model.dynamic_pins[patcher.load_device]["active"]:
+            continue
+        # assumes unregister_inactive_pins returns the amount it released — confirm
+        outstanding -= patcher.unregister_inactive_pins(outstanding)
+        if outstanding <= 0:
+            return True
+    # Whatever is left within the hysteresis margin means the real need was met.
+    return outstanding <= REGISTERABLE_PIN_HYSTERESIS

 class LoadedModel:
    def __init__(self, model):
@ -553,9 +581,6 @@ class LoadedModel:
    def model_memory(self):
        return self.model.model_size()

-    def model_mmap_residency(self, free=False):
-        return self.model.model_mmap_residency(free=free)
-
    def model_loaded_memory(self):
        return self.model.loaded_size()

@ -635,15 +660,9 @@ WINDOWS = any(platform.win32_ver())

 EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
 if WINDOWS:
-    import comfy.windows
    EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
    if total_vram > (15 * 1024):  # more extra reserved vram on 16GB+ cards
        EXTRA_RESERVED_VRAM += 100 * 1024 * 1024
-    def get_free_ram():
-        return comfy.windows.get_free_ram()
-else:
-    def get_free_ram():
-        return psutil.virtual_memory().available

 if args.reserve_vram is not None:
    EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
@ -657,7 +676,6 @@ def minimum_inference_memory():

 def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0):
    cleanup_models_gc()
-    comfy.memory_management.extra_ram_release(max(pins_required, ram_required))
    unloaded_model = []
    can_unload = []
    unloaded_models = []
@ -673,11 +691,9 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
    for x in can_unload_sorted:
        i = x[-1]
        memory_to_free = 1e32
-        pins_to_free = 1e32
-        if not DISABLE_SMART_MEMORY or device is None:
+        if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None):
            memory_to_free = 0 if device is None else memory_required - get_free_memory(device)
-            pins_to_free = pins_required - get_free_ram()
-            if current_loaded_models[i].model.is_dynamic() and for_dynamic:
+            if for_dynamic:
                #don't actually unload dynamic models for the sake of other dynamic models
                #as that works on-demand.
                memory_required -= current_loaded_models[i].model.loaded_size()
@ -685,18 +701,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
        if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free):
            logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
            unloaded_model.append(i)
-        if pins_to_free > 0:
-            logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}")
-            current_loaded_models[i].model.partially_unload_ram(pins_to_free)
-
-    for x in can_unload_sorted:
-        i = x[-1]
-        ram_to_free = ram_required - psutil.virtual_memory().available
-        if ram_to_free <= 0 and i not in unloaded_model:
-            continue
-        resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True)
-        if resident_memory > 0:
-            logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}")

    for i in sorted(unloaded_model, reverse=True):
        unloaded_models.append(current_loaded_models.pop(i))
@ -762,29 +766,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
            model_to_unload.model.detach(unpatch_all=False)
            model_to_unload.model_finalizer.detach()

-
    total_memory_required = {}
-    total_pins_required = {}
-    total_ram_required = {}
    for loaded_model in models_to_load:
        device = loaded_model.device
        total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device)
-        resident_memory, model_memory = loaded_model.model_mmap_residency()
-        pinned_memory = loaded_model.model.pinned_memory_size()
-        #FIXME: This can over-free the pins as it budgets to pin the entire model. We should
-        #make this JIT to keep as much pinned as possible.
-        pins_required = model_memory - pinned_memory
-        ram_required = model_memory - resident_memory
-        total_pins_required[device] = total_pins_required.get(device, 0) + pins_required
-        total_ram_required[device] = total_ram_required.get(device, 0) + ram_required

    for device in total_memory_required:
        if device != torch.device("cpu"):
            free_memory(total_memory_required[device] * 1.1 + extra_mem,
                        device,
-                        for_dynamic=free_for_dynamic,
-                        pins_required=total_pins_required[device],
-                        ram_required=total_ram_required[device])
+                        for_dynamic=free_for_dynamic)

    for device in total_memory_required:
        if device != torch.device("cpu"):
@ -1180,6 +1171,7 @@ STREAM_CAST_BUFFERS = {}
 LARGEST_CASTED_WEIGHT = (None, 0)
 STREAM_AIMDO_CAST_BUFFERS = {}
 LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
+STREAM_PIN_BUFFERS = {}

 DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3

@ -1220,21 +1212,66 @@ def get_aimdo_cast_buffer(offload_stream, device):
    if cast_buffer is None:
        cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index)
        STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer
-
    return cast_buffer
+
+def get_pin_buffer(offload_stream):
+    pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None)
+    if pin_buffer is None:
+        pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3))
+        STREAM_PIN_BUFFERS[offload_stream] = pin_buffer
+    elif offload_stream is not None:
+        event = getattr(pin_buffer, "_comfy_event", None)
+        if event is not None:
+            event.synchronize()
+            delattr(pin_buffer, "_comfy_event")
+    return pin_buffer
+
+def resize_pin_buffer(pin_buffer, size):
+    global TOTAL_PINNED_MEMORY
+    old_size = pin_buffer.size
+    if size <= old_size:
+        return True
+    growth = size - old_size
+    comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+    ensure_pin_budget(growth, evict_active=True)
+    ensure_pin_registerable(growth, evict_active=True)
+    try:
+        pin_buffer.extend(size=size, reallocate=True)
+    except RuntimeError:
+        return False
+    TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
+    return True
+
 def reset_cast_buffers():
+    global TOTAL_PINNED_MEMORY
    global LARGEST_CASTED_WEIGHT
    global LARGEST_AIMDO_CASTED_WEIGHT

    LARGEST_CASTED_WEIGHT = (None, 0)
    LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS):
+    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS):
        if offload_stream is not None:
            offload_stream.synchronize()
    synchronize()

+    for mmap_obj in DIRTY_MMAPS:
+        mmap_obj.bounce()
+    DIRTY_MMAPS.clear()
+
+    for pin_buffer in STREAM_PIN_BUFFERS.values():
+        TOTAL_PINNED_MEMORY -= pin_buffer.size
+    TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)
+
+    for loaded_model in current_loaded_models:
+        model = loaded_model.model
+        if model is not None and model.is_dynamic():
+            model.model.dynamic_pins[model.load_device]["active"] = False
+            model.partially_unload_ram(1e30, subsets=[ "patches" ])
+            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0])
+
    STREAM_CAST_BUFFERS.clear()
    STREAM_AIMDO_CAST_BUFFERS.clear()
+    STREAM_PIN_BUFFERS.clear()
    soft_empty_cache()

 def get_offload_stream(device):
@ -1280,7 +1317,7 @@ def sync_stream(device, stream):
    current_stream(device).wait_stream(stream)


-def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
+def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None):
    wf_context = nullcontext()
    if stream is not None:
       wf_context = stream
@ -1288,17 +1325,20 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
           wf_context = wf_context.as_context(stream)

    dest_views = comfy.memory_management.interpret_gathered_like(tensors, r)
+    dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None
    with wf_context:
        for tensor in tensors:
            dest_view = dest_views.pop(0)
+            dest2_view = dest2_views.pop(0) if dest2_views is not None else None
            if tensor is None:
                continue
-            if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
+            if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view, stream=stream, destination2=dest2_view):
                continue
            storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
-            if hasattr(storage, "_comfy_tensor_mmap_touched"):
-                storage._comfy_tensor_mmap_touched = True
+            mark_mmap_dirty(storage)
            dest_view.copy_(tensor, non_blocking=non_blocking)
+            if dest2_view is not None:
+                dest2_view.copy_(dest_view, non_blocking=non_blocking)


 def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None):
@ -1339,14 +1379,18 @@ TOTAL_PINNED_MEMORY = 0
 MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
    if is_nvidia() or is_amd():
+        ram = get_total_memory(torch.device("cpu"))
        if WINDOWS:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40  # Windows limit is apparently 50%
+            MAX_PINNED_MEMORY = ram * 0.40  # Windows limit is apparently 50%
        else:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
+            MAX_PINNED_MEMORY = ram * 0.90
        logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))

 PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])

+def pinned_hostbuf_size(size):
+    return max(0, int(min(size, MAX_PINNED_MEMORY) * 2))
+
 def discard_cuda_async_error():
    try:
        a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
@ -1378,8 +1422,8 @@ def pin_memory(tensor):
        return False

    size = tensor.nbytes
-    if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY:
-        return False
+    comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+    ensure_pin_registerable(size)

    ptr = tensor.data_ptr()
    if ptr == 0:
@ -1416,7 +1460,8 @@ def unpin_memory(tensor):
        return False

    if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
-        TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
+        size = PINNED_MEMORY.pop(ptr)
+        TOTAL_PINNED_MEMORY -= size
        return True
    else:
        logging.warning("Unpin error.")
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -35,6 +35,7 @@ import comfy.model_management
 import comfy.ops
 import comfy.patcher_extension
 import comfy.utils
+import comfy_aimdo.host_buffer
 from comfy.comfy_types import UnetWrapperFunction
 from comfy.quant_ops import QuantizedTensor
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
@ -117,6 +118,8 @@ def string_to_seed(data):
    return comfy.utils.string_to_seed(data)

 class LowVramPatch:
+    is_lowvram_patch = True
+
    def __init__(self, key, patches, convert_func=None, set_func=None):
        self.key = key
        self.patches = patches
@ -124,11 +127,21 @@ class LowVramPatch:
        self.set_func = set_func
        self.prepared_patches = None

-    def prepare(self, allocate_buffer, stream):
-        self.prepared_patches = [
-            (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4])
+    def memory_required(self):
+        counter = [0]
+        for patch in self.patches[self.key]:
+            comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False)
+        return counter[0]
+
+    def prepare(self, destination, stream, copy=True, commit=True):
+        counter = [0]
+        prepared_patches = [
+            (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4])
            for patch in self.patches[self.key]
        ]
+        if commit:
+            self.prepared_patches = prepared_patches
+        return prepared_patches

    def clear_prepared(self):
        self.prepared_patches = None
@ -341,9 +354,6 @@ class ModelPatcher:
        self.size = comfy.model_management.module_size(self.model)
        return self.size

-    def model_mmap_residency(self, free=False):
-        return comfy.model_management.module_mmap_residency(self.model, free=free)
-
    def loaded_size(self):
        return self.model.model_loaded_weight_memory

@ -1118,8 +1128,12 @@ class ModelPatcher:
        # Pinned memory pressure tracking is only implemented for DynamicVram loading
        return 0

+    def loaded_ram_size(self):
+        # Loaded RAM pressure tracking is only implemented for DynamicVram loading
+        return 0
+
    def partially_unload_ram(self, ram_to_unload):
-        pass
+        return 0

    def detach(self, unpatch_all=True):
        self.eject_model()
@ -1550,6 +1564,16 @@ class ModelPatcherDynamic(ModelPatcher):
        super().__init__(model, load_device, offload_device, size, weight_inplace_update)
        if not hasattr(self.model, "dynamic_vbars"):
            self.model.dynamic_vbars = {}
+        if not hasattr(self.model, "dynamic_pins"):
+            self.model.dynamic_pins = {}
+        if self.load_device not in self.model.dynamic_pins:
+            self.model.dynamic_pins[self.load_device] = {
+                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
+                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
+                "hostbufs_initialized": False,
+                "failed": False,
+                "active": False,
+            }
        self.non_dynamic_delegate_model = None
        assert load_device is not None

@ -1611,6 +1635,14 @@ class ModelPatcherDynamic(ModelPatcher):
            self.unpatch_hooks()

            vbar = self._vbar_get(create=True)
+            pin_state = self.model.dynamic_pins[self.load_device]
+            if not pin_state["hostbufs_initialized"]:
+                hostbuf_size = comfy.model_management.pinned_hostbuf_size(self.model_size())
+                pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0])
+                pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0])
+                pin_state["hostbufs_initialized"] = True
+            pin_state["failed"] = False
+            pin_state["active"] = True
            if vbar is not None:
                vbar.prioritize()

@ -1636,7 +1668,9 @@ class ModelPatcherDynamic(ModelPatcher):
                    if key in self.patches:
                        if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape:
                            return (True, 0)
-                        setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches))
+                        lowvram_patch = LowVramPatch(key, self.patches)
+                        lowvram_patch._pin_state = pin_state
+                        setattr(m, param_key + "_lowvram_function", lowvram_patch)
                        num_patches += 1
                    else:
                        setattr(m, param_key + "_lowvram_function", None)
@ -1653,6 +1687,9 @@ class ModelPatcherDynamic(ModelPatcher):

                def force_load_param(self, param_key, device_to):
                    key = key_param_name_to_key(n, param_key)
+                    weight, _, _ = get_key_weight(self.model, key)
+                    if weight is None:
+                        return
                    if key in self.backup:
                        comfy.utils.set_attr_param(self.model, key, self.backup[key].weight)
                    self.patch_weight_to_device(key, device_to=device_to, force_cast=True)
@ -1662,17 +1699,23 @@ class ModelPatcherDynamic(ModelPatcher):

                if hasattr(m, "comfy_cast_weights"):
                    m.comfy_cast_weights = True
-                    m.pin_failed = False
                    m.seed_key = n
+                    m._pin_state = pin_state
                    set_dirty(m, dirty)

-                    force_load, v_weight_size = setup_param(self, m, n, "weight")
-                    force_load_bias, v_weight_bias = setup_param(self, m, n, "bias")
-                    force_load = force_load or force_load_bias
-                    v_weight_size += v_weight_bias
+                    #Models that mix tiny and giant weights can cause lopsided stream buffer
+                    #rotations and stalls. Force the tiny ones over.
+                    if module_mem > 16 * 1024:
+                        force_load, v_weight_size = setup_param(self, m, n, "weight")
+                        force_load_bias, v_weight_bias = setup_param(self, m, n, "bias")
+                        force_load = force_load or force_load_bias
+                        v_weight_size += v_weight_bias
+                        if force_load:
+                            logging.info(f"Module {n} has resizing Lora - force loading")
+                    else:
+                        force_load=True

                    if force_load:
-                        logging.info(f"Module {n} has resizing Lora - force loading")
                        force_load_param(self, "weight", device_to)
                        force_load_param(self, "bias", device_to)
                    else:
@ -1740,23 +1783,58 @@ class ModelPatcherDynamic(ModelPatcher):

        return freed

-    def pinned_memory_size(self):
-        total = 0
-        loading = self._load_list(for_dynamic=True)
-        for x in loading:
-            _, _, _, _, m, _ = x
-            pin = comfy.pinned_memory.get_pin(m)
-            if pin is not None:
-                total += pin.numel() * pin.element_size()
-        return total
+    def loaded_ram_size(self):
+        return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
+                self.model.dynamic_pins[self.load_device]["patches"][0].size)

-    def partially_unload_ram(self, ram_to_unload):
-        loading = self._load_list(for_dynamic=True, default_device=self.offload_device)
-        for x in loading:
-            *_, m, _ = x
-            ram_to_unload -= comfy.pinned_memory.unpin_memory(m)
-            if ram_to_unload <= 0:
-                return
+    def pinned_memory_size(self):
+        return (self.model.dynamic_pins[self.load_device]["weights"][3][0] +
+                self.model.dynamic_pins[self.load_device]["patches"][3][0])
+
+    def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+        freed = 0
+        pin_state = self.model.dynamic_pins[self.load_device]
+        for subset in subsets:
+            hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+            split = stack_split[0]
+            while split >= 0:
+                module, offset = stack[split]
+                split -= 1
+                stack_split[0] = split
+                if not module._pin_registered:
+                    continue
+                size = module._pin.numel() * module._pin.element_size()
+                if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+                    comfy.model_management.discard_cuda_async_error()
+                    continue
+                module._pin_registered = False
+                comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+                pinned_size[0] = max(0, pinned_size[0] - size)
+                freed += size
+                ram_to_unload -= size
+                if ram_to_unload <= 0:
+                    return freed
+        return freed
+
+    def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+        freed = 0
+        pin_state = self.model.dynamic_pins[self.load_device]
+        for subset in subsets:
+            hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+            while len(stack) > 0:
+                module, offset = stack.pop()
+                size = module._pin.numel() * module._pin.element_size()
+                del module._pin
+                hostbuf.truncate(offset, do_unregister=module._pin_registered)
+                stack_split[0] = min(stack_split[0], len(stack) - 1)
+                if module._pin_registered:
+                    comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+                    pinned_size[0] = max(0, pinned_size[0] - size)
+                freed += size
+                ram_to_unload -= size
+                if ram_to_unload <= 0:
+                    return freed
+        return freed

    def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
        #This isn't used by the core at all and can only be to load a model out of
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -75,6 +75,8 @@ except:

 cast_to = comfy.model_management.cast_to #TODO: remove once no more references

+STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024
+
 def cast_to_input(weight, input, non_blocking=False, copy=True):
    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

@ -91,6 +93,9 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
    offload_stream = None
    cast_buffer = None
    cast_buffer_offset = 0
+    stream_pin_hostbuf = None
+    stream_pin_offset = 0
+    stream_pin_queue = []

    def ensure_offload_stream(module, required_size, check_largest):
        nonlocal offload_stream
@ -124,6 +129,22 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
        cast_buffer_offset += buffer_size
        return buffer

+    def get_stream_pin_buffer_offset(buffer_size):
+        nonlocal stream_pin_hostbuf
+        nonlocal stream_pin_offset
+
+        if buffer_size == 0 or offload_stream is None:
+            return None
+
+        if stream_pin_hostbuf is None:
+            stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream)
+            if stream_pin_hostbuf is None:
+                return None
+
+        offset = stream_pin_offset
+        stream_pin_offset += buffer_size
+        return offset
+
    for s in comfy_modules:
        signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
        resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
@ -162,23 +183,47 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
        if xfer_dest is None:
            xfer_dest = get_cast_buffer(dest_size)

-        if signature is None and pin is None:
-            comfy.pinned_memory.pin_memory(s)
-            pin = comfy.pinned_memory.get_pin(s)
-        else:
-            pin = None
+        def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream):
+            if xfer_source is not None:
+                if getattr(xfer_source, "is_lowvram_patch", False):
+                    xfer_source.prepare(xfer_dest, stream, copy=True, commit=False)
+                else:
+                    comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream)

-        if pin is not None:
-            comfy.model_management.cast_to_gathered(xfer_source, pin)
-            xfer_source = [ pin ]
-        #send it over
-        comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+        def handle_pin(m, pin, source, dest, subset="weights", size=None):
+            if pin is not None:
+                cast_maybe_lowvram_patch([pin], dest, offload_stream)
+                return
+            if signature is None:
+                comfy.pinned_memory.pin_memory(m, subset=subset, size=size)
+                pin = comfy.pinned_memory.get_pin(m, subset=subset)
+                if pin is not None:
+                    if isinstance(source, list):
+                        comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest)
+                    else:
+                        cast_maybe_lowvram_patch(source, pin, None)
+                        cast_maybe_lowvram_patch([ pin ], dest, offload_stream)
+                    return
+            if pin is None:
+                pin_offset = get_stream_pin_buffer_offset(size)
+                if pin_offset is not None:
+                    stream_pin_queue.append((source, pin_offset, size, dest))
+                    return
+            cast_maybe_lowvram_patch(source, dest, offload_stream)
+
+        handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size)

        for param_key in ("weight", "bias"):
-            lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-            if lowvram_fn is not None:
+            lowvram_source = getattr(s, param_key + "_lowvram_function", None)
+            if lowvram_source is not None:
                ensure_offload_stream(s, cast_buffer_offset, False)
-                lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream)
+                lowvram_size = lowvram_source.memory_required()
+                lowvram_dest = get_cast_buffer(lowvram_size)
+                lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True)
+
+                pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches")
+                handle_pin(lowvram_source, pin, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size)
+

        prefetch["xfer_dest"] = xfer_dest
        prefetch["cast_dest"] = cast_dest
@ -186,6 +231,23 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
        prefetch["needs_cast"] = needs_cast
        s._prefetch = prefetch

+    if stream_pin_offset > 0:
+        if stream_pin_hostbuf.size < stream_pin_offset:
+            if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM):
+                for xfer_source, _, _, xfer_dest in stream_pin_queue:
+                    cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream)
+                return offload_stream
+        stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf)
+        stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf
+        for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue:
+            pin = stream_pin_tensor[pin_offset:pin_offset + pin_size]
+            if isinstance(xfer_source, list):
+                comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest)
+            else:
+                cast_maybe_lowvram_patch(xfer_source, pin, None)
+                comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+        stream_pin_hostbuf._comfy_event = offload_stream.record_event()
+
    return offload_stream


--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@ -2,42 +2,62 @@ import comfy.model_management
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
+import torch

 from comfy.cli_args import args

-def get_pin(module):
-    return getattr(module, "_pin", None)
+def get_pin(module, subset="weights"):
+    pin = getattr(module, "_pin", None)
+    if pin is None or module._pin_registered or args.disable_pinned_memory:
+        return pin

-def pin_memory(module):
-    if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None:
+    _, _, stack_split, pinned_size = module._pin_state[subset]
+    size = pin.nbytes
+    comfy.model_management.ensure_pin_registerable(size)
+
+    if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+        comfy.model_management.discard_cuda_async_error()
+        return pin
+
+    module._pin_registered = True
+    stack_split[0] = max(stack_split[0], module._pin_stack_index)
+    comfy.model_management.TOTAL_PINNED_MEMORY += size
+    pinned_size[0] += size
+    return pin
+
+def pin_memory(module, subset="weights", size=None):
+    pin_state = module._pin_state
+    if args.disable_pinned_memory:
        return

-    size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
+    pin = get_pin(module, subset)
+    if pin is not None or pin_state["failed"]:
+        return

-    if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY:
-        module.pin_failed = True
+    hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+    if size is None:
+        size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
+    offset = hostbuf.size
+    registerable_size = size + max(0, hostbuf.size - pinned_size[0])
+
+    comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+    if (not comfy.model_management.ensure_pin_budget(size) or
+        not comfy.model_management.ensure_pin_registerable(registerable_size)):
+        pin_state["failed"] = True
        return False

    try:
-        hostbuf = comfy_aimdo.host_buffer.HostBuffer(size)
+        hostbuf.extend(size=size)
    except RuntimeError:
-        module.pin_failed = True
+        pin_state["failed"] = True
        return False

-    module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)
-    module._pin_hostbuf = hostbuf
+    module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
+    module._pin.untyped_storage()._comfy_hostbuf = hostbuf
+    stack.append((module, offset))
+    module._pin_registered = True
+    module._pin_stack_index = len(stack) - 1
+    stack_split[0] = max(stack_split[0], module._pin_stack_index)
    comfy.model_management.TOTAL_PINNED_MEMORY += size
+    pinned_size[0] += size
    return True
-
-def unpin_memory(module):
-    if get_pin(module) is None:
-        return 0
-    size = module._pin.numel() * module._pin.element_size()
-
-    comfy.model_management.TOTAL_PINNED_MEMORY -= size
-    if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
-        comfy.model_management.TOTAL_PINNED_MEMORY = 0
-
-    del module._pin
-    del module._pin_hostbuf
-    return size
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -113,7 +113,6 @@ def load_safetensors(ckpt):
                        "_comfy_tensor_file_slice",
                        comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start))
                setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv))
-                setattr(storage, "_comfy_tensor_mmap_touched", False)
                sd[name] = tensor

    return sd, header.get("__metadata__", {}),
@ -1451,4 +1450,3 @@ def deepcopy_list_dict(obj, memo=None):

    memo[obj_id] = res
    return res
-
--- a/comfy/windows.py
+++ b/comfy/windows.py
@ -1,52 +0,0 @@
-import ctypes
-import logging
-import psutil
-from ctypes import wintypes
-
-import comfy_aimdo.control
-
-psapi = ctypes.WinDLL("psapi")
-kernel32 = ctypes.WinDLL("kernel32")
-
-class PERFORMANCE_INFORMATION(ctypes.Structure):
-    _fields_ = [
-        ("cb", wintypes.DWORD),
-        ("CommitTotal", ctypes.c_size_t),
-        ("CommitLimit", ctypes.c_size_t),
-        ("CommitPeak", ctypes.c_size_t),
-        ("PhysicalTotal", ctypes.c_size_t),
-        ("PhysicalAvailable", ctypes.c_size_t),
-        ("SystemCache", ctypes.c_size_t),
-        ("KernelTotal", ctypes.c_size_t),
-        ("KernelPaged", ctypes.c_size_t),
-        ("KernelNonpaged", ctypes.c_size_t),
-        ("PageSize", ctypes.c_size_t),
-        ("HandleCount", wintypes.DWORD),
-        ("ProcessCount", wintypes.DWORD),
-        ("ThreadCount", wintypes.DWORD),
-    ]
-
-def get_free_ram():
-    #Windows is way too conservative and chalks recently used uncommitted model RAM
-    #as "in-use". So, calculate free RAM for the sake of general use as the greater of:
-    #
-    #1: What psutil says
-    #2: Total Memory - (Committed Memory - VRAM in use)
-    #
-    #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked
-    #commit charge for all VRAM used just incase it wants to page it all out. This just
-    #isn't realistic so "overcommit" on our calculations by just subtracting it off.
-
-    pi = PERFORMANCE_INFORMATION()
-    pi.cb = ctypes.sizeof(pi)
-
-    if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb):
-        logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal")
-        return psutil.virtual_memory().available
-
-    committed = pi.CommitTotal * pi.PageSize
-    total = pi.PhysicalTotal * pi.PageSize
-
-    return max(psutil.virtual_memory().available,
-               total - (committed - comfy_aimdo.control.get_total_vram_usage()))
-
--- a/comfy_extras/mediapipe/face_geometry.py
+++ b/comfy_extras/mediapipe/face_geometry.py
@ -0,0 +1,111 @@
+"""Pure-numpy port of MediaPipe's face_geometry (FACE_LANDMARK_PIPELINE mode)
+ weighted Procrustes solver. Computes the 4x4 facial transformation matrix.
+"""
+
+from __future__ import annotations
+
+import math
+import numpy as np
+
+
+def _solve_weighted_orthogonal_problem(src: np.ndarray, tgt: np.ndarray, weights: np.ndarray) -> np.ndarray:
+    """Weighted orthogonal Procrustes (similarity). Returns 4x4 M with
+    `target ≈ M @ homogeneous(source)` in the weighted LS sense. fp64 for
+    SVD stability. Port of procrustes_solver.cc.
+
+    Args:
+        src: source points stored column-wise, shape (3, N).
+        tgt: target points, same layout as `src`.
+        weights: one weight per point, shape (N,).
+
+    Returns:
+        float64 (4, 4) matrix: `scale * R` in the upper-left 3x3, the
+        translation in the last column, identity elsewhere.
+
+    Raises:
+        ValueError: if any weight is negative or non-finite, if the weights
+            sum to zero, or if the centred weighted source is degenerate.
+    """
+    w = weights.astype(np.float64)
+    # Reject unusable weights up front: the sqrt of a negative weight or a
+    # zero total would turn every intermediate into NaN and only surface
+    # later (if at all) as an unrelated SVD failure.
+    if not np.isfinite(w).all() or (w < 0.0).any():
+        raise ValueError("Procrustes weights must be finite and non-negative.")
+    sqrt_w = np.sqrt(w)
+    w_total = float((sqrt_w ** 2).sum())
+    if w_total <= 0.0:
+        raise ValueError("Procrustes weights must sum to a positive value.")
+    ws = src.astype(np.float64) * sqrt_w
+    wt = tgt.astype(np.float64) * sqrt_w
+
+    # Weighted centroid of the source, then the centred (still weighted) source.
+    c_w = (ws @ sqrt_w) / w_total
+    centered = ws - np.outer(c_w, sqrt_w)
+    U, _S, Vt = np.linalg.svd(wt @ centered.T, full_matrices=True)
+    # Disallow reflection: flip the least-significant axis when det(U)·det(V)<0.
+    post, pre = U.copy(), Vt.T.copy()
+    if np.linalg.det(post) * np.linalg.det(pre) < 0:
+        post[:, 2] *= -1.0
+    R = post @ pre.T
+
+    denom = float((centered * ws).sum())
+    if denom < 1e-12:
+        raise ValueError("Procrustes denominator collapsed (degenerate source).")
+    scale = float((R @ centered * wt).sum()) / denom
+    # Translation is the weighted mean of the residual left after rotation and scale.
+    translation = ((wt - scale * (R @ ws)) @ sqrt_w) / w_total
+
+    M = np.eye(4, dtype=np.float64)
+    M[:3, :3] = scale * R
+    M[:3, 3] = translation
+    return M
+
+
+def _estimate_scale(canonical: np.ndarray, runtime: np.ndarray, weights: np.ndarray) -> float:
+    """Return the similarity scale that maps `canonical` onto `runtime`.
+
+    The scale is read off as the Euclidean norm of the first column of the
+    upper-left 3x3 of the solved matrix (geometry_pipeline.cc::EstimateScale).
+    """
+    transform = _solve_weighted_orthogonal_problem(canonical, runtime, weights)
+    first_column = transform[:3, 0]
+    return float(np.linalg.norm(first_column))
+
+
+def solve_facial_transformation_matrix(
+    landmarks_normalized: np.ndarray,
+    canonical_vertices: np.ndarray,
+    procrustes_indices: np.ndarray,
+    procrustes_weights: np.ndarray,
+    image_width: int,
+    image_height: int,
+    # face_geometry_calculator_options.pbtxt defaults
+    vertical_fov_degrees: float = 63.0,
+    near: float = 1.0,
+) -> np.ndarray:
+    """Solve the 4x4 facial transformation matrix with two-pass scale recovery.
+
+    `landmarks_normalized` is (N, 3) in MediaPipe normalized convention: x, y
+    in [0,1] with TOP-LEFT origin, z in width-scaled units. Only the rows
+    picked by `procrustes_indices` take part in the fit. The solved matrix is
+    returned as float32.
+    """
+    # Extent of the near clipping plane implied by the vertical field of view.
+    near_height = 2.0 * near * math.tan(0.5 * math.radians(vertical_fov_degrees))
+    near_width = image_width * near_height / image_height
+
+    picked = procrustes_indices.astype(np.int64)
+    weights = procrustes_weights.astype(np.float64)
+    canon = canonical_vertices[picked].T.astype(np.float64).copy()
+    screen = landmarks_normalized[picked].T.astype(np.float64).copy()
+
+    # ProjectXY (TOP_LEFT y-flip, then scale all 3 axes; z uses x-scale).
+    screen[1] = 1.0 - screen[1]
+    screen[0] = screen[0] * near_width - 0.5 * near_width
+    screen[1] = screen[1] * near_height - 0.5 * near_height
+    screen[2] = screen[2] * near_width
+    mean_depth = float(screen[2].mean())
+
+    def _to_metric(points: np.ndarray, scale: float) -> np.ndarray:
+        # Re-centre depth around the near plane, rescale it, then push x/y
+        # out along the viewing rays and flip z.
+        metric = points.copy()
+        metric[2] = (metric[2] - mean_depth + near) / scale
+        ratio = metric[2] / near
+        metric[0] *= ratio
+        metric[1] *= ratio
+        metric[2] *= -1.0
+        return metric
+
+    # 1st pass: Procrustes on projected XY with z merely sign-flipped.
+    flipped = screen.copy()
+    flipped[2] *= -1.0
+    coarse = _estimate_scale(canon, flipped, weights)
+    # 2nd pass: rescale z by the coarse estimate, un-project XY.
+    refine = _estimate_scale(canon, _to_metric(screen, coarse), weights)
+    solved = _solve_weighted_orthogonal_problem(canon, _to_metric(screen, coarse * refine), weights)
+    return solved.astype(np.float32)
+
+
+def transformation_matrix_from_detection(face_dict: dict, image_width: int, image_height: int, canonical_data: dict) -> np.ndarray:
+    """Convert a FaceLandmarker face dict to MP's normalized convention and solve.
+
+    FaceMesh emits (x, y, z) in 192-canonical units; MP's geometry expects
+    z_norm = z_canonical * scale_x / image_width. `face_dict` is read through
+    its "landmarks_xy" and "landmarks_3d" entries, `canonical_data` through
+    "canonical_vertices", "procrustes_indices" and "procrustes_weights".
+    """
+    xy = face_dict["landmarks_xy"]
+    xyz = face_dict["landmarks_3d"]
+    count = xy.shape[0]
+
+    # Least-squares fit from (x, y, 1) of the 3D landmarks to the image-space
+    # xy; the norm of the first solution row is used as the x scale.
+    design = np.concatenate([xyz[:, :2].astype(np.float64), np.ones((count, 1))], axis=1)
+    affine = np.linalg.lstsq(design, xy.astype(np.float64), rcond=None)[0]
+    scale_x = float(np.linalg.norm(affine[0]))
+    if scale_x > 1e-6:
+        z_scale = scale_x / image_width
+    else:
+        # Degenerate fit: fall back to a unit scale.
+        z_scale = 1.0 / image_width
+
+    normalized = np.empty((count, 3), dtype=np.float32)
+    normalized[:, 0] = xy[:, 0] / image_width
+    normalized[:, 1] = xy[:, 1] / image_height
+    normalized[:, 2] = xyz[:, 2] * z_scale
+    return solve_facial_transformation_matrix(
+        normalized,
+        canonical_data["canonical_vertices"],
+        canonical_data["procrustes_indices"],
+        canonical_data["procrustes_weights"],
+        image_width=image_width,
+        image_height=image_height,
+    )
--- a/comfy_extras/mediapipe/face_landmarker.py
+++ b/comfy_extras/mediapipe/face_landmarker.py
@ -0,0 +1,682 @@
+"""Pure-PyTorch port of MediaPipe's face_landmarker_v2_with_blendshapes.task:
+BlazeFace detector → FaceMesh v2 → ARKit-52 blendshapes."""
+
+from __future__ import annotations
+
+import math
+from functools import lru_cache
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from scipy.special import expit
+from torch import Tensor, nn
+
+
+# Values below must stay verbatim with the published face_landmarker_v2 graph
+
+# face_blendshapes_graph.cc::kLandmarksSubsetIdxs
+# 146 landmark indices in ascending order. FaceLandmarker.__init__ registers
+# them as the non-persistent `_bs_idx` buffer.
+_BS_INPUT_INDICES: Tuple[int, ...] = (
+    0, 1, 4, 5, 6, 7, 8, 10, 13, 14, 17, 21, 33, 37, 39, 40, 46, 52, 53, 54,
+    55, 58, 61, 63, 65, 66, 67, 70, 78, 80, 81, 82, 84, 87, 88, 91, 93, 95,
+    103, 105, 107, 109, 127, 132, 133, 136, 144, 145, 146, 148, 149, 150, 152,
+    153, 154, 155, 157, 158, 159, 160, 161, 162, 163, 168, 172, 173, 176, 178,
+    181, 185, 191, 195, 197, 234, 246, 249, 251, 263, 267, 269, 270, 276, 282,
+    283, 284, 285, 288, 291, 293, 295, 296, 297, 300, 308, 310, 311, 312, 314,
+    317, 318, 321, 323, 324, 332, 334, 336, 338, 356, 361, 362, 365, 373, 374,
+    375, 377, 378, 379, 380, 381, 382, 384, 385, 386, 387, 388, 389, 390, 397,
+    398, 400, 402, 405, 409, 415, 454, 466, 468, 469, 470, 471, 472, 473, 474,
+    475, 476, 477,
+)
+
+# face_blendshapes_graph.cc::kCategoryNames
+# 52 names; index 0 is the "_neutral" entry.
+# NOTE(review): presumably index-aligned with the 52 outputs of
+# FaceBlendshapes.forward — confirm against the caller.
+BLENDSHAPE_NAMES: Tuple[str, ...] = (
+    "_neutral", "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft",
+    "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight",
+    "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight",
+    "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight",
+    "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight",
+    "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen",
+    "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight",
+    "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft",
+    "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft",
+    "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower",
+    "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft",
+    "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight",
+    "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight",
+)
+
+# face_detection.pbtxt — short-range BlazeFace.
+# The anchor-related values below are consumed by `_blazeface_anchors`, the
+# decode-related ones by `_decode_blazeface`.
+_BF_NUM_LAYERS = 4
+_BF_INPUT_SIZE = 128
+_BF_STRIDES = (8, 16, 16, 16)  # one entry per layer; consecutive equal strides share a grid
+_BF_ANCHOR_OFFSET_X = 0.5  # anchor centre offset inside a cell, in cell units
+_BF_ANCHOR_OFFSET_Y = 0.5
+_BF_ASPECT_RATIOS = (1.0,)
+_BF_INTERP_SCALE_AR = 1.0  # > 0 adds one extra anchor per layer
+_BF_BOX_SCALE = 128.0  # raw regressor values are divided by this
+_BF_KP_OFFSET = 4  # column of the first keypoint value in a regressor row
+_BF_SCORE_CLIP = 100.0  # logits are clipped to ±this before the sigmoid
+_BF_MIN_SCORE = 0.5  # default score threshold of both decoders
+
+# face_detection_full_range.pbtxt — 48x48 grid at stride 4, 1 anchor/cell.
+_BF_FR_INPUT_SIZE = 192
+_BF_FR_GRID = 48
+_BF_FR_NUM_ANCHORS = _BF_FR_GRID * _BF_FR_GRID
+_BF_FR_BOX_SCALE = 192.0
+_BF_FR_SCORE_CLIP = 100.0
+
+# Default side length of the square crop produced by `_warp_face_crop`.
+_FM_INPUT_SIZE = 192
+
+# Face ROI: 1.5xbbox rect warped anisotropically into 192x192.
+# The eye keypoint indices pick the two detection keypoints whose connecting
+# line sets the ROI rotation in `_detection_to_face_rect`.
+_FACE_LEFT_EYE_KP = 0
+_FACE_RIGHT_EYE_KP = 1
+_FACE_ROI_SCALE_X = 1.5
+_FACE_ROI_SCALE_Y = 1.5
+_FACE_ROI_TARGET_ANGLE = 0.0
+
+
+def _tf_same_pad(x: Tensor, kernel: int, stride: int) -> Tensor:
+    """Zero-pad the last two dims the way TensorFlow's SAME padding does.
+
+    The padding is asymmetric on stride-2 (the extra pixel goes to the
+    bottom/right); PyTorch's symmetric pad undershoots by 1 px. The input is
+    returned untouched when no padding is needed.
+    """
+    def _total_pad(size: int) -> int:
+        out_size = (size + stride - 1) // stride
+        return max((out_size - 1) * stride + kernel - size, 0)
+
+    pad_h = _total_pad(x.shape[-2])
+    pad_w = _total_pad(x.shape[-1])
+    if not (pad_h or pad_w):
+        return x
+    top, left = pad_h // 2, pad_w // 2
+    return F.pad(x, (left, pad_w - left, top, pad_h - top))
+
+
+# BlazeFace short-range: stem 5x5/s2 → 16 BlazeBlocks → parallel heads at
+# 16²x88 (2 anchors/cell) and 8²x96 (6/cell) = 896 anchors. (in, out, stride):
+# BlazeFace.__init__ builds one BlazeFaceBlock per entry, in this order, and
+# BlazeFace.forward taps the feature map after the first 11 entries.
+_BLAZEFACE_BLOCKS = [
+    (24, 24, 1), (24, 28, 1), (28, 32, 2), (32, 36, 1),
+    (36, 42, 1), (42, 48, 2), (48, 56, 1), (56, 64, 1),
+    (64, 72, 1), (72, 80, 1), (80, 88, 1), (88, 96, 2),
+    (96, 96, 1), (96, 96, 1), (96, 96, 1), (96, 96, 1),
+]
+
+
+class BlazeFaceBlock(nn.Module):
+    """Depthwise 3x3 + pointwise 1x1 with a residual shortcut and final ReLU.
+
+    The shortcut is max-pooled when stride > 1 and zero-padded along the
+    channel axis when out_ch > in_ch.
+    """
+
+    def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.in_ch = in_ch
+        self.out_ch = out_ch
+        self.stride = stride
+        # Use plain torch.nn layers unless the caller injects its own namespace.
+        layers = nn if operations is None else operations
+        factory = dict(bias=True, device=device, dtype=dtype)
+        self.depthwise = layers.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, **factory)
+        self.pointwise = layers.Conv2d(in_ch, out_ch, 1, padding=0, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        downsample = self.stride > 1
+        shortcut = F.max_pool2d(x, 2, 2) if downsample else x
+        extra_channels = self.out_ch - self.in_ch
+        if extra_channels > 0:
+            # Append zero channels so the shortcut matches the pointwise output.
+            shortcut = F.pad(shortcut, (0, 0, 0, 0, 0, extra_channels))
+        if downsample:
+            padded = _tf_same_pad(x, 3, self.stride)
+        else:
+            padded = F.pad(x, (1, 1, 1, 1))
+        return F.relu(self.pointwise(self.depthwise(padded)) + shortcut)
+
+
+class BlazeFace(nn.Module):
+    """Short-range BlazeFace: (B, 3, 128, 128) in [-1, 1] → 896 anchors x 17.
+
+    `forward` returns the raw head outputs as `(reg, cls)`: 16 regression
+    values and 1 classification logit per anchor. Turning them into boxes and
+    scores is left to `_decode_blazeface`.
+    """
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Fall back to plain torch.nn layers when no `operations` namespace is supplied.
+        ops = operations if operations is not None else nn
+        kw = dict(device=device, dtype=dtype)
+        # padding=0 here: forward applies the TF-style SAME padding explicitly.
+        self.stem = ops.Conv2d(3, 24, 5, stride=2, padding=0, bias=True, **kw)
+        self.blocks = nn.ModuleList(BlazeFaceBlock(i, o, s, device=device, dtype=dtype, operations=operations)
+                                    for (i, o, s) in _BLAZEFACE_BLOCKS)
+        # 16²x2 + 8²x6 = 512 + 384 = 896 anchors.
+        # cls heads emit 1 value per anchor, reg heads 16 values per anchor.
+        self.cls_16 = ops.Conv2d(88, 2, 1, padding=0, bias=True, **kw)
+        self.cls_8 = ops.Conv2d(96, 6, 1, padding=0, bias=True, **kw)
+        self.reg_16 = ops.Conv2d(88, 32, 1, padding=0, bias=True, **kw)
+        self.reg_8 = ops.Conv2d(96, 96, 1, padding=0, bias=True, **kw)
+
+    def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        """Run the backbone and both head pairs; returns `(reg, cls)` with the
+        16x16-tap anchors first and the 8x8-tap anchors after them."""
+        x = F.relu(self.stem(_tf_same_pad(image_chw_normalized, 5, 2)))
+        # 16x16 tap is block-10 output (before the 88→96 stride-2 in block 11).
+        for i in range(11):
+            x = self.blocks[i](x)
+        feat_16 = x
+        for i in range(11, 16):
+            x = self.blocks[i](x)
+        feat_8 = x
+
+        def flat(t, a, k):  # NHWC flatten → (B, H*W*A, K)
+            B, _, H, W = t.shape
+            return t.permute(0, 2, 3, 1).reshape(B, H * W * a, k)
+
+        # Concatenate along the anchor axis in the same order for cls and reg.
+        cls = torch.cat([flat(self.cls_16(feat_16), 2, 1), flat(self.cls_8(feat_8), 6, 1)], dim=1)
+        reg = torch.cat([flat(self.reg_16(feat_16), 2, 16), flat(self.reg_8(feat_8), 6, 16)], dim=1)
+        return reg, cls
+
+
+# BlazeFace full-range (face_detection_full_range_sparse.tflite): MobileNetV2-ish
+# backbone + top-down FPN, 192² input → 2304 anchors at the 48x48 grid.
+class FRBlock(nn.Module):
+    """Double inverted residual: DW → PW(mid) → DW → PW(out) [+ residual].
+
+    Per source tflite: the depthwise convs have no fused activation, pw1 is
+    always followed by ReLU, and the block output is ReLU'd either directly
+    (no residual) or after the residual add. The residual is only used when
+    the block keeps both channel count and resolution.
+    """
+
+    def __init__(self, in_ch: int, mid_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Use plain torch.nn layers unless the caller injects its own namespace.
+        layers = nn if operations is None else operations
+        conv_kw = dict(padding=0, bias=True, device=device, dtype=dtype)
+        self.has_residual = stride == 1 and in_ch == out_ch
+        self.dw1 = layers.Conv2d(in_ch, in_ch, 3, stride=stride, groups=in_ch, **conv_kw)
+        self.pw1 = layers.Conv2d(in_ch, mid_ch, 1, **conv_kw)
+        self.dw2 = layers.Conv2d(mid_ch, mid_ch, 3, stride=1, groups=mid_ch, **conv_kw)
+        self.pw2 = layers.Conv2d(mid_ch, out_ch, 1, **conv_kw)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # Both depthwise convs get an explicit symmetric 1-pixel zero pad.
+        hidden = F.relu(self.pw1(self.dw1(F.pad(x, (1, 1, 1, 1)))))
+        out = self.pw2(self.dw2(F.pad(hidden, (1, 1, 1, 1))))
+        if self.has_residual:
+            out = out + x
+        return F.relu(out)
+
+
+# (in_ch, mid_ch, out_ch, stride). Stages downsample 96²x32 → 48²x64 → 24²x128
+# → 12²x192 → 6²x384. Lateral taps at indices 4, 7, 10 (see _FR_LATERAL_*).
+# Each tuple becomes one FRBlock in BlazeFaceFullRange.backbone, in list order.
+_FR_BACKBONE_BLOCKS = [
+    (32, 8, 32, 1),    (32, 8, 32, 1),                                            # 96²x32
+    (32, 16, 64, 2),   (64, 16, 64, 1),   (64, 16, 64, 1),                        # 48²x64 — tap[0]
+    (64, 32, 128, 2),  (128, 32, 128, 1), (128, 32, 128, 1),                      # 24²x128 — tap[1]
+    (128, 48, 192, 2), (192, 48, 192, 1), (192, 48, 192, 1),                      # 12²x192 — tap[2]
+    (192, 96, 384, 2), (384, 96, 384, 1), (384, 96, 384, 1), (384, 96, 384, 1),   # 6²x384
+]
+# Backbone block indices whose outputs are kept as lateral (skip) features.
+_FR_LATERAL_TAP_INDICES = (4, 7, 10)
+_FR_LATERAL_CHANNELS = ((64, 48), (128, 64), (192, 96))  # (in, out) per side-conv
+
+# Decoder blocks per FPN level (after upsample-and-merge with the lateral).
+# Listed coarsest level first, which is the order BlazeFaceFullRange.forward
+# walks them in.
+_FR_DECODER_BLOCKS = [
+    [(96, 48, 96, 1), (96, 48, 96, 1)],  # 12²x96
+    [(64, 32, 64, 1), (64, 32, 64, 1)],  # 24²x64
+    [(48, 24, 48, 1)],                   # 48²x48 — feeds the heads
+]
+
+
+def _dcr_depth_to_space(t: Tensor, r: int, c_out: int) -> Tensor:
+    """Depth-to-space with TF's DCR channel layout.
+
+    Input channels are ordered (i, j, c_out), so channel `(i * r + j) * c_out + c`
+    at cell (h, w) lands on output channel `c` at pixel (h * r + i, w * r + j).
+    torch's pixel_shuffle uses CRD instead, which permutes the output channels
+    for c_out > 1.
+    """
+    batch, height, width = t.shape[0], t.shape[2], t.shape[3]
+    # Split channels into (row offset, col offset, out channel) ...
+    blocks = t.reshape(batch, r, r, c_out, height, width)
+    # ... and interleave the offsets with the spatial axes.
+    interleaved = blocks.permute(0, 3, 4, 1, 5, 2).contiguous()
+    return interleaved.reshape(batch, c_out, height * r, width * r)
+
+
+class BlazeFaceFullRange(nn.Module):
+    """Full-range face detector: (B, 3, 192, 192) in [-1, 1] → 2304 anchors x 17 values.
+
+    `forward` returns the raw head outputs as `(reg, cls)`: 16 regression
+    values and 1 classification logit per anchor. Decoding into boxes and
+    scores is left to `_decode_blazeface_full_range`.
+    """
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Fall back to plain torch.nn layers when no `operations` namespace is supplied.
+        ops = operations if operations is not None else nn
+        kw = dict(device=device, dtype=dtype)
+        mk_block = lambda i, m, o, s: FRBlock(i, m, o, s, device=device, dtype=dtype, operations=operations)
+        self.stem = ops.Conv2d(3, 32, 3, stride=2, padding=0, bias=True, **kw)
+        self.backbone = nn.ModuleList(mk_block(i, m, o, s) for (i, m, o, s) in _FR_BACKBONE_BLOCKS)
+        # One 1x1 side-conv per lateral tap, stored finest level first.
+        self.lateral_convs = nn.ModuleList(ops.Conv2d(i, o, 1, padding=0, bias=True, **kw) for (i, o) in _FR_LATERAL_CHANNELS)
+        # Projects the deepest backbone feature (384 ch) to the decoder width (96 ch).
+        self.top_conv = ops.Conv2d(384, 96, 1, padding=0, bias=True, **kw)
+        self.decoder_levels = nn.ModuleList(
+            nn.ModuleList(mk_block(i, m, o, s) for (i, m, o, s) in lvl) for lvl in _FR_DECODER_BLOCKS
+        )
+        # 96→64 before 12→24, 64→48 before 24→48.
+        self.decoder_reduce_convs = nn.ModuleList([
+            ops.Conv2d(96, 64, 1, padding=0, bias=True, **kw),
+            ops.Conv2d(64, 48, 1, padding=0, bias=True, **kw),
+        ])
+        # Heads mix 2x2-cell info via DW-stride-2 + depth_to_space block_size=2.
+        self.cls_conv = ops.Conv2d(48, 4, 1, padding=0, bias=True, **kw)
+        self.cls_dw = ops.Conv2d(4, 4, 3, stride=2, padding=0, groups=4, bias=True, **kw)
+        self.reg_conv = ops.Conv2d(48, 64, 1, padding=0, bias=True, **kw)
+        self.reg_dw = ops.Conv2d(64, 64, 3, stride=2, padding=0, groups=64, bias=True, **kw)
+
+    def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        """Backbone → top-down decoder → heads. Returns `(reg_out, cls_out)`
+        reshaped to (B, _BF_FR_NUM_ANCHORS, 16) and (B, _BF_FR_NUM_ANCHORS, 1)."""
+        # Symmetric pad-1 throughout (full-range tflite uses explicit TF PAD, not SAME).
+        x = F.relu(self.stem(F.pad(image_chw_normalized, (1, 1, 1, 1))))
+        tap_set = set(_FR_LATERAL_TAP_INDICES)
+        laterals: list[Tensor] = []
+        for i, blk in enumerate(self.backbone):
+            x = blk(x)
+            if i in tap_set:
+                laterals.append(x)
+
+        # top_conv / lateral_convs / decoder_reduce_convs all have fused ReLU in the tflite.
+        p = F.relu(self.top_conv(x))
+        # Laterals were collected finest-first; the decoder consumes them coarsest-first.
+        laterals_rev = list(reversed(laterals))
+        lateral_convs_rev = list(reversed(self.lateral_convs))
+        for level in range(len(self.decoder_levels)):
+            lateral = laterals_rev[level]
+            # Upsample to the lateral's resolution, then merge by addition.
+            p = F.interpolate(p, size=lateral.shape[-2:], mode="bilinear", align_corners=False)
+            p = p + F.relu(lateral_convs_rev[level](lateral))
+            for blk in self.decoder_levels[level]:
+                p = blk(p)
+            # Only the first len(decoder_reduce_convs) levels are followed by a channel reduction.
+            if level < len(self.decoder_reduce_convs):
+                p = F.relu(self.decoder_reduce_convs[level](p))
+
+        # Each head: 1x1 conv → padded depthwise stride-2 → depth_to_space(2),
+        # which restores the spatial size halved by the stride-2 conv.
+        c = self.cls_dw(F.pad(self.cls_conv(p), (1, 1, 1, 1)))
+        c = _dcr_depth_to_space(c, r=2, c_out=1)
+        r = self.reg_dw(F.pad(self.reg_conv(p), (1, 1, 1, 1)))
+        r = _dcr_depth_to_space(r, r=2, c_out=16)
+        B = c.shape[0]
+        # NCHW → NHWC so anchors are flattened row-major over the grid.
+        cls_out = c.permute(0, 2, 3, 1).reshape(B, _BF_FR_NUM_ANCHORS, 1)
+        reg_out = r.permute(0, 2, 3, 1).reshape(B, _BF_FR_NUM_ANCHORS, 16)
+        return reg_out, cls_out
+
+
+@lru_cache(maxsize=1)
+def _blazeface_full_range_anchors() -> np.ndarray:
+    """Build the (2304, 4) float32 anchor table `[cx, cy, 1, 1]` for the 48x48 grid.
+
+    Anchors are cell centres in [0, 1], enumerated row-major; width and height
+    are fixed to 1 (fixed_anchor_size). The result is cached after the first call.
+    """
+    grid = _BF_FR_GRID
+    centres = (np.arange(grid, dtype=np.float32) + 0.5) / grid
+    cy, cx = np.meshgrid(centres, centres, indexing="ij")
+    unit = np.ones_like(cx)
+    table = np.stack([cx, cy, unit, unit], axis=-1)
+    return table.reshape(_BF_FR_NUM_ANCHORS, 4)
+
+
+def _decode_blazeface_full_range(regressors: np.ndarray, classificators: np.ndarray,
+                                 score_thresh: float = _BF_MIN_SCORE) -> np.ndarray:
+    """Decode full-range head outputs into (N, 17) rows `[xyxy, 6 keypoints, score]`.
+
+    Same decode as short-range, but with the 2304-anchor grid and
+    box_scale=192. Anchors scoring below `score_thresh` are dropped; an empty
+    (0, 17) array is returned when none survive.
+    """
+    logits = np.clip(classificators[:, 0], -_BF_FR_SCORE_CLIP, _BF_FR_SCORE_CLIP)
+    scores = expit(logits)
+    keep = scores >= score_thresh
+    if not keep.any():
+        return np.empty((0, 17), dtype=np.float32)
+
+    raw = regressors[keep] / _BF_FR_BOX_SCALE
+    anchors = _blazeface_full_range_anchors()[keep]
+    anchor_cx, anchor_cy = anchors[:, 0:1], anchors[:, 1:2]
+    anchor_w, anchor_h = anchors[:, 2:3], anchors[:, 3:4]
+
+    # Box centre and half extent, relative to the anchor.
+    centre_x = raw[:, 0:1] * anchor_w + anchor_cx
+    centre_y = raw[:, 1:2] * anchor_h + anchor_cy
+    half_w = raw[:, 2:3] * anchor_w / 2
+    half_h = raw[:, 3:4] * anchor_h / 2
+
+    decoded = np.empty((raw.shape[0], 17), dtype=np.float32)
+    decoded[:, 0:1] = centre_x - half_w
+    decoded[:, 1:2] = centre_y - half_h
+    decoded[:, 2:3] = centre_x + half_w
+    decoded[:, 3:4] = centre_y + half_h
+    # Keypoints are interleaved x, y pairs starting at column _BF_KP_OFFSET.
+    decoded[:, 4:16:2] = raw[:, _BF_KP_OFFSET::2] * anchor_w + anchor_cx
+    decoded[:, 5:16:2] = raw[:, _BF_KP_OFFSET + 1::2] * anchor_h + anchor_cy
+    decoded[:, 16] = scores[keep]
+    return decoded
+
+
+# FaceMesh (face_landmarks_detector.tflite): PReLU variant of BlazeBlock,
+# 17 blocks, heads for 478x3 landmarks + presence.
+# FaceMesh.__init__ builds one FaceMeshBlock per entry, in this order.
+_FACEMESH_BLOCKS = [  # (in_ch, out_ch, stride)
+    (16, 16, 1),  (16, 16, 1),  (16, 32, 2),  (32, 32, 1), (32, 32, 1), (32, 64, 2),
+    (64, 64, 1),  (64, 64, 1),  (64, 128, 2), (128, 128, 1), (128, 128, 1), (128, 128, 2),
+    (128, 128, 1), (128, 128, 1), (128, 128, 2), (128, 128, 1), (128, 128, 1),
+]
+
+
+class FaceMeshBlock(nn.Module):
+    """BlazeBlock with PReLU activations.
+
+    Depthwise 3x3 → PReLU → pointwise 1x1, added to the shortcut, then a
+    second PReLU. The shortcut is max-pooled when stride > 1 and zero-padded
+    along the channel axis when out_ch > in_ch.
+    """
+
+    def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.in_ch = in_ch
+        self.out_ch = out_ch
+        self.stride = stride
+        # Use plain torch.nn layers unless the caller injects its own namespace.
+        layers = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.depthwise = layers.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **factory)
+        self.prelu_dwise = nn.PReLU(num_parameters=in_ch, **factory)
+        self.pointwise = layers.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, **factory)
+        self.prelu_out = nn.PReLU(num_parameters=out_ch, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        downsample = self.stride > 1
+        shortcut = F.max_pool2d(x, 2, 2) if downsample else x
+        extra_channels = self.out_ch - self.in_ch
+        if extra_channels > 0:
+            # Append zero channels so the shortcut matches the pointwise output.
+            shortcut = F.pad(shortcut, (0, 0, 0, 0, 0, extra_channels))
+        if downsample:
+            padded = _tf_same_pad(x, 3, self.stride)
+        else:
+            padded = F.pad(x, (1, 1, 1, 1))
+        branch = self.pointwise(self.prelu_dwise(self.depthwise(padded)))
+        return self.prelu_out(branch + shortcut)
+
+
+class FaceMesh(nn.Module):
+    """Landmark network: PReLU stem → FaceMeshBlock stack → reduced head that
+    emits NUM_LANDMARKS x 3 landmark values and one presence value per sample."""
+
+    # Number of landmarks produced by `head_landmarks` (3 values each).
+    NUM_LANDMARKS = 478
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Fall back to plain torch.nn layers when no `operations` namespace is supplied.
+        ops = operations if operations is not None else nn
+        kw = dict(device=device, dtype=dtype)
+        # padding=0 here: forward applies the TF-style SAME padding explicitly.
+        self.stem = ops.Conv2d(3, 16, 3, stride=2, padding=0, bias=True, **kw)
+        self.prelu_stem = nn.PReLU(num_parameters=16, **kw)
+        self.blocks = nn.ModuleList(FaceMeshBlock(i, o, s, device=device, dtype=dtype, operations=operations)
+                                    for (i, o, s) in _FACEMESH_BLOCKS)
+        # Head: squeeze 128 → 8 channels, one more block, then two unpadded 3x3 convs.
+        self.head_reduce = ops.Conv2d(128, 8, 1, padding=0, bias=True, **kw)
+        self.prelu_head_reduce = nn.PReLU(num_parameters=8, **kw)
+        self.head_block = FaceMeshBlock(8, 8, 1, device=device, dtype=dtype, operations=operations)
+        self.head_presence = ops.Conv2d(8, 1, 3, padding=0, bias=True, **kw)
+        self.head_landmarks = ops.Conv2d(8, self.NUM_LANDMARKS * 3, 3, padding=0, bias=True, **kw)
+
+    def forward(self, face_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        """(B, 3, 192, 192) in [0, 1] → ((B, 478, 3) landmarks in 192-canonical, (B,) presence)."""
+        x = self.prelu_stem(self.stem(_tf_same_pad(face_chw_normalized, 3, 2)))
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.prelu_head_reduce(self.head_reduce(x))
+        x = self.head_block(x)
+        B = x.shape[0]
+        # The reshapes below require each head to emit exactly one spatial
+        # position per sample; the presence value is returned as-is (no sigmoid here).
+        presence = self.head_presence(x).reshape(B)
+        lmks = self.head_landmarks(x).reshape(B, self.NUM_LANDMARKS, 3)
+        return lmks, presence
+
+
+# FaceBlendshapes (MLP-Mixer "GhumMarkerPoserMlpMixerGeneral"):
+# 146x2 → token-reduce 146→96 → embed 2→64 → +cls token → 4x mixer → cls→52.
+_BS_NUM_INPUT_LANDMARKS = 146  # input width of FaceBlendshapes.token_reduce
+_BS_NUM_TOKENS_REDUCED = 96  # output width of FaceBlendshapes.token_reduce
+_BS_NUM_TOKENS = 97  # +1 cls
+_BS_TOKEN_DIM = 64  # per-token feature width
+_BS_TOKEN_MIX_HIDDEN = 384  # hidden width of the token-mixing MLP
+_BS_CHANNEL_MIX_HIDDEN = 256  # hidden width of the channel-mixing MLP
+_BS_NUM_BLENDSHAPES = 52  # output width of FaceBlendshapes.head
+_BS_LN_EPS = 1e-6  # eps of both LayerNorms in MlpMixerBlock
+
+
+class MlpMixerBlock(nn.Module):
+    """One MLP-Mixer block operating on (B, num_tokens, token_dim) input.
+
+    A token-mixing MLP (applied across tokens) is followed by a channel-mixing
+    MLP (applied across features). Both are pre-LayerNorm and residual. The
+    LayerNorms carry no beta (bias=False) to match MP.
+    """
+
+    def __init__(self, num_tokens: int, token_dim: int, token_hidden: int, channel_hidden: int,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        # Use plain torch.nn layers unless the caller injects its own namespace.
+        layers = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        # bias=False → no LN beta (matches MP).
+        norm_kw = dict(eps=_BS_LN_EPS, bias=False, **factory)
+        self.ln1 = layers.LayerNorm(token_dim, **norm_kw)
+        self.ln2 = layers.LayerNorm(token_dim, **norm_kw)
+        self.token_mlp1 = layers.Linear(num_tokens, token_hidden, bias=True, **factory)
+        self.token_mlp2 = layers.Linear(token_hidden, num_tokens, bias=True, **factory)
+        self.channel_mlp1 = layers.Linear(token_dim, channel_hidden, bias=True, **factory)
+        self.channel_mlp2 = layers.Linear(channel_hidden, token_dim, bias=True, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # Token mixing: swap the token axis last so the Linear layers act on it.
+        tokens_last = self.ln1(x).transpose(1, 2)
+        token_mix = self.token_mlp2(F.relu(self.token_mlp1(tokens_last)))
+        x = x + token_mix.transpose(1, 2)
+        # Channel mixing acts on the feature axis directly.
+        hidden = F.relu(self.channel_mlp1(self.ln2(x)))
+        return x + self.channel_mlp2(hidden)
+
+
+class FaceBlendshapes(nn.Module):
+    """MLP-Mixer head turning 2D landmarks into sigmoid-activated blendshape scores."""
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Use plain torch.nn layers unless the caller injects its own namespace.
+        layers = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.token_reduce = layers.Linear(_BS_NUM_INPUT_LANDMARKS, _BS_NUM_TOKENS_REDUCED, bias=True, **factory)
+        self.token_embed = layers.Linear(2, _BS_TOKEN_DIM, bias=True, **factory)
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, _BS_TOKEN_DIM, **factory))
+        mixers = [
+            MlpMixerBlock(_BS_NUM_TOKENS, _BS_TOKEN_DIM, _BS_TOKEN_MIX_HIDDEN, _BS_CHANNEL_MIX_HIDDEN,
+                          device=device, dtype=dtype, operations=operations)
+            for _ in range(4)
+        ]
+        self.blocks = nn.ModuleList(mixers)
+        self.head = layers.Linear(_BS_TOKEN_DIM, _BS_NUM_BLENDSHAPES, bias=True, **factory)
+
+    @staticmethod
+    def _input_normalize(landmarks_2d: Tensor) -> Tensor:
+        # Centroid-subtract → divide by the mean point distance → x0.5.
+        # The 0.5 is baked into training.
+        offsets = landmarks_2d - landmarks_2d.mean(dim=1, keepdim=True)
+        distances = torch.sqrt((offsets * offsets).sum(dim=-1, keepdim=True))
+        mean_distance = distances.mean(dim=1, keepdim=True)
+        # The clamp guards against division by zero when all points coincide.
+        return (offsets / mean_distance.clamp(min=1e-12)) * 0.5
+
+    def forward(self, landmarks_2d: Tensor) -> Tensor:
+        """(B, 146, 2) → (B, 52) in [0, 1]. Input units don't matter (centroid + L2 normalize)."""
+        tokens = self._input_normalize(landmarks_2d)
+        # Reduce along the landmark axis, then embed each reduced token.
+        tokens = self.token_reduce(tokens.transpose(1, 2)).transpose(1, 2)
+        tokens = self.token_embed(tokens)
+        # Prepend the learned cls token; only its final state feeds the head.
+        cls = self.cls_token.expand(tokens.shape[0], -1, -1)
+        tokens = torch.cat([cls, tokens], dim=1)
+        for mixer in self.blocks:
+            tokens = mixer(tokens)
+        return torch.sigmoid(self.head(tokens[:, 0]))
+
+
+@lru_cache(maxsize=1)
+def _blazeface_anchors() -> np.ndarray:
+    """Build the (896, 4) anchor table `[cx, cy, 1, 1]` per SsdAnchorsCalculator.
+
+    Consecutive layers sharing a stride are merged onto one grid, and every
+    cell's anchor is repeated once per anchor assigned to that cell
+    (fixed_anchor_size → anchor_w=anchor_h=1). The result is cached.
+    """
+    anchors_per_layer = len(_BF_ASPECT_RATIOS) + (1 if _BF_INTERP_SCALE_AR > 0 else 0)
+    chunks: List[np.ndarray] = []
+    start = 0
+    while start < _BF_NUM_LAYERS:
+        stride = _BF_STRIDES[start]
+        # Extend the run over every following layer with the same stride.
+        stop = start + 1
+        while stop < _BF_NUM_LAYERS and _BF_STRIDES[stop] == stride:
+            stop += 1
+        repeats = anchors_per_layer * (stop - start)
+        grid = (_BF_INPUT_SIZE + stride - 1) // stride
+        index = np.arange(grid, dtype=np.float32)
+        rows, cols = np.meshgrid(index, index, indexing="ij")
+        cx = (cols + _BF_ANCHOR_OFFSET_X) / grid
+        cy = (rows + _BF_ANCHOR_OFFSET_Y) / grid
+        unit = np.ones_like(cols)
+        cells = np.stack([cx, cy, unit, unit], axis=-1).reshape(-1, 4)
+        chunks.append(np.repeat(cells, repeats, axis=0))
+        start = stop
+    out = np.concatenate(chunks, axis=0)
+    assert out.shape == (896, 4), out.shape
+    return out
+
+
+def _decode_blazeface(regressors: np.ndarray, classificators: np.ndarray,
+                      score_thresh: float = _BF_MIN_SCORE) -> np.ndarray:
+    """Decode (regs (896,16), cls (896,1)) → (N, 17) = [xyxy, kp0x..kp5y, score] in [0, 1].
+
+    Anchors scoring below `score_thresh` are dropped; an empty (0, 17) array
+    is returned when none survive.
+    """
+    logits = np.clip(classificators[:, 0], -_BF_SCORE_CLIP, _BF_SCORE_CLIP)
+    scores = expit(logits)
+    keep = scores >= score_thresh
+    if not keep.any():
+        return np.empty((0, 17), dtype=np.float32)
+
+    raw = regressors[keep] / _BF_BOX_SCALE
+    anchors = _blazeface_anchors()[keep]  # (N, 4) cx, cy, 1, 1
+    anchor_cx, anchor_cy = anchors[:, 0:1], anchors[:, 1:2]
+    anchor_w, anchor_h = anchors[:, 2:3], anchors[:, 3:4]
+
+    # Box centre and half extent, relative to the anchor.
+    centre_x = raw[:, 0:1] * anchor_w + anchor_cx
+    centre_y = raw[:, 1:2] * anchor_h + anchor_cy
+    half_w = raw[:, 2:3] * anchor_w / 2
+    half_h = raw[:, 3:4] * anchor_h / 2
+
+    decoded = np.empty((raw.shape[0], 17), dtype=np.float32)
+    decoded[:, 0:1] = centre_x - half_w
+    decoded[:, 1:2] = centre_y - half_h
+    decoded[:, 2:3] = centre_x + half_w
+    decoded[:, 3:4] = centre_y + half_h
+    # Keypoints are interleaved x, y pairs starting at column _BF_KP_OFFSET.
+    decoded[:, 4:16:2] = raw[:, _BF_KP_OFFSET::2] * anchor_w + anchor_cx
+    decoded[:, 5:16:2] = raw[:, _BF_KP_OFFSET + 1::2] * anchor_h + anchor_cy
+    decoded[:, 16] = scores[keep]
+    return decoded
+
+
+def _weighted_nms(detections: np.ndarray, iou_thresh: float = 0.5) -> np.ndarray:
+    """MP weighted NMS over (N, 17) rows `[xyxy, keypoints, score]`.
+
+    Detections are visited in descending score order. Each unclaimed detection
+    gathers every later unclaimed detection whose IoU with it exceeds
+    `iou_thresh`; the kept row carries the leader's score and the
+    score-weighted average of the cluster's first 16 columns. An empty input
+    is returned unchanged.
+    """
+    if detections.shape[0] == 0:
+        return detections
+
+    ranked = detections[np.argsort(-detections[:, 16])]
+    count = ranked.shape[0]
+    widths = np.clip(ranked[:, 2] - ranked[:, 0], 0, None)
+    heights = np.clip(ranked[:, 3] - ranked[:, 1], 0, None)
+    areas = widths * heights
+
+    claimed = np.zeros(count, dtype=bool)
+    merged_rows: List[np.ndarray] = []
+    for lead in range(count):
+        if claimed[lead]:
+            continue
+        claimed[lead] = True
+        lx1, ly1, lx2, ly2 = ranked[lead, 0:4]
+        members = [lead]
+        for other in range(lead + 1, count):
+            if claimed[other]:
+                continue
+            ox1, oy1, ox2, oy2 = ranked[other, 0:4]
+            overlap_w = max(0.0, min(lx2, ox2) - max(lx1, ox1))
+            overlap_h = max(0.0, min(ly2, oy2) - max(ly1, oy1))
+            inter = overlap_w * overlap_h
+            union = areas[lead] + areas[other] - inter
+            if union > 0 and inter / union > iou_thresh:  # strict > matches MP
+                members.append(other)
+                claimed[other] = True
+
+        group = ranked[members]
+        scores = group[:, 16:17]
+        score_total = scores.sum()
+        row = np.copy(group[0])
+        if score_total > 0:
+            # Average box and keypoints; column 16 keeps the leader's score.
+            row[:16] = (group[:, :16] * scores).sum(axis=0) / score_total
+        merged_rows.append(row)
+
+    if not merged_rows:
+        return np.empty((0, 17), dtype=np.float32)
+    return np.stack(merged_rows, axis=0)
+
+
+def _detection_to_face_rect(detection: np.ndarray, image_w: int, image_h: int) -> Tuple[float, float, float, float, float]:
+    """Detection (normalized) → rotated 1.5xbbox ROI in image pixels (anisotropic).
+
+    Returns `(cx, cy, width, height, angle)`: the box centre, the box size
+    scaled by the ROI factors, and the rotation derived from the line between
+    the two eye keypoints.
+    """
+    def _keypoint_px(index: int) -> Tuple[float, float]:
+        # Keypoints follow the 4 box values as interleaved (x, y) pairs.
+        base = 4 + index * 2
+        return detection[base + 0] * image_w, detection[base + 1] * image_h
+
+    xmin, ymin, xmax, ymax = detection[0:4]
+    left_x, left_y = _keypoint_px(_FACE_LEFT_EYE_KP)
+    right_x, right_y = _keypoint_px(_FACE_RIGHT_EYE_KP)
+    # Image-y-down convention: angle = target - atan2(-dy, dx).
+    angle = _FACE_ROI_TARGET_ANGLE - math.atan2(left_y - right_y, right_x - left_x)
+
+    centre_x = (xmin + xmax) * 0.5 * image_w
+    centre_y = (ymin + ymax) * 0.5 * image_h
+    roi_w = (xmax - xmin) * image_w * _FACE_ROI_SCALE_X
+    roi_h = (ymax - ymin) * image_h * _FACE_ROI_SCALE_Y
+    return (float(centre_x), float(centre_y), float(roi_w), float(roi_h), float(angle))
+
+
+def _sample_warp(image_chw: Tensor, src_x: Tensor, src_y: Tensor, padding_mode: str) -> Tensor:
+    """Bilinearly sample `image_chw` at pixel coordinates (src_x, src_y).
+
+    Integer coordinates address pixel centres (corner-aligned). The result has
+    the image's channel count and the spatial shape of `src_x`; `padding_mode`
+    is forwarded to `grid_sample` for out-of-image samples.
+    """
+    height, width = (int(dim) for dim in image_chw.shape[-2:])
+    # Map pixel-centre coordinates to grid_sample's [-1, 1] range
+    # (align_corners=False convention).
+    norm_x = (2.0 * src_x + 1.0) / width - 1.0
+    norm_y = (2.0 * src_y + 1.0) / height - 1.0
+    grid = torch.stack([norm_x, norm_y], dim=-1).unsqueeze(0)
+    sampled = F.grid_sample(
+        image_chw.unsqueeze(0),
+        grid,
+        mode="bilinear",
+        align_corners=False,
+        padding_mode=padding_mode,
+    )
+    return sampled.squeeze(0)
+
+
+def _warp_face_crop(image_chw: Tensor, cx: float, cy: float, width: float, height: float,
+                    angle: float, output_size: int = _FM_INPUT_SIZE) -> Tensor:
+    """Rotated rect → output_size² with BORDER_REPLICATE. image_chw must be in [0, 1].
+
+    The rect is centred on (cx, cy), spans `width` x `height` source pixels
+    and is rotated by `angle` radians; each axis is scaled independently to
+    fill the square output.
+    """
+    step_x = width / output_size
+    step_y = height / output_size
+    cos_a = math.cos(angle)
+    sin_a = math.sin(angle)
+    # Output pixel offsets measured from the crop centre.
+    offsets = torch.arange(output_size, dtype=image_chw.dtype, device=image_chw.device) - output_size * 0.5
+    rows, cols = torch.meshgrid(offsets, offsets, indexing="ij")
+    # Rotate the scaled offsets and translate them to the rect centre.
+    src_x = cx + cols * step_x * cos_a - rows * step_y * sin_a
+    src_y = cy + cols * step_x * sin_a + rows * step_y * cos_a
+    return _sample_warp(image_chw, src_x, src_y, "border")
+
+
+def _blazeface_input_warp(image_chw_raw: Tensor, target: int = _BF_INPUT_SIZE) -> Tuple[Tensor, float, float, float]:
+    """Centered max(W,H) square → target² with BORDER_ZERO + [-1, 1] norm.
+
+    Sub-pixel grid_sample matters; integer-pad-then-resize drifts the bbox ~5%.
+    The sampled values are mapped with `v / 127.5 - 1`. Returns
+    (warped, sub_rect_cx, sub_rect_cy, sub_rect_size) — the triplet maps
+    tensor-normalized [0,1] detections back to image pixels.
+    """
+    img_h = int(image_chw_raw.shape[1])
+    img_w = int(image_chw_raw.shape[2])
+    square_size = float(max(img_w, img_h))
+    centre_x = img_w * 0.5
+    centre_y = img_h * 0.5
+    step = square_size / target
+    # Output pixel offsets measured from the square's centre.
+    offsets = torch.arange(target, dtype=image_chw_raw.dtype, device=image_chw_raw.device) - target * 0.5
+    rows, cols = torch.meshgrid(offsets, offsets, indexing="ij")
+    sampled = _sample_warp(image_chw_raw, centre_x + cols * step, centre_y + rows * step, "zeros")
+    normalized = (sampled / 127.5) - 1.0
+    return normalized, centre_x, centre_y, square_size
+
+
+class FaceLandmarker(nn.Module):
+    """BlazeFace → FaceMesh v2 → blendshapes. `detector_variant` selects 'short'
+    (128², ≤2m) or 'full' (192² FPN, ≤5m). State dict uses inner-module prefixes
+    `detector.*` / `mesh.*` / `blendshapes.*`; the outer FaceLandmarkerModel
+    wrapper rewrites `detector_{variant}.*` keys to `detector.*` before loading.
+
+    Raises:
+        ValueError: if `detector_variant` is neither 'short' nor 'full'.
+    """
+
+    def __init__(self, device=None, dtype=None, operations=None, detector_variant: str = "short"):
+        super().__init__()
+        detector_classes = {"short": BlazeFace, "full": BlazeFaceFullRange}
+        det_cls = detector_classes.get(detector_variant)
+        if det_cls is None:
+            # Fail here with a clear message instead of the opaque
+            # "'NoneType' object is not callable" the lookup would cause below.
+            raise ValueError(
+                f"Unknown detector_variant {detector_variant!r}; expected one of {sorted(detector_classes)}"
+            )
+
+        self.detector_variant = detector_variant
+        self.detector = det_cls(device=device, dtype=dtype, operations=operations)
+        self.mesh = FaceMesh(device=device, dtype=dtype, operations=operations)
+        self.blendshapes = FaceBlendshapes(device=device, dtype=dtype, operations=operations)
+        # Landmark indices fed to the blendshape head. Non-persistent: the buffer
+        # is kept on the module but is not part of the state dict.
+        self.register_buffer("_bs_idx", torch.tensor(_BS_INPUT_INDICES, dtype=torch.long), persistent=False)
+
+    def run_detector_batch(self, images_rgb_uint8: List[np.ndarray],
+                           score_thresh: float = _BF_MIN_SCORE,
+                           iou_thresh: float = 0.5):
+        """Batched detector pass. Returns (img_raws, sub_rects, sizes, per_frame_decoded)
+        where per_frame_decoded[b] is (N, 17) in tensor-normalized [0,1] coords.
+
+        img_raws are the per-image CHW tensors on the detector's device/dtype
+        (values still 0..255), sub_rects the (cx, cy, size) of the square each
+        image was warped from, and sizes the (H, W) of each input image. An
+        empty input list returns four empty lists.
+        """
+        if not images_rgb_uint8:
+            return [], [], [], []
+        device, dtype = self.detector.stem.weight.device, self.detector.stem.weight.dtype
+        det_input_size, decode_fn = ((_BF_FR_INPUT_SIZE, _decode_blazeface_full_range)
+                                     if self.detector_variant == "full"
+                                     else (_BF_INPUT_SIZE, _decode_blazeface))
+
+        # Same-size frames: stack once and transfer once. Variable size falls back
+        # to per-image (only triggers for SAM3DBody's head crops).
+        sizes = [tuple(img.shape[:2]) for img in images_rgb_uint8]
+        if len(set(sizes)) == 1:
+            batch_chw = torch.from_numpy(np.stack(images_rgb_uint8, axis=0)).to(device, dtype).movedim(-1, -3).contiguous()
+            img_raws = [batch_chw[bi] for bi in range(batch_chw.shape[0])]
+        else:
+            img_raws = [torch.from_numpy(img).to(device, dtype).movedim(-1, -3).contiguous() for img in images_rgb_uint8]
+
+        # Every warp has the same det_input_size², so the crops can be stacked
+        # even when the source images differ in size.
+        warps = [_blazeface_input_warp(img_raw, det_input_size) for img_raw in img_raws]
+        det_crops = [w[0] for w in warps]
+        sub_rects = [(w[1], w[2], w[3]) for w in warps]
+
+        regs_b, cls_b = self.detector(torch.stack(det_crops, dim=0))
+        # Decoding and NMS run per frame on CPU NumPy arrays.
+        regs_np, cls_np = regs_b.float().cpu().numpy(), cls_b.float().cpu().numpy()
+        per_frame = []
+        for b in range(len(images_rgb_uint8)):
+            decoded = decode_fn(regs_np[b], cls_np[b], score_thresh=score_thresh)
+            per_frame.append(_weighted_nms(decoded, iou_thresh=iou_thresh) if decoded.shape[0] > 0 else decoded)
+        return img_raws, sub_rects, sizes, per_frame
+
+    def detect_batch(self, images_rgb_uint8: List[np.ndarray], num_faces: int = 1,
+                     score_thresh: float = _BF_MIN_SCORE) -> List[List[dict]]:
+        """Full pipeline batched across `images_rgb_uint8`. Returns one face-dict
+        list per image (empty if nothing detected). Face dict:
+            bbox_xyxy (4,) image pixels (min/max extent of landmarks_xy),
+            blendshapes {52} ∈ [0,1],
+            landmarks_xy (478, 2) image pixels, landmarks_3d (478, 3) in
+            192-canonical (pre-transformation) units, presence float (raw logit),
+            score float (detector score, column 16 of the decoded detection).
+
+        `num_faces` > 0 keeps at most that many detections per image (the first
+        rows returned by run_detector_batch); `num_faces` <= 0 keeps them all.
+        """
+        img_raws, sub_rects, sizes, per_frame_dets = self.run_detector_batch(
+            images_rgb_uint8, score_thresh=score_thresh,
+        )
+        # tensor-normalized → image-normalized [0,1] for _detection_to_face_rect.
+        # Columns 0..15 alternate x (even) / y (odd); column 16 is left untouched.
+        for b, decoded in enumerate(per_frame_dets):
+            if decoded.shape[0] == 0:
+                continue
+            cx, cy, size = sub_rects[b]
+            H, W = sizes[b]
+            sx0, sy0 = cx - size * 0.5, cy - size * 0.5
+            decoded[:, 0:16:2] = (sx0 + size * decoded[:, 0:16:2]) / W
+            decoded[:, 1:16:2] = (sy0 + size * decoded[:, 1:16:2]) / H
+            if num_faces > 0:
+                per_frame_dets[b] = decoded[: int(num_faces)]
+
+        # Collect every detected face across all frames into one mesh input.
+        # face_params rows: (frame index, score, cx, cy, w, h, angle).
+        face_params: List[Tuple[int, float, float, float, float, float, float]] = []
+        mesh_crops: List[Tensor] = []
+        for b, dets in enumerate(per_frame_dets):
+            if dets.shape[0] == 0:
+                continue
+            H, W = sizes[b]
+            # _warp_face_crop expects [0, 1] input; img_raws holds 0..255 values.
+            img_for_mesh = img_raws[b] / 255.0
+            for det in dets:
+                cx, cy, w, h, angle = _detection_to_face_rect(det, W, H)
+                mesh_crops.append(_warp_face_crop(img_for_mesh, cx, cy, w, h, angle, _FM_INPUT_SIZE))
+                face_params.append((b, float(det[16]), cx, cy, w, h, angle))
+
+        results: List[List[dict]] = [[] for _ in range(len(images_rgb_uint8))]
+        if not mesh_crops:
+            return results
+
+        lmks_canon_b, presence_b = self.mesh(torch.stack(mesh_crops, dim=0))
+        bs_out_b = self.blendshapes(lmks_canon_b[:, self._bs_idx, :2])
+
+        # Batched canonical→image affine: the inverse of the crop warp, i.e.
+        # centre the crop coordinates, scale by rect size, rotate, translate.
+        params_t = torch.tensor(
+            [(cx, cy, w, h, math.cos(a), math.sin(a)) for (_b, _s, cx, cy, w, h, a) in face_params],
+            device=lmks_canon_b.device, dtype=lmks_canon_b.dtype,
+        )
+        cxs, cys, ws, hs, cos_a, sin_a = params_t.unbind(dim=1)
+        inv = 1.0 / _FM_INPUT_SIZE
+        u = lmks_canon_b[..., 0] - _FM_INPUT_SIZE * 0.5
+        v = lmks_canon_b[..., 1] - _FM_INPUT_SIZE * 0.5
+        lmks_xy_t = torch.stack([
+            cxs[:, None] + u * (ws * inv * cos_a)[:, None] - v * (hs * inv * sin_a)[:, None],
+            cys[:, None] + u * (ws * inv * sin_a)[:, None] + v * (hs * inv * cos_a)[:, None],
+        ], dim=-1)
+
+        # One device→host transfer per output; the per-face loop below is NumPy only.
+        lmks_xy_np = lmks_xy_t.float().cpu().numpy()
+        lmks_canon_np = lmks_canon_b.float().cpu().numpy()
+        presence_np = presence_b.float().cpu().numpy()
+        bs_np = bs_out_b.float().cpu().numpy()
+
+        for i, (b, score, *_) in enumerate(face_params):
+            lmks_xy = lmks_xy_np[i]
+            mn, mx = lmks_xy.min(0), lmks_xy.max(0)
+            results[b].append({
+                "bbox_xyxy": np.array([mn[0], mn[1], mx[0], mx[1]], dtype=np.float32),
+                "blendshapes": dict(zip(BLENDSHAPE_NAMES, bs_np[i].tolist())),
+                "landmarks_xy": lmks_xy,
+                "landmarks_3d": lmks_canon_np[i],
+                "presence": float(presence_np[i]),
+                "score": score,
+            })
+        return results
--- a/comfy_extras/nodes_mediapipe.py
+++ b/comfy_extras/nodes_mediapipe.py
@ -0,0 +1,502 @@
+"""ComfyUI nodes for the pure-PyTorch MediaPipe Face Landmarker port.
+
+Custom IO types:
+  FACE_LANDMARKER  — FaceLandmarkerModel wrapper (ModelPatcher inside)
+  FACE_LANDMARKS   — {"frames": List[List[face_dict]], "image_size": (H, W),
+                      "connection_sets": dict[str, frozenset[(int, int)]]}
+                     face_dict: bbox_xyxy, blendshapes, landmarks_xy,
+                                landmarks_3d, presence, score, transformation_matrix
+
+MediaPipeFaceLandmarker also emits the core BOUNDING_BOX type — pair with DrawBBoxes.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+from PIL import Image, ImageColor, ImageDraw
+from tqdm.auto import tqdm
+from typing_extensions import override
+
+import comfy.model_management
+import comfy.model_patcher
+import comfy.utils
+import folder_paths
+from comfy_api.latest import ComfyExtension, io
+
+from comfy_extras.mediapipe.face_landmarker import FaceLandmarker
+from comfy_extras.mediapipe.face_geometry import transformation_matrix_from_detection
+
+
+# Custom socket types used to pass the loaded model and the detection payload
+# between the nodes in this module.
+FaceLandmarkerType = io.Custom("FACE_LANDMARKER")
+FaceLandmarksType = io.Custom("FACE_LANDMARKS")
+
+# State-dict keys that FaceLandmarkerModel pops into `canonical_data`.
+_CANONICAL_KEYS = ("canonical_vertices", "procrustes_indices", "procrustes_weights")
+# Topology parts that FaceLandmarkerModel unions into the "contours" connection set.
+_CONTOUR_PARTS = ("face_oval", "left_eye", "right_eye", "left_eyebrow", "right_eyebrow", "lips")
+
+
+class FaceLandmarkerModel:
+    """Holds one FaceLandmarker (and its ModelPatcher) per detector variant.
+
+    Safetensors layout: `detector_short.*` / `detector_full.*` plus shared
+    `mesh.*`, `blendshapes.*`, `canonical_*`, and `topology.*`.
+    PReLU forces plain-nn / fp32 (manual_cast strands buffers across devices).
+
+    Note: `topology.*` and the canonical keys are popped from the `state_dict`
+    passed to the constructor, so that dict is modified in place.
+    """
+
+    def __init__(self, state_dict: dict):
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = torch.float32
+
+        # FACEMESH_* connection sets, embedded as int32 (N, 2) under topology.*.
+        topo_prefix = "topology."
+        topology_keys = [key for key in state_dict if key.startswith(topo_prefix)]
+        connection_sets: dict[str, frozenset] = {}
+        for key in topology_keys:
+            edge_rows = state_dict.pop(key).tolist()
+            connection_sets[key[len(topo_prefix):]] = frozenset(map(tuple, edge_rows))
+        # Derived presets built from the individual topology parts.
+        connection_sets["contours"] = frozenset().union(*(connection_sets[part] for part in _CONTOUR_PARTS))
+        connection_sets["all"] = connection_sets["contours"] | connection_sets["irises"] | connection_sets["nose"]
+
+        self.connection_sets: dict[str, frozenset] = connection_sets
+        self.canonical_data: dict[str, np.ndarray] = {}
+        for key in _CANONICAL_KEYS:
+            self.canonical_data[key] = state_dict.pop(key).numpy()
+
+        # Weights that both detector variants share.
+        shared_weights = {
+            key: value for key, value in state_dict.items()
+            if key.startswith("mesh.") or key.startswith("blendshapes.")
+        }
+
+        self.models: dict[str, FaceLandmarker] = {}
+        self.patchers: dict[str, comfy.model_patcher.ModelPatcher] = {}
+        for variant in ("short", "full"):
+            det_prefix = f"detector_{variant}."
+            # Shared weights plus this variant's detector, re-keyed to `detector.*`.
+            variant_sd = {**shared_weights}
+            for key, value in state_dict.items():
+                if key.startswith(det_prefix):
+                    variant_sd[f"detector.{key[len(det_prefix):]}"] = value
+            landmarker = FaceLandmarker(
+                device=offload_device, dtype=self.dtype, operations=None, detector_variant=variant,
+            ).eval()
+            landmarker.load_state_dict(variant_sd, strict=False)
+
+            self.models[variant] = landmarker
+            self.patchers[variant] = comfy.model_patcher.CoreModelPatcher(
+                landmarker,
+                load_device=self.load_device,
+                offload_device=offload_device,
+                size=comfy.model_management.module_size(landmarker),
+            )
+
+    def detect_batch(self, images, num_faces: int, score_thresh: float, variant: str):
+        """Load the requested variant via its patcher, then run its detect_batch."""
+        patcher = self.patchers[variant]
+        comfy.model_management.load_model_gpu(patcher)
+        model = self.models[variant]
+        return model.detect_batch(images, num_faces=num_faces, score_thresh=score_thresh)
+
+
+def _image_to_uint8(image: torch.Tensor) -> np.ndarray:
+    """Convert an image tensor to a uint8 NumPy array, keeping the first three channels.
+
+    Values are scaled by 255 and offset by 0.5 before the integer cast (so the
+    truncating cast rounds to nearest), then clamped to [0, 255].
+    """
+    rgb = image[..., :3]
+    scaled = rgb * 255.0 + 0.5
+    clipped = torch.clamp(scaled, 0, 255)
+    return clipped.to(torch.uint8).cpu().numpy()
+
+
+def _parse_color(color: str) -> tuple[int, int, int]:
+    """Parse a colour string with PIL; fall back to green (0, 255, 0) when PIL rejects it.
+
+    Only the first three channels are returned, so any alpha component is dropped.
+    """
+    try:
+        channels = ImageColor.getrgb(color)
+    except ValueError:
+        return (0, 255, 0)
+    return channels[:3]
+
+
+def _copy_face(face: dict) -> dict:
+    """Copy a face_dict so the result can be mutated without touching `face`.
+
+    Array fields are cloned with `.copy()`, the blendshape mapping is rebuilt
+    as a new dict, and the scalar fields are carried over. Only the six keys
+    listed below are copied; any other key on `face` is left out.
+    """
+    array_fields = ("bbox_xyxy", "landmarks_xy", "landmarks_3d")
+    field_order = ("bbox_xyxy", "blendshapes", "landmarks_xy", "landmarks_3d", "presence", "score")
+    duplicate: dict = {}
+    for key in field_order:
+        value = face[key]
+        if key in array_fields:
+            value = value.copy()
+        elif key == "blendshapes":
+            value = dict(value)
+        duplicate[key] = value
+    return duplicate
+
+
+def _lerp_face(a: dict, b: dict, t: float) -> dict:
+    """Linearly interpolate every field of two face_dicts: (1 - t) * a + t * b.
+
+    Blendshapes are interpolated per key, iterating the keys of `a`.
+    """
+    weight_a = 1 - t
+
+    def blend(key: str):
+        return weight_a * a[key] + t * b[key]
+
+    return {
+        "bbox_xyxy": blend("bbox_xyxy"),
+        "blendshapes": {
+            name: weight_a * value + t * b["blendshapes"][name]
+            for name, value in a["blendshapes"].items()
+        },
+        "landmarks_xy": blend("landmarks_xy"),
+        "landmarks_3d": blend("landmarks_3d"),
+        "presence": blend("presence"),
+        "score": blend("score"),
+    }
+
+
+def _match_faces(a: list[dict], b: list[dict]) -> list[tuple[int, int]]:
+    """Pair faces of two frames greedily by bbox-centre distance.
+
+    Candidate pairs are taken in order of increasing distance (ties broken by
+    index in `a`, then index in `b`); a pair is kept when neither face is
+    already used. Returns (index_in_a, index_in_b) tuples; faces left without
+    a partner when the counts differ are dropped.
+    """
+    if not (a and b):
+        return []
+
+    def centres(faces: list[dict]) -> np.ndarray:
+        return np.array([(0.5 * (f["bbox_xyxy"][0] + f["bbox_xyxy"][2]),
+                          0.5 * (f["bbox_xyxy"][1] + f["bbox_xyxy"][3])) for f in faces])
+
+    # Pairwise centre distances, shape (len(a), len(b)).
+    dists = np.linalg.norm(centres(a)[:, None] - centres(b)[None], axis=-1)
+    ranked = sorted((dists[idx], *idx) for idx in np.ndindex(dists.shape))
+
+    taken_a: set[int] = set()
+    taken_b: set[int] = set()
+    matches: list[tuple[int, int]] = []
+    for _dist, ia, ib in ranked:
+        if ia not in taken_a and ib not in taken_b:
+            matches.append((ia, ib))
+            taken_a.add(ia)
+            taken_b.add(ib)
+    return matches
+
+
+def _fill_missing_frames(frames: list[list[dict]], mode: str) -> None:
+    """In-place fill empty frame slots from neighbouring detections. Multi-face
+    aware: pairs faces across bracketing frames by greedy bbox-centre NN.
+    When counts differ, unmatched faces are dropped from the synthesised frame.
+
+    Modes:
+        "empty"     leave `frames` untouched.
+        "previous"  copy the most recent non-empty frame into each empty slot;
+                    empty slots before the first detection stay empty.
+        any other   (used as "interpolate") lerp between the bracketing
+                    non-empty frames; slots before the first / after the last
+                    detection get a copy of that detection.
+
+    Nothing is changed when no frame has a detection.
+    """
+    if mode == "empty":
+        return
+    valid = [i for i, fr in enumerate(frames) if fr]
+    if not valid:
+        return  # nothing to fill from
+    if mode == "previous":
+        last: list[dict] = []
+        for i, fr in enumerate(frames):
+            if fr:
+                last = fr
+            elif last:
+                frames[i] = [_copy_face(f) for f in last]
+        return
+
+    # interpolate: clamp at both ends first ...
+    first_valid, last_valid = valid[0], valid[-1]
+    for i in range(first_valid):
+        frames[i] = [_copy_face(f) for f in frames[first_valid]]
+    for i in range(last_valid + 1, len(frames)):
+        frames[i] = [_copy_face(f) for f in frames[last_valid]]
+
+    # ... then walk each gap between consecutive valid frames once. This keeps
+    # the pass linear in len(frames) and computes the face pairing once per
+    # gap (it only depends on the two bracketing frames, not on the slot).
+    for prev_i, next_i in zip(valid, valid[1:]):
+        span = next_i - prev_i
+        if span < 2:
+            continue  # adjacent detections, no empty slot between them
+        pairs = _match_faces(frames[prev_i], frames[next_i])
+        for i in range(prev_i + 1, next_i):
+            t = (i - prev_i) / span
+            frames[i] = [_lerp_face(frames[prev_i][a], frames[next_i][b], t) for a, b in pairs]
+
+
+def _ordered_rings(edges: frozenset[tuple[int, int]]) -> list[list[int]]:
+    """Walk an unordered edge set into one or more closed-loop vertex rings
+    (handles multi-loop sets like FACEMESH_LIPS: outer + inner).
+
+    Each ring lists its vertices in walk order without repeating the start
+    vertex. The input is expected to consist of simple closed loops; for
+    anything else the walk still terminates — it stops as soon as it would
+    step onto a vertex already in the current ring — but the returned vertex
+    lists are then not guaranteed to be meaningful polygons.
+    """
+    adj: dict[int, set[int]] = {}
+    for a, b in edges:
+        adj.setdefault(a, set()).add(b)
+        adj.setdefault(b, set()).add(a)
+    visited: set[int] = set()
+    rings: list[list[int]] = []
+    for start in adj:
+        if start in visited:
+            continue
+        ring = [start]
+        in_ring = {start}
+        visited.add(start)
+        prev, cur = -1, start
+        while True:
+            nxt = next((v for v in adj[cur] if v != prev), None)
+            # `nxt in in_ring` covers the normal close (nxt == start) and also a
+            # return to any other vertex of this walk, which on a branching
+            # edge set would otherwise cycle forever without reaching `start`.
+            if nxt is None or nxt in in_ring:
+                break
+            ring.append(nxt)
+            in_ring.add(nxt)
+            visited.add(nxt)
+            prev, cur = cur, nxt
+        rings.append(ring)
+    return rings
+
+
+class LoadMediaPipeFaceLandmarker(io.ComfyNode):
+    """Loader node for the MediaPipe Face Landmarker v2 weights. One file holds
+    both detector variants (short / full), the shared mesh, blendshapes, and
+    canonical geometry."""
+
+    @classmethod
+    def define_schema(cls):
+        # Choices are whatever is registered under the "mediapipe" model folder.
+        model_choices = folder_paths.get_filename_list("mediapipe")
+        model_input = io.Combo.Input(
+            "model_name",
+            options=model_choices,
+            tooltip="Face Landmarker safetensors from models/mediapipe/.",
+        )
+        return io.Schema(
+            node_id="LoadMediaPipeFaceLandmarker",
+            display_name="Load MediaPipe Face Landmarker",
+            category="loaders",
+            inputs=[model_input],
+            outputs=[FaceLandmarkerType.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        """Resolve `model_name`, load its state dict and wrap it in FaceLandmarkerModel."""
+        weights_path = folder_paths.get_full_path_or_raise("mediapipe", model_name)
+        state_dict = comfy.utils.load_torch_file(weights_path, safe_load=True)
+        return io.NodeOutput(FaceLandmarkerModel(state_dict))
+
+
+# Per-frame fallback modes for detection failures in a batch. These are the
+# choices offered by the `missing_frame_fallback` input and the `mode` values
+# handed to _fill_missing_frames.
+_FALLBACK_MODES = ("empty", "previous", "interpolate")
+
+
+class MediaPipeFaceLandmarker(io.ComfyNode):
+    """Detection node: BlazeFace → FaceMesh v2 → ARKit-52 blendshapes, batched
+    across the input. Besides the FACE_LANDMARKS payload it emits a
+    BOUNDING_BOX list (landmark-extent bbox per face) — pair with DrawBBoxes
+    for detector-only viz or MediaPipeFaceMeshVisualize for the mesh overlay."""
+
+    @classmethod
+    def define_schema(cls):
+        node_inputs = [
+            FaceLandmarkerType.Input("face_landmarker"),
+            io.Image.Input("image"),
+            io.Combo.Input("detector_variant", options=["short", "full", "both"], default="short",
+                           tooltip="Face detector range. 'short' is tuned for close-up faces "
+                                   "(within ~2 m of the camera); 'full' covers farther / smaller "
+                                   "faces (up to ~5 m) but is slower. 'both' runs both detectors and "
+                                   "keeps whichever found more faces per frame (~2× detection cost)."),
+            io.Int.Input("num_faces", default=1, min=0, max=16, step=1,
+                         tooltip="Maximum faces to return per frame. 0 = no cap (return all detected)."),
+            io.Float.Input("min_confidence", default=0.5, min=0.0, max=1.0, step=0.01, advanced=True,
+                           tooltip="BlazeFace score threshold. Lower to catch small/occluded faces."),
+            io.Combo.Input("missing_frame_fallback", options=list(_FALLBACK_MODES), default="empty", advanced=True,
+                           tooltip="Per-frame behaviour when detection fails in a batch. "
+                                   "'empty' leaves the frame faceless. 'previous' copies the most recent successful "
+                                   "detection. 'interpolate' lerps landmarks/bbox/blendshapes between bracketing "
+                                   "successful frames. Multi-face: pairs faces across frames by greedy bbox-centre NN."),
+        ]
+        node_outputs = [
+            FaceLandmarksType.Output(display_name="face_landmarks"),
+            io.BoundingBox.Output("bboxes"),
+        ]
+        return io.Schema(
+            node_id="MediaPipeFaceLandmarker",
+            display_name="MediaPipe Face Landmarker",
+            category="image/detection",
+            inputs=node_inputs,
+            outputs=node_outputs,
+        )
+
+    @classmethod
+    def execute(cls, face_landmarker, image, detector_variant, num_faces, min_confidence,
+                missing_frame_fallback) -> io.NodeOutput:
+        """Run detection over the image batch in chunks, fill missing frames per
+        `missing_frame_fallback`, attach a transformation matrix to every face
+        and return the FACE_LANDMARKS payload plus per-frame bounding boxes."""
+        canonical = face_landmarker.canonical_data
+        img_np = _image_to_uint8(image)
+        B, H, W = img_np.shape[:3]
+        chunk = 16
+        run_both = detector_variant == "both"
+        # 'both' makes two full passes, so the progress bar counts each frame twice.
+        total_work = B * 2 if run_both else B
+        pbar = comfy.utils.ProgressBar(total_work)
+
+        def _run(variant: str) -> list[list[dict]]:
+            collected: list[list[dict]] = []
+            with tqdm(total=B, desc=f"MediaPipe Face Landmarker ({variant})") as tq:
+                for start in range(0, B, chunk):
+                    stop = min(start + chunk, B)
+                    batch = [img_np[bi] for bi in range(start, stop)]
+                    detections = face_landmarker.detect_batch(
+                        batch,
+                        num_faces=int(num_faces),
+                        score_thresh=float(min_confidence),
+                        variant=variant,
+                    )
+                    collected.extend(detections)
+                    done = stop - start
+                    pbar.update_absolute(min(pbar.current + done, total_work))
+                    tq.update(done)
+            return collected
+
+        if run_both:
+            short_res = _run("short")
+            full_res = _run("full")
+            # Per-frame keep whichever found more faces (tie → short).
+            frames: list[list[dict]] = []
+            for bi in range(B):
+                prefer_short = len(short_res[bi]) >= len(full_res[bi])
+                frames.append(short_res[bi] if prefer_short else full_res[bi])
+        else:
+            frames = _run(detector_variant)
+        _fill_missing_frames(frames, missing_frame_fallback)
+
+        bboxes = []
+        for per_frame in frames:
+            frame_boxes = []
+            for face in per_frame:
+                face["transformation_matrix"] = transformation_matrix_from_detection(face, W, H, canonical)
+                x1, y1, x2, y2 = map(float, face["bbox_xyxy"])
+                frame_boxes.append(dict(x=x1, y=y1, width=x2 - x1, height=y2 - y1,
+                                        label="face", score=float(face["score"])))
+            bboxes.append(frame_boxes)
+
+        payload = {"frames": frames, "image_size": (H, W),
+                   "connection_sets": face_landmarker.connection_sets}
+        return io.NodeOutput(payload, bboxes)
+
+
+# Topology keys unioned by the 'all' connections preset (contour parts + irises + nose).
+_ALL_CONNECTION_PARTS: tuple[str, ...] = (*_CONTOUR_PARTS, "irises", "nose")
+# (connection-set name, default toggle state) pairs. Each one becomes a Boolean
+# input under the 'custom' option of MediaPipeFaceMeshVisualize's `connections` combo.
+_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = (
+    ("face_oval",     True),
+    ("lips",          True),
+    ("left_eye",      True),
+    ("right_eye",     True),
+    ("left_eyebrow",  True),
+    ("right_eyebrow", True),
+    ("irises",        True),
+    ("nose",          True),
+    ("tesselation",   False),
+)
+
+
+class MediaPipeFaceMeshVisualize(io.ComfyNode):
+    """Draw a FACEMESH_* subset over an image. Topology travels with the
+    FACE_LANDMARKS payload (set at detection time)."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="MediaPipeFaceMeshVisualize",
+            display_name="MediaPipe Face Mesh Visualize",
+            category="image/detection",
+            inputs=[
+                FaceLandmarksType.Input("face_landmarks"),
+                io.Image.Input("image", optional=True, tooltip="If not connected, a black canvas will be used."),
+                io.DynamicCombo.Input(
+                    "connections",
+                    tooltip="'all' = oval+eyes+brows+lips+irises+nose. 'fill' = solid face_oval polygon (silhouette mask). 'custom' = toggle each feature individually (including 'tesselation', the full 2547-edge wireframe).",
+                    options=[
+                        io.DynamicCombo.Option("all", []),
+                        io.DynamicCombo.Option("fill", []),
+                        io.DynamicCombo.Option("custom", [
+                            io.Boolean.Input(feat, default=default,
+                                             tooltip=f"Draw the '{feat}' connection set.")
+                            for feat, default in _CUSTOM_FEATURES
+                        ]),
+                    ],
+                ),
+                io.Color.Input("color", default="#00ff00"),
+                io.Int.Input("thickness", default=1, min=0, max=8, step=1,
+                             tooltip="Edge line thickness in pixels. 0 disables edge drawing."),
+                # The drawing code uses point_size * 0.5 as the dot radius, so the
+                # value the user enters is the dot diameter.
+                io.Int.Input("point_size", default=2, min=0, max=16, step=1,
+                             tooltip="Landmark dot diameter in pixels. 0 disables point drawing."),
+            ],
+            outputs=[io.Image.Output()],
+        )
+
+    @classmethod
+    def execute(cls, face_landmarks, connections, color, thickness, point_size, image=None) -> io.NodeOutput:
+        """Render the selected connection preset for every frame.
+
+        When `image` is not given a black canvas of the payload's `image_size`
+        is used, one per landmark frame. When the image batch is longer than
+        the landmark frame list, the extra images are returned undrawn.
+        """
+        sets = face_landmarks["connection_sets"]
+        sel = connections["connections"]
+        # fill_rings is only set for the 'fill' preset; it switches _draw_mesh
+        # to filled polygons and makes it skip edges and points.
+        fill_rings: list[list[int]] | None = None
+        if sel == "fill":
+            fill_rings = _ordered_rings(sets["face_oval"])
+            edges = frozenset()
+        elif sel == "custom":
+            parts = [feat for feat, _ in _CUSTOM_FEATURES if connections.get(feat, False)]
+            edges = frozenset().union(*(sets[p] for p in parts))
+        else:  # "all"
+            edges = frozenset().union(*(sets[p] for p in _ALL_CONNECTION_PARTS))
+        rgb, thick, psize = _parse_color(color), int(thickness), int(point_size)
+        frames = face_landmarks["frames"]
+        if image is None:
+            H, W = face_landmarks["image_size"]
+            img_np = np.zeros((len(frames), H, W, 3), dtype=np.uint8)
+        else:
+            img_np = _image_to_uint8(image)
+        B = img_np.shape[0]
+        n_frames = len(frames)
+        pbar = comfy.utils.ProgressBar(B)
+        out = np.empty_like(img_np)
+        for bi in range(B):
+            faces = frames[bi] if bi < n_frames else []
+            out[bi] = _draw_mesh(img_np[bi], faces, edges, rgb, thick, psize, fill_rings)
+            pbar.update_absolute(bi + 1)
+        # Back to a float image in [0, 1] on the intermediate device/dtype.
+        return io.NodeOutput(torch.from_numpy(out).to(
+            device=comfy.model_management.intermediate_device(),
+            dtype=comfy.model_management.intermediate_dtype(),
+        ).div_(255.0))
+
+
+def _draw_mesh(image_rgb: np.ndarray, faces: list, edges,
+               rgb: tuple[int, int, int], thickness: int,
+               point_size: int, fill_rings: list[list[int]] | None = None) -> np.ndarray:
+    """Draw landmarks for `faces` onto a copy of `image_rgb` and return it.
+
+    With `fill_rings` set, each ring is drawn as a filled polygon per face and
+    nothing else is drawn. Otherwise edges (when `thickness` > 0) and landmark
+    dots (when `point_size` > 0) are drawn; edges that reference a landmark
+    index outside the face's landmark array are skipped. When there is nothing
+    to draw the input is returned as a plain copy.
+    """
+    want_edges = bool(edges) if thickness > 0 else False
+    nothing_selected = fill_rings is None and not want_edges and point_size <= 0
+    if not faces or nothing_selected:
+        return image_rgb.copy()
+
+    canvas = Image.fromarray(image_rgb)
+    pen = ImageDraw.Draw(canvas)
+
+    if fill_rings is not None:
+        for face in faces:
+            pts = face["landmarks_xy"]
+            for ring in fill_rings:
+                outline = [(float(pts[i, 0]), float(pts[i, 1])) for i in ring]
+                pen.polygon(outline, fill=rgb)
+        return np.asarray(canvas)
+
+    radius = point_size * 0.5
+    for face in faces:
+        pts = face["landmarks_xy"]
+        count = pts.shape[0]
+        if want_edges:
+            for a, b in edges:
+                if a >= count or b >= count:
+                    continue
+                pen.line([(float(pts[a, 0]), float(pts[a, 1])),
+                          (float(pts[b, 0]), float(pts[b, 1]))], fill=rgb, width=thickness)
+        if point_size == 1:
+            # Single-pixel dots: one call with the flattened x, y sequence.
+            pen.point(pts.flatten().tolist(), fill=rgb)
+        elif point_size > 1:
+            for x, y in pts:
+                fx, fy = float(x), float(y)
+                pen.ellipse((fx - radius, fy - radius, fx + radius, fy + radius), fill=rgb)
+    return np.asarray(canvas)
+
+
+# Mask region presets — closed-loop topologies only.
+# _MASK_REGIONS is the region list used when the 'custom' option is not selected.
+_MASK_REGIONS: tuple[str, ...] = ("face_oval", "lips", "left_eye", "right_eye", "irises")
+# (region name, default toggle state) pairs. Each one becomes a Boolean input
+# under the 'custom' option of MediaPipeFaceMask's `regions` combo.
+_MASK_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = (
+    ("face_oval",  True),
+    ("lips",       False),
+    ("left_eye",   False),
+    ("right_eye",  False),
+    ("irises",     False),
+)
+
+
+class MediaPipeFaceMask(io.ComfyNode):
+    """Mask node: filled landmark polygons per face, one mask per frame in the
+    batch. Faces in the same frame are drawn onto the same mask (union)."""
+
+    @classmethod
+    def define_schema(cls):
+        custom_toggles = [
+            io.Boolean.Input(reg, default=default,
+                             tooltip=f"Include the '{reg}' region in the mask.")
+            for reg, default in _MASK_CUSTOM_FEATURES
+        ]
+        regions_input = io.DynamicCombo.Input(
+            "regions",
+            tooltip="'all' = union of face_oval+lips+eyes+irises (which collapses to face_oval since it encloses the rest). 'custom' = toggle each region individually for combos like lips+eyes.",
+            options=[
+                io.DynamicCombo.Option("all", []),
+                io.DynamicCombo.Option("custom", custom_toggles),
+            ],
+        )
+        return io.Schema(
+            node_id="MediaPipeFaceMask",
+            display_name="MediaPipe Face Mask",
+            category="image/detection",
+            inputs=[FaceLandmarksType.Input("face_landmarks"), regions_input],
+            outputs=[io.Mask.Output()],
+        )
+
+    @classmethod
+    def execute(cls, face_landmarks, regions) -> io.NodeOutput:
+        """Rasterise the selected regions of every face into one mask per frame.
+
+        Frames without faces stay all-zero. The result is returned as a tensor
+        scaled to [0, 1] on the intermediate device/dtype.
+        """
+        sets = face_landmarks["connection_sets"]
+        if regions["regions"] == "custom":
+            picked = [reg for reg, _ in _MASK_CUSTOM_FEATURES if regions.get(reg, False)]
+        else:
+            picked = list(_MASK_REGIONS)
+        # Vertex rings are the same for every face, so resolve them once.
+        rings: list[list[int]] = []
+        for reg in picked:
+            rings.extend(_ordered_rings(sets[reg]))
+
+        frames = face_landmarks["frames"]
+        H, W = face_landmarks["image_size"]
+        masks = np.zeros((len(frames), H, W), dtype=np.uint8)
+        pbar = comfy.utils.ProgressBar(len(frames))
+        for bi, per_frame in enumerate(frames):
+            if per_frame:
+                canvas = Image.new("L", (W, H), 0)
+                pen = ImageDraw.Draw(canvas)
+                for face in per_frame:
+                    pts = face["landmarks_xy"]
+                    for ring in rings:
+                        outline = [(float(pts[i, 0]), float(pts[i, 1])) for i in ring]
+                        pen.polygon(outline, fill=255)
+                masks[bi] = np.asarray(canvas)
+            pbar.update_absolute(bi + 1)
+
+        mask_tensor = torch.from_numpy(masks).to(
+            device=comfy.model_management.intermediate_device(),
+            dtype=comfy.model_management.intermediate_dtype(),
+        )
+        return io.NodeOutput(mask_tensor.div_(255.0))
+
+
+class MediaPipeFaceExtension(ComfyExtension):
+    """Extension exposing the four MediaPipe face nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        nodes: list[type[io.ComfyNode]] = [
+            LoadMediaPipeFaceLandmarker,
+            MediaPipeFaceLandmarker,
+            MediaPipeFaceMeshVisualize,
+            MediaPipeFaceMask,
+        ]
+        return nodes
+
+
+async def comfy_entrypoint() -> MediaPipeFaceExtension:
+    """Create and return this module's extension instance."""
+    extension = MediaPipeFaceExtension()
+    return extension
--- a/execution.py
+++ b/execution.py
@ -2,6 +2,7 @@ import copy
 import heapq
 import inspect
 import logging
+import psutil
 import sys
 import threading
 import time
@ -727,6 +728,7 @@ class PromptExecutor:

        self._notify_prompt_lifecycle("start", prompt_id)
        ram_headroom = int(self.cache_args["ram"] * (1024 ** 3))
+        ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3))
        ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None
        comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom)

@ -780,8 +782,14 @@ class PromptExecutor:
                        execution_list.complete_node_execution()

                    if self.cache_type == CacheType.RAM_PRESSURE:
-                        comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom)
-                        ram_release_callback(ram_headroom, free_active=True)
+                        ram_release_callback(ram_inactive_headroom)
+                        ram_shortfall = ram_headroom - psutil.virtual_memory().available
+                        freed = comfy.model_management.free_pins(ram_shortfall + 512 * (1024 ** 2))
+                        if freed < ram_shortfall:
+                            if freed > 64 * (1024 ** 2):
+                                # AIMDO MEM_DECOMMIT can outrun psutil.available catching up.
+                                time.sleep(0.05)
+                            ram_release_callback(ram_headroom, free_active=True)
                else:
                    # Only execute when the while-loop ends without break
                    # Send cached UI for intermediate output nodes that weren't executed
--- a/folder_paths.py
+++ b/folder_paths.py
@ -60,6 +60,8 @@ folder_names_and_paths["geometry_estimation"] = ([os.path.join(models_dir, "geom

 folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions)

+folder_names_and_paths["mediapipe"] = ([os.path.join(models_dir, "mediapipe")], supported_pt_extensions)
+
 output_directory = os.path.join(base_path, "output")
 temp_directory = os.path.join(base_path, "temp")
 input_directory = os.path.join(base_path, "input")
--- a/main.py
+++ b/main.py
@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]:

 def prompt_worker(q, server_instance):
    current_time: float = 0.0
-    cache_ram = args.cache_ram
-    if cache_ram < 0:
+    cache_ram = 0
+    cache_ram_inactive = 0
+    if not args.cache_classic and not args.cache_none and args.cache_lru <= 0:
        cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0))
+        cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0))
+        if len(args.cache_ram) > 0:
+            cache_ram = args.cache_ram[0]
+        if len(args.cache_ram) > 1:
+            cache_ram_inactive = args.cache_ram[1]

-    cache_type = execution.CacheType.CLASSIC
-    if args.cache_lru > 0:
+    cache_type = execution.CacheType.RAM_PRESSURE
+    if args.cache_classic:
+        cache_type = execution.CacheType.CLASSIC
+    elif args.cache_lru > 0:
        cache_type = execution.CacheType.LRU
-    elif cache_ram > 0:
-        cache_type = execution.CacheType.RAM_PRESSURE
    elif args.cache_none:
        cache_type = execution.CacheType.NONE

-    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } )
+    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } )
    last_gc_collect = 0
    need_gc = False
    gc_collect_interval = 10.0
--- a/models/mediapipe/put_mediapipe_models_here
+++ b/models/mediapipe/put_mediapipe_models_here
--- a/nodes.py
+++ b/nodes.py
@ -2444,6 +2444,7 @@ async def init_builtin_extra_nodes():
        "nodes_hidream_o1.py",
        "nodes_save_3d.py",
        "nodes_moge.py",
+        "nodes_mediapipe.py",
    ]

    import_failed = []
--- a/requirements.txt
+++ b/requirements.txt
@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo==0.3.0
+comfy-aimdo==0.4.3
 requests
 simpleeval>=1.0.0
 blake3
--- a/tests-unit/assets_test/queries/test_asset_info.py
+++ b/tests-unit/assets_test/queries/test_asset_info.py
@ -21,6 +21,7 @@ from app.assets.database.queries import (
    get_reference_ids_by_ids,
    ensure_tags_exist,
    add_tags_to_reference,
+    set_reference_tags,
 )
 from app.assets.helpers import get_utc_now

@ -159,6 +160,153 @@ class TestListReferencesPage:
        assert refs[0].name == "large"


+class TestTagRetrievalOrder:
+    """End-to-end check: tags written through the public write paths come
+    back from the public read paths in insertion order rather than the
+    composite-PK alphabetical order SQLite would otherwise impose.
+
+    Each test deliberately picks tag names that would sort differently
+    under alphabetical vs insertion order, so an alphabetical regression
+    fails loudly.
+    """
+
+    def _make_ref(self, session: Session) -> AssetReference:
+        asset = _make_asset(session, "h1")
+        return _make_reference(session, asset, name="x.bin")
+
+    def test_set_reference_tags_preserves_input_order_in_list(self, session: Session):
+        ref = self._make_ref(session)
+        # "checkpoints" < "models" alphabetically; if added_at stagger
+        # works, list_references_page returns insertion order.
+        set_reference_tags(session, reference_id=ref.id, tags=["models", "checkpoints"])
+        session.commit()
+
+        _, tag_map, _ = list_references_page(session)
+        assert tag_map[ref.id] == ["models", "checkpoints"]
+
+    def test_set_reference_tags_preserves_input_order_in_fetch(self, session: Session):
+        ref = self._make_ref(session)
+        # Subpath tag sorts before "models" alphabetically.
+        set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "diffusers/kolors/text_encoder"],
+        )
+        session.commit()
+
+        result = fetch_reference_asset_and_tags(session, ref.id)
+        assert result is not None
+        _, _, tags = result
+        # Bucket-prefix expansion appends the standalone `diffusers` token
+        # at path-tier (microsecond stagger) so FE set-membership filters
+        # match nested category paths.
+        assert tags == ["models", "diffusers/kolors/text_encoder", "diffusers"]
+
+    def test_add_tags_to_reference_lands_after_path_tags(self, session: Session):
+        ref = self._make_ref(session)
+        set_reference_tags(session, reference_id=ref.id, tags=["models", "checkpoints"])
+        session.commit()
+
+        # "aaa-..." sorts before both path tags alphabetically. If added_at
+        # stagger is missing, alphabetic tiebreak would hoist it to tags[0].
+        add_tags_to_reference(
+            session, reference_id=ref.id, tags=["aaa-user-tag"], origin="manual"
+        )
+        session.commit()
+
+        _, tag_map, _ = list_references_page(session)
+        assert tag_map[ref.id] == ["models", "checkpoints", "aaa-user-tag"]
+
+    def test_multi_tag_batch_lands_after_path_tags(self, session: Session):
+        ref = self._make_ref(session)
+        set_reference_tags(session, reference_id=ref.id, tags=["models", "checkpoints"])
+        session.commit()
+
+        # Three user tags inserted in non-alphabetical input order. The
+        # assertions below pin only that the user batch lands after the
+        # path tags; order within the user batch is expected to follow
+        # insertion order too, but the tail is compared as a set.
+        add_tags_to_reference(
+            session,
+            reference_id=ref.id,
+            tags=["zzz-z", "favorite", "experiment-q4"],
+            origin="manual",
+        )
+        session.commit()
+
+        _, tag_map, _ = list_references_page(session)
+        tags = tag_map[ref.id]
+        assert tags[0:2] == ["models", "checkpoints"]
+        assert set(tags[2:]) == {"zzz-z", "favorite", "experiment-q4"}
+
+    def test_user_batch_lands_after_path_batch_under_clock_collision(
+        self, session: Session, monkeypatch: pytest.MonkeyPatch
+    ):
+        """Windows-specific race: when two back-to-back commits share the
+        same datetime.now() microsecond, the path-tier and user-tier
+        added_at values used to collide and alphabetic tiebreak would
+        hoist user tags ahead of path tags. The fix reads
+        max(existing_added_at) for the reference and seeds the next batch
+        past it, deterministically restoring insertion order.
+
+        This test simulates the collision by pinning get_utc_now() so the
+        platform-dependent race becomes a platform-independent failure.
+        """
+        ref = self._make_ref(session)
+
+        from datetime import datetime
+        from app.assets.database import queries as queries_pkg
+        from app.assets.database.queries import tags as tags_module
+
+        frozen = datetime(2026, 1, 1, 0, 0, 0)
+        monkeypatch.setattr(tags_module, "get_utc_now", lambda: frozen)
+        monkeypatch.setattr(queries_pkg, "get_utc_now", lambda: frozen, raising=False)
+
+        set_reference_tags(session, reference_id=ref.id, tags=["models", "checkpoints"])
+        session.commit()
+
+        # Same frozen timestamp — without the max(existing) seed, the
+        # user batch would share added_at with the path batch and
+        # `aaa-user-tag` would sort to position 0 via the alphabetic
+        # tiebreaker.
+        add_tags_to_reference(
+            session, reference_id=ref.id, tags=["aaa-user-tag"], origin="manual"
+        )
+        session.commit()
+
+        _, tag_map, _ = list_references_page(session)
+        assert tag_map[ref.id] == ["models", "checkpoints", "aaa-user-tag"]
+
+    def test_remove_then_add_does_not_disrupt_path_tag_positions(
+        self, session: Session
+    ):
+        ref = self._make_ref(session)
+        set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "loras/my/custom/path"],
+        )
+        session.commit()
+        add_tags_to_reference(session, reference_id=ref.id, tags=["temp-tag"])
+        session.commit()
+        from app.assets.database.queries import remove_tags_from_reference
+
+        remove_tags_from_reference(session, reference_id=ref.id, tags=["temp-tag"])
+        session.commit()
+        add_tags_to_reference(session, reference_id=ref.id, tags=["second-tag"])
+        session.commit()
+
+        _, tag_map, _ = list_references_page(session)
+        # `loras` is expanded from the nested category path; user-added
+        # tags trail behind it via the microsecond stagger.
+        assert tag_map[ref.id] == [
+            "models",
+            "loras/my/custom/path",
+            "loras",
+            "second-tag",
+        ]
+
+
 class TestFetchReferenceAssetAndTags:
    def test_returns_none_for_nonexistent(self, session: Session):
        result = fetch_reference_asset_and_tags(session, "nonexistent")
--- a/tests-unit/assets_test/queries/test_tags.py
+++ b/tests-unit/assets_test/queries/test_tags.py
@ -160,6 +160,120 @@ class TestAddTagsToReference:
            add_tags_to_reference(session, reference_id="nonexistent", tags=["x"])


+class TestBucketPrefixExpansion:
+    """The standalone bucket token must appear in the asset's tag set for
+    nested category paths so FE filters like
+    `include_tags=models,checkpoints` continue to match.
+    """
+
+    def test_set_reference_tags_inserts_bucket_for_nested_path(
+        self, session: Session
+    ):
+        asset = _make_asset(session, "hash-nested")
+        ref = _make_reference(session, asset)
+
+        result = set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "checkpoints/flux"],
+        )
+        session.commit()
+
+        assert set(result.total) == {"models", "checkpoints/flux", "checkpoints"}
+        stored = get_reference_tags(session, reference_id=ref.id)
+        # tag[1] keeps the slash-joined positional contract; the standalone
+        # bucket lands after it via path-tier microsecond stagger so user
+        # tags remain at the tail.
+        assert stored[:3] == ["models", "checkpoints/flux", "checkpoints"]
+
+    def test_set_reference_tags_idempotent_on_replay(self, session: Session):
+        asset = _make_asset(session, "hash-replay")
+        ref = _make_reference(session, asset)
+
+        set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "checkpoints/flux"],
+        )
+        # Replay with the same caller-supplied set; expansion is already
+        # baked in, so nothing should be added or removed.
+        result = set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "checkpoints/flux"],
+        )
+        session.commit()
+
+        assert result.added == []
+        assert result.removed == []
+        assert set(result.total) == {"models", "checkpoints/flux", "checkpoints"}
+
+    def test_add_tags_to_reference_expands_bucket(self, session: Session):
+        asset = _make_asset(session, "hash-add")
+        ref = _make_reference(session, asset)
+
+        result = add_tags_to_reference(
+            session,
+            reference_id=ref.id,
+            tags=["loras/style/v2"],
+        )
+        session.commit()
+
+        assert set(result.added) == {"loras/style/v2", "loras"}
+        stored = get_reference_tags(session, reference_id=ref.id)
+        assert "loras" in stored
+        assert "loras/style/v2" in stored
+
+    def test_add_tags_does_not_duplicate_existing_bucket(self, session: Session):
+        asset = _make_asset(session, "hash-dedupe")
+        ref = _make_reference(session, asset)
+
+        add_tags_to_reference(
+            session, reference_id=ref.id, tags=["models", "checkpoints"]
+        )
+        result = add_tags_to_reference(
+            session, reference_id=ref.id, tags=["checkpoints/flux"]
+        )
+        session.commit()
+
+        # `checkpoints` was already there from the first add; only the
+        # slash-joined token is genuinely new.
+        assert result.added == ["checkpoints/flux"]
+        assert "checkpoints" in result.already_present
+
+    def test_flat_category_is_unaffected(self, session: Session):
+        asset = _make_asset(session, "hash-flat")
+        ref = _make_reference(session, asset)
+
+        result = set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["models", "checkpoints"],
+        )
+        session.commit()
+
+        assert set(result.total) == {"models", "checkpoints"}
+        assert get_reference_tags(session, reference_id=ref.id) == [
+            "models",
+            "checkpoints",
+        ]
+
+    def test_unknown_prefix_passes_through(self, session: Session):
+        asset = _make_asset(session, "hash-user")
+        ref = _make_reference(session, asset)
+
+        # `my-org` isn't a registered bucket — the slash-joined user tag
+        # should not trigger bucket expansion.
+        result = set_reference_tags(
+            session,
+            reference_id=ref.id,
+            tags=["my-org/team-a"],
+        )
+        session.commit()
+
+        assert result.total == ["my-org/team-a"]
+
+
 class TestRemoveTagsFromReference:
    def test_removes_tags(self, session: Session):
        asset = _make_asset(session, "hash1")
--- a/tests-unit/assets_test/services/test_bulk_ingest.py
+++ b/tests-unit/assets_test/services/test_bulk_ingest.py
@ -4,7 +4,7 @@ from pathlib import Path

 from sqlalchemy.orm import Session

-from app.assets.database.models import Asset, AssetReference
+from app.assets.database.models import Asset, AssetReference, AssetReferenceTag
 from app.assets.services.bulk_ingest import SeedAssetSpec, batch_insert_seed_assets


@ -102,6 +102,82 @@ class TestBatchInsertSeedAssets:
            assert asset.mime_type == expected_mime, f"Expected {expected_mime} for {filename}, got {asset.mime_type}"


+class TestBucketPrefixExpansionOnIngest:
+    """Path-scanning ingest must persist the standalone bucket token for
+    nested category paths so the FE set-membership filter
+    (`include_tags=models,checkpoints`) matches assets organized into
+    subfolders (`models/checkpoints/flux/foo.safetensors`).
+    """
+
+    def test_nested_path_inserts_standalone_bucket(
+        self, session: Session, temp_dir: Path
+    ):
+        file_path = temp_dir / "flux.safetensors"
+        file_path.write_bytes(b"content")
+
+        specs: list[SeedAssetSpec] = [
+            {
+                "abs_path": str(file_path),
+                "size_bytes": 7,
+                "mtime_ns": 1234567890000000000,
+                "info_name": "flux",
+                # Shape emitted by get_name_and_tags_from_asset_path for a
+                # nested model path.
+                "tags": ["models", "checkpoints/flux"],
+                "fname": "flux.safetensors",
+                "metadata": None,
+                "hash": None,
+                "mime_type": "application/safetensors",
+            }
+        ]
+
+        result = batch_insert_seed_assets(session, specs=specs, owner_id="")
+
+        assert result.inserted_refs == 1
+        ref = session.query(AssetReference).filter_by(name="flux").one()
+        stored = [
+            row.tag_name
+            for row in session.query(AssetReferenceTag)
+            .filter_by(asset_reference_id=ref.id)
+            .order_by(AssetReferenceTag.added_at.asc())
+            .all()
+        ]
+        assert stored == ["models", "checkpoints/flux", "checkpoints"]
+
+    def test_flat_path_remains_two_tags(
+        self, session: Session, temp_dir: Path
+    ):
+        file_path = temp_dir / "vanilla.safetensors"
+        file_path.write_bytes(b"content")
+
+        specs: list[SeedAssetSpec] = [
+            {
+                "abs_path": str(file_path),
+                "size_bytes": 7,
+                "mtime_ns": 1234567890000000000,
+                "info_name": "vanilla",
+                "tags": ["models", "checkpoints"],
+                "fname": "vanilla.safetensors",
+                "metadata": None,
+                "hash": None,
+                "mime_type": "application/safetensors",
+            }
+        ]
+
+        batch_insert_seed_assets(session, specs=specs, owner_id="")
+
+        ref = session.query(AssetReference).filter_by(name="vanilla").one()
+        stored = {
+            row.tag_name
+            for row in session.query(AssetReferenceTag)
+            .filter_by(asset_reference_id=ref.id)
+            .all()
+        }
+        # Dedupe means flat layouts don't pick up a redundant `checkpoints`
+        # row — tag[1] already serves both positional and set-membership lookups.
+        assert stored == {"models", "checkpoints"}
+
+
 class TestMetadataExtraction:
    def test_extracts_mime_type_for_model_files(self, temp_dir: Path):
        """Verify metadata extraction returns correct mime_type for model files."""
--- a/tests-unit/assets_test/services/test_path_utils.py
+++ b/tests-unit/assets_test/services/test_path_utils.py
@ -6,7 +6,11 @@ from unittest.mock import patch

 import pytest

-from app.assets.services.path_utils import get_asset_category_and_relative_path
+from app.assets.services.path_utils import (
+    get_asset_category_and_relative_path,
+    get_name_and_tags_from_asset_path,
+    resolve_destination_from_tags,
+)


@pytest.fixture
@ -38,6 +42,50 @@ def fake_dirs():
                }


+@pytest.fixture
+def fake_dirs_multi_bucket():
+    """Variant fixture with multiple model buckets (checkpoints + diffusers + loras)."""
+    with tempfile.TemporaryDirectory() as root:
+        root_path = Path(root)
+        input_dir = root_path / "input"
+        output_dir = root_path / "output"
+        temp_dir = root_path / "temp"
+        checkpoints_dir = root_path / "models" / "checkpoints"
+        diffusers_dir = root_path / "models" / "diffusers"
+        loras_dir = root_path / "models" / "loras"
+        for d in (
+            input_dir,
+            output_dir,
+            temp_dir,
+            checkpoints_dir,
+            diffusers_dir,
+            loras_dir,
+        ):
+            d.mkdir(parents=True)
+
+        with patch("app.assets.services.path_utils.folder_paths") as mock_fp:
+            mock_fp.get_input_directory.return_value = str(input_dir)
+            mock_fp.get_output_directory.return_value = str(output_dir)
+            mock_fp.get_temp_directory.return_value = str(temp_dir)
+
+            with patch(
+                "app.assets.services.path_utils.get_comfy_models_folders",
+                return_value=[
+                    ("checkpoints", [str(checkpoints_dir)]),
+                    ("diffusers", [str(diffusers_dir)]),
+                    ("loras", [str(loras_dir)]),
+                ],
+            ):
+                yield {
+                    "input": input_dir,
+                    "output": output_dir,
+                    "temp": temp_dir,
+                    "checkpoints": checkpoints_dir,
+                    "diffusers": diffusers_dir,
+                    "loras": loras_dir,
+                }
+
+
 class TestGetAssetCategoryAndRelativePath:
    def test_input_file(self, fake_dirs):
        f = fake_dirs["input"] / "photo.png"
@ -79,3 +127,161 @@ class TestGetAssetCategoryAndRelativePath:
    def test_unknown_path_raises(self, fake_dirs):
        with pytest.raises(ValueError, match="not within"):
            get_asset_category_and_relative_path("/some/random/path.png")
+
+
+class TestGetNameAndTagsFromAssetPath:
+    """tags collapse the parent subpath into a single slash-joined tag.
+
+    Consumers should be able to read ``tags[1]`` as a stable category
+    identifier regardless of how deep the file lives in the bucket.
+    """
+
+    def test_flat_input(self, fake_dirs_multi_bucket):
+        f = fake_dirs_multi_bucket["input"] / "photo.png"
+        f.touch()
+        name, tags = get_name_and_tags_from_asset_path(str(f))
+        assert name == "photo.png"
+        assert tags == ["input"]
+
+    def test_flat_output(self, fake_dirs_multi_bucket):
+        f = fake_dirs_multi_bucket["output"] / "result_00001.png"
+        f.touch()
+        name, tags = get_name_and_tags_from_asset_path(str(f))
+        assert name == "result_00001.png"
+        assert tags == ["output"]
+
+    def test_flat_models_checkpoint(self, fake_dirs_multi_bucket):
+        f = fake_dirs_multi_bucket["checkpoints"] / "flux.safetensors"
+        f.touch()
+        name, tags = get_name_and_tags_from_asset_path(str(f))
+        assert name == "flux.safetensors"
+        assert tags == ["models", "checkpoints"]
+
+    def test_diffusers_nested_subpath_slash_joined(self, fake_dirs_multi_bucket):
+        """Diffusers components live in nested directories — the full subpath
+        must collapse into one tag so consumers can look up the model category
+        via tags[1] regardless of nesting depth.
+
+        The subpath is lowercased to match the canonicalization
+        :func:`ensure_tags_exist` applies on the write side; without that,
+        the asset_reference_tags.tag_name FK to tags.name would fail for
+        any path containing uppercase letters.
+        """
+        nested = (
+            fake_dirs_multi_bucket["diffusers"]
+            / "Kolors"
+            / "text_encoder"
+        )
+        nested.mkdir(parents=True)
+        f = nested / "model.safetensors"
+        f.touch()
+        name, tags = get_name_and_tags_from_asset_path(str(f))
+        assert name == "model.safetensors"
+        assert tags == ["models", "diffusers/kolors/text_encoder"]
+
+    def test_deep_lora_user_subpath_slash_joined(self, fake_dirs_multi_bucket):
+        """User-created subdirectories under a model bucket also collapse to a
+        single tag rather than one tag per directory."""
+        nested = (
+            fake_dirs_multi_bucket["loras"]
+            / "my"
+            / "custom"
+            / "path"
+        )
+        nested.mkdir(parents=True)
+        f = nested / "v0001.safetensors"
+        f.touch()
+        name, tags = get_name_and_tags_from_asset_path(str(f))
+        assert name == "v0001.safetensors"
+        assert tags == ["models", "loras/my/custom/path"]
+
+
+class TestResolveDestinationFromTags:
+    """resolve_destination_from_tags must accept both the legacy
+    one-tag-per-directory shape and the new slash-joined shape so that an
+    upload using the tags it just read back from /api/assets round-trips
+    to the right on-disk destination.
+    """
+
+    @pytest.fixture
+    def resolve_dirs(self):
+        with tempfile.TemporaryDirectory() as root:
+            root_path = Path(root)
+            input_dir = root_path / "input"
+            output_dir = root_path / "output"
+            checkpoints_dir = root_path / "models" / "checkpoints"
+            diffusers_dir = root_path / "models" / "diffusers"
+            loras_dir = root_path / "models" / "loras"
+            for d in (input_dir, output_dir, checkpoints_dir, diffusers_dir, loras_dir):
+                d.mkdir(parents=True)
+            with patch("app.assets.services.path_utils.folder_paths") as mock_fp:
+                mock_fp.get_input_directory.return_value = str(input_dir)
+                mock_fp.get_output_directory.return_value = str(output_dir)
+                mock_fp.folder_names_and_paths = {
+                    "checkpoints": ([str(checkpoints_dir)], None),
+                    "diffusers": ([str(diffusers_dir)], None),
+                    "loras": ([str(loras_dir)], None),
+                }
+                yield {
+                    "input": input_dir,
+                    "output": output_dir,
+                    "checkpoints": checkpoints_dir,
+                    "diffusers": diffusers_dir,
+                    "loras": loras_dir,
+                }
+
+    def test_models_flat_category(self, resolve_dirs):
+        base, subdirs = resolve_destination_from_tags(["models", "checkpoints"])
+        assert base == str(resolve_dirs["checkpoints"])
+        assert subdirs == []
+
+    def test_models_slash_joined_new_shape(self, resolve_dirs):
+        # The shape get_name_and_tags_from_asset_path now emits.
+        base, subdirs = resolve_destination_from_tags(
+            ["models", "diffusers/kolors/text_encoder"]
+        )
+        assert base == str(resolve_dirs["diffusers"])
+        assert subdirs == ["kolors", "text_encoder"]
+
+    def test_models_legacy_one_tag_per_dir(self, resolve_dirs):
+        # The legacy shape must still resolve identically.
+        base, subdirs = resolve_destination_from_tags(
+            ["models", "diffusers", "kolors", "text_encoder"]
+        )
+        assert base == str(resolve_dirs["diffusers"])
+        assert subdirs == ["kolors", "text_encoder"]
+
+    def test_models_loras_slash_joined(self, resolve_dirs):
+        base, subdirs = resolve_destination_from_tags(
+            ["models", "loras/my/custom/path"]
+        )
+        assert base == str(resolve_dirs["loras"])
+        assert subdirs == ["my", "custom", "path"]
+
+    def test_input_no_subdir(self, resolve_dirs):
+        base, subdirs = resolve_destination_from_tags(["input"])
+        assert base == str(resolve_dirs["input"])
+        assert subdirs == []
+
+    def test_input_slash_joined_subdir(self, resolve_dirs):
+        base, subdirs = resolve_destination_from_tags(["input", "portraits/2026"])
+        assert base == str(resolve_dirs["input"])
+        assert subdirs == ["portraits", "2026"]
+
+    def test_output_slash_joined_subdir(self, resolve_dirs):
+        base, subdirs = resolve_destination_from_tags(["output", "runs/abc"])
+        assert base == str(resolve_dirs["output"])
+        assert subdirs == ["runs", "abc"]
+
+    def test_unknown_category_rejected(self, resolve_dirs):
+        with pytest.raises(ValueError, match="unknown model category"):
+            resolve_destination_from_tags(["models", "not_a_real_category"])
+
+    def test_unknown_category_via_slash_joined(self, resolve_dirs):
+        # First segment of a slash-joined tag must still match a registered category.
+        with pytest.raises(ValueError, match="unknown model category 'bogus'"):
+            resolve_destination_from_tags(["models", "bogus/sub/path"])
+
+    def test_traversal_in_subdir_rejected(self, resolve_dirs):
+        with pytest.raises(ValueError, match="invalid path component"):
+            resolve_destination_from_tags(["models", "checkpoints/..", "evil"])
--- a/tests-unit/assets_test/test_assets_missing_sync.py
+++ b/tests-unit/assets_test/test_assets_missing_sync.py
@ -32,7 +32,7 @@ def test_seed_asset_removed_when_file_is_deleted(
    # Verify it is visible via API and carries no hash (seed)
    r1 = http.get(
        api_base + "/api/assets",
-        params={"include_tags": "unit-tests,syncseed", "name_contains": name},
+        params={"include_tags": "unit-tests/syncseed", "name_contains": name},
        timeout=120,
    )
    body1 = r1.json()
@ -52,7 +52,7 @@ def test_seed_asset_removed_when_file_is_deleted(
    # It should disappear (AssetInfo and seed Asset gone)
    r2 = http.get(
        api_base + "/api/assets",
-        params={"include_tags": "unit-tests,syncseed", "name_contains": name},
+        params={"include_tags": "unit-tests/syncseed", "name_contains": name},
        timeout=120,
    )
    body2 = r2.json()
@ -332,7 +332,7 @@ def test_fastpass_removes_stale_state_row_no_missing(

    rl = http.get(
        api_base + "/api/assets",
-        params={"include_tags": f"unit-tests,{scope}"},
+        params={"include_tags": f"unit-tests/{scope}"},
        timeout=120,
    )
    bl = rl.json()
--- a/tests-unit/assets_test/test_crud.py
+++ b/tests-unit/assets_test/test_crud.py
@ -280,9 +280,15 @@ def test_metadata_filename_is_set_for_seed_asset_without_hash(

    trigger_sync_seed_assets(http, api_base)

+    # Scanner emits tags as ``[root, "<dir1>/<dir2>/..."]`` — the second tag
+    # is the slash-joined parent subpath. For ``<root>/unit-tests/<scope>/a/b/<name>``
+    # the second tag is ``"unit-tests/<scope>/a/b"``.
    r1 = http.get(
        api_base + "/api/assets",
-        params={"include_tags": f"unit-tests,{scope}", "name_contains": name},
+        params={
+            "include_tags": f"unit-tests/{scope}/a/b",
+            "name_contains": name,
+        },
        timeout=120,
    )
    body = r1.json()
--- a/tests-unit/assets_test/test_helpers.py
+++ b/tests-unit/assets_test/test_helpers.py
@ -0,0 +1,69 @@
+"""Unit tests for app.assets.helpers."""
+
+from app.assets.helpers import expand_bucket_prefixes
+
+
+class TestExpandBucketPrefixes:
+    def test_flat_category_unchanged(self):
+        # `checkpoints` is already a standalone token, no expansion needed.
+        assert expand_bucket_prefixes(["models", "checkpoints"]) == [
+            "models",
+            "checkpoints",
+        ]
+
+    def test_nested_category_inserts_bucket(self):
+        # Path-derived shape for `models/checkpoints/flux/foo.safetensors` —
+        # the standalone bucket has to be present so the FE set-membership
+        # filter (`include_tags=models,checkpoints`) matches the asset.
+        assert expand_bucket_prefixes(["models", "checkpoints/flux"]) == [
+            "models",
+            "checkpoints/flux",
+            "checkpoints",
+        ]
+
+    def test_deeply_nested_only_first_segment_expands(self):
+        # Only the FIRST slash segment ever gets emitted as a standalone —
+        # intermediate path segments don't have routing significance.
+        assert expand_bucket_prefixes(
+            ["models", "diffusers/kolors/text_encoder"]
+        ) == ["models", "diffusers/kolors/text_encoder", "diffusers"]
+
+    def test_unknown_prefix_does_not_expand(self):
+        # Free-form user labels with slashes whose first segment is not a
+        # registered bucket pass through opaquely.
+        assert expand_bucket_prefixes(["models", "my-org/team-a"]) == [
+            "models",
+            "my-org/team-a",
+        ]
+
+    def test_idempotent(self):
+        # Re-applying the helper is a no-op once the bucket is in the set.
+        expanded = expand_bucket_prefixes(["models", "checkpoints/flux"])
+        assert expand_bucket_prefixes(expanded) == expanded
+
+    def test_does_not_duplicate_existing_bucket(self):
+        # If the caller already supplied the standalone bucket, don't add a
+        # second copy.
+        assert expand_bucket_prefixes(
+            ["models", "checkpoints/flux", "checkpoints"]
+        ) == ["models", "checkpoints/flux", "checkpoints"]
+
+    def test_preserves_caller_order(self):
+        # User tags after path tags must stay after; the inserted bucket
+        # token slots in immediately after its slash-joined parent so the
+        # microsecond stagger lands it at path-tier before user-tier.
+        assert expand_bucket_prefixes(
+            ["models", "loras/style", "favorite", "v2"]
+        ) == ["models", "loras/style", "loras", "favorite", "v2"]
+
+    def test_empty_input(self):
+        assert expand_bucket_prefixes([]) == []
+
+    def test_input_root_with_subpath_no_expansion(self):
+        # `portraits` isn't a registered model category, so the input
+        # subpath stays opaque (FE filter doesn't have a checkpoint-loader
+        # analogue for input subfolders).
+        assert expand_bucket_prefixes(["input", "portraits/2026"]) == [
+            "input",
+            "portraits/2026",
+        ]
--- a/tests-unit/assets_test/test_prune_orphaned_assets.py
+++ b/tests-unit/assets_test/test_prune_orphaned_assets.py
@ -29,7 +29,10 @@ def create_seed_file(comfy_tmp_base_dir: Path):
 def find_asset(http: requests.Session, api_base: str):
    """Query API for assets matching scope and optional name."""
    def _find(scope: str, name: str | None = None) -> list[dict]:
-        params = {"include_tags": f"unit-tests,{scope}"}
+        # Scanner now emits tags as ``[root, "<dir1>/<dir2>/..."]`` rather than
+        # one tag per directory. For files at ``<root>/unit-tests/<scope>/...``
+        # the second tag is exactly ``"unit-tests/<scope>"``.
+        params = {"include_tags": f"unit-tests/{scope}"}
        if name:
            params["name_contains"] = name
        r = http.get(f"{api_base}/api/assets", params=params, timeout=120)
@ -138,4 +141,7 @@ def test_special_chars_in_path_escaped_correctly(
    trigger_sync_seed_assets(http, api_base)
    trigger_sync_seed_assets(http, api_base)

-    assert find_asset(scope.split("/")[0], fp.name), "Asset with special chars should survive"
+    # Scanner emits the full parent subpath as a single slash-joined tag, so
+    # the lookup tag is ``unit-tests/<scope>`` even when <scope> itself
+    # contains a slash (parent + special-char dirname).
+    assert find_asset(scope, fp.name), "Asset with special chars should survive"
--- a/tests-unit/assets_test/test_user_tag_http_smoke.py
+++ b/tests-unit/assets_test/test_user_tag_http_smoke.py
@ -0,0 +1,135 @@
+"""HTTP-layer smoke test: user-added tags via POST /api/assets/{id}/tags
+land after path tags when read back via GET /api/assets.
+
+Exercises the full route handler -> service -> query path that the unit
+tests at tests-unit/assets_test/queries/test_asset_info.py only cover at
+the service layer.
+"""
+import json
+
+import pytest
+import requests
+
+
+@pytest.fixture
+def smoke_asset(http: requests.Session, api_base: str):
+    """Upload a single asset into models/checkpoints/unit-tests/smoke
+    and delete it on teardown. Yields the parsed JSON upload response."""
+    name = "smoke_user_tag.safetensors"
+    tags = ["models", "checkpoints", "unit-tests", "smoke"]  # one tag per path segment
+    files = {"file": (name, b"S" * 4096, "application/octet-stream")}  # 4096-byte dummy payload
+    form_data = {
+        "tags": json.dumps(tags),  # the tag list travels as a JSON-encoded form field
+        "name": name,
+        "user_metadata": json.dumps({}),
+    }
+    r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120)
+    assert r.status_code == 201, r.text
+    body = r.json()
+    yield body  # tests read body["id"]; everything below runs as teardown
+    http.delete(  # cleanup only: the delete response is not checked
+        f"{api_base}/api/assets/{body['id']}?delete_content=true", timeout=30
+    )
+
+
+def _fetch_asset_tags(http, api_base, ref_id):  # GET one asset and return its "tags" field
+    r = http.get(f"{api_base}/api/assets/{ref_id}", timeout=30)
+    assert r.status_code == 200, r.text  # response body is the failure message
+    return r.json()["tags"]
+
+
+def test_user_tag_lands_after_path_tags_via_http(
+    http: requests.Session, api_base: str, smoke_asset: dict
+):
+    ref_id = smoke_asset["id"]
+
+    initial_tags = _fetch_asset_tags(http, api_base, ref_id)
+    # Baseline before any mutation: path tags lead, in upload order.
+    assert initial_tags[:2] == ["models", "checkpoints"]
+
+    # Add a user tag that would jump to position 0 under alphabetical sort.
+    r = http.post(
+        f"{api_base}/api/assets/{ref_id}/tags",
+        json={"tags": ["aaa-user-tag"]},
+        timeout=30,
+    )
+    assert r.status_code in (200, 201), r.text  # either success status is accepted
+
+    tags_after = _fetch_asset_tags(http, api_base, ref_id)
+    # Path tags must still be at the front; user tag goes to the end.
+    assert tags_after[0] == "models"
+    assert tags_after[1] == "checkpoints"
+    assert "aaa-user-tag" in tags_after  # presence first, so a dropped tag fails distinctly
+    assert tags_after[-1] == "aaa-user-tag"
+
+
+def test_user_tag_batch_lands_after_path_tags_via_http(
+    http: requests.Session, api_base: str, smoke_asset: dict
+):
+    ref_id = smoke_asset["id"]
+
+    # Add three user tags in a single request, in non-alphabetical input
+    # order. They should all land after the path tags (microsecond stagger
+    # in set_reference_tags / add_tags_to_reference is what makes this
+    # work — without it, "aaa-experiment" would jump to position 0).
+    r = http.post(
+        f"{api_base}/api/assets/{ref_id}/tags",
+        json={"tags": ["zzz-z", "favorite", "aaa-experiment"]},
+        timeout=30,
+    )
+    assert r.status_code in (200, 201), r.text  # either success status is accepted
+
+    tags_after = _fetch_asset_tags(http, api_base, ref_id)
+    assert tags_after[0] == "models"
+    assert tags_after[1] == "checkpoints"
+    user_tail = tags_after[len({"models", "checkpoints", "unit-tests", "smoke"}):]  # skip the 4 path tags
+    assert set(user_tail) >= {"zzz-z", "favorite", "aaa-experiment"}  # membership only, not order
+    # Critically: alphabetical sort would put 'aaa-experiment' at position 0.
+    assert tags_after.index("aaa-experiment") > tags_after.index("models")
+    assert tags_after.index("aaa-experiment") > tags_after.index("checkpoints")
+
+
+@pytest.fixture
+def nested_checkpoint_asset(http: requests.Session, api_base: str):
+    """Upload a checkpoint at the slash-joined path shape cloud emits
+    (`models/checkpoints/flux/...`), then delete it on teardown.
+    """
+    name = "nested_checkpoint.safetensors"
+    tags = ["models", "checkpoints/flux"]  # root, then category and subdir joined in one tag
+    files = {"file": (name, b"S" * 4096, "application/octet-stream")}  # 4096-byte dummy payload
+    form_data = {
+        "tags": json.dumps(tags),  # the tag list travels as a JSON-encoded form field
+        "name": name,
+        "user_metadata": json.dumps({}),
+    }
+    r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120)
+    assert r.status_code == 201, r.text
+    body = r.json()
+    yield body  # tests read body["id"]; everything below runs as teardown
+    http.delete(  # cleanup only: the delete response is not checked
+        f"{api_base}/api/assets/{body['id']}?delete_content=true", timeout=30
+    )
+
+
+def test_nested_checkpoint_satisfies_fe_set_filter(
+    http: requests.Session, api_base: str, nested_checkpoint_asset: dict
+):
+    """Regression (the case Simon flagged): a nested-path checkpoint must
+    still match `include_tags=models,checkpoints` — the FE combo-widget filter.
+    """
+    ref_id = nested_checkpoint_asset["id"]
+
+    stored = _fetch_asset_tags(http, api_base, ref_id)
+    # tag[1] keeps cloud's slash-joined positional contract; tag[2] holds
+    # the standalone bucket the FE filter looks for.
+    assert stored[:3] == ["models", "checkpoints/flux", "checkpoints"]
+
+    # The FE query, sent as repeated include_tags params: both tokens must be in the tag set.
+    r = http.get(
+        f"{api_base}/api/assets",
+        params=[("include_tags", "models"), ("include_tags", "checkpoints")],
+        timeout=30,
+    )
+    assert r.status_code == 200, r.text
+    returned_ids = {a["id"] for a in r.json()["assets"]}
+    assert ref_id in returned_ids  # NOTE(review): assumes ref_id is on the first page — confirm
--- a/tests/execution/test_async_nodes.py
+++ b/tests/execution/test_async_nodes.py
@ -14,7 +14,6 @@ from tests.execution.test_execution import ComfyClient, run_warmup
 class TestAsyncNodes:
    @fixture(scope="class", autouse=True, params=[
        (False, 0),
-        (True, 0),
        (True, 100),
    ])
    def _server(self, args_pytest, request):
@ -29,6 +28,8 @@ class TestAsyncNodes:
        use_lru, lru_size = request.param
        if use_lru:
            pargs += ['--cache-lru', str(lru_size)]
+        else:
+            pargs += ['--cache-classic']
        # Running server with args: pargs
        p = subprocess.Popen(pargs)
        yield
--- a/tests/execution/test_execution.py
+++ b/tests/execution/test_execution.py
@ -183,8 +183,7 @@ class TestExecution:
    # Initialize server and client
    #
    @fixture(scope="class", autouse=True, params=[
-        { "extra_args" : [], "should_cache_results" : True },
-        { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True },
+        { "extra_args" : ["--cache-classic"], "should_cache_results" : True },
        { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True },
        { "extra_args" : ["--cache-none"], "should_cache_results" : False },
    ])
Author	SHA1	Message	Date
Matt Miller	dc6190e8ba	fix(assets): seed added_at past max(existing) to survive Windows clock collisions The per-tag microsecond stagger preserves intra-batch order, but two back-to-back write batches on the same reference (e.g. set_reference_tags for path tags, then add_tags_to_reference for user tags) call get_utc_now() independently. On Windows the system clock can return the same datetime for both calls if no OS tick elapsed between the commits — both batches end up sharing microseconds and ORDER BY added_at, tag_name falls back to the alphabetic tiebreaker, sorting user tags ahead of path tags they were meant to follow. Add _next_added_at_base(reference_id) that reads max(existing added_at) and returns max(existing + 1us, get_utc_now()), guaranteeing the new batch sorts strictly after anything previously written for that reference. Used by set_reference_tags and add_tags_to_reference; batch_insert_seed_assets stays on raw get_utc_now() since seed inserts are always the first writes for a new reference. The accompanying regression test pins get_utc_now() to a frozen value so the previously-Windows-only race becomes a platform-independent failure mode under test.	2026-05-20 20:33:39 -07:00
Matt Miller	2d21956ac7	fix(assets): expand standalone bucket tag for nested category paths Path-derived tags for nested model layouts (e.g. models/checkpoints/flux/foo.safetensors) emitted only the slash-joined shape `["models", "checkpoints/flux"]`, which broke the frontend combo-widget set-membership filter `include_tags=models,checkpoints` — the literal `checkpoints` token was no longer present in the asset's tag set. Add `expand_bucket_prefixes` at the tag-write layer. When a tag's first slash segment is a registered model category (or input/output/temp root), the bucket is inserted as a standalone token immediately after the slash-joined form. This preserves tag[1] as the slash-joined positional contract cloud emits while restoring the set-membership token the frontend filter requires. The expansion is bounded to known buckets so free-form user labels with slashes (`my-org/team-a`) pass through unchanged. The helper is applied uniformly in `set_reference_tags`, `add_tags_to_reference`, and `batch_insert_seed_assets` so HTTP uploads, user-tag mutations, and path-scanning ingest all converge on the same canonical shape. Also align the upload-route category validator with `resolve_destination_from_tags` by extracting the first slash segment of tag[1], so HTTP uploads matching cloud's slash-joined emission shape are no longer rejected as `unknown models category`.	2026-05-20 20:33:39 -07:00
Matt Miller	396bfe4056	Merge branch 'master' into matt/asset-tags-cloud-shape	2026-05-20 19:20:33 -07:00
comfyanonymous	95fdc6cf91	Repo security stuff. (#14019 )	2026-05-20 17:17:55 -07:00
rattus	5aa5ccc9e0	Multi-threaded load of models from disk (big load time speedups & Offload to disk) (CORE-43,CORE-152,CORE-164,CORE-165,CORE-117) (#13802 ) * model_management: disable non-dynamic smart memory Disable smart memory outright for non dynamic models. This is a minor step towards deprecation of --disable-dynamic-vram and the legacy ModelPatcher. This is needed for estimate-free model development, where new models can opt-out of supplying a memory estimate and not have to worry about hard VRAM allocations due to legacy non-dynamic model patchers This is also a general stability increase for a lot of stray use cases where estimates may still be off and going forward we are not going to accurately maintain such estimates. * pinned_memory: implement with aimdo growable buffer Use a single growable buffer so we can do threaded pre-warming on pinned memory. * mm: use aimdo to do transfer from disk to pin Aimdo implements a faster threaded loader. * Add stream host pin buffer for AIMDO casts Introduce per-offload-stream HostBuffer reuse for pinned staging, include it in cast buffer reset synchronization. Defer actual casts that go via this pin path to a separate pass such that the buffer can be allocated monolithically (to avoid cudaHostRegister thrash). * remove old pin path * Implement JIT pinned memory pressure Replace the predictive pin pressure mechanism with JIT PIN memory pressure. * LowVRAMPatch: change to two-phase visit * lora: re-implement as inplace swiss-army-knife operation * prepare for multiple pin sets * implement pinned loras * requirements: comfy-aimdo 0.4.0 * ops: remove unused arg This was defeatured in aimdo iteration * ops: sync the CPU with only the offload stream activity This was syncing with the offload stream which itself is synced with the compute stream, so this was syncing CPU with compute transitively. Define the event to sync it more gently. 
* pins: implement freeing intermediate for pinned memory Pinning is more important than inactive intermediates and the stream pin buffer is more important than even active intermediates. * execution: implement pin eviction on RAM pressure Add back proper pin freeing on RAM pressure * implement pin registration swaps Uncap the windows pins from 50% by extending the pool and have a pressure mechanism to move the pin reservations on demand. This unfortunately implies a GPU sync to do the freeing so significant hysteresis needs to be added to consolidate these pressure events. * cli_args/execution: Implement lower background cache-ram threshold Limit the amount of RAM background intermediates can use, so that switching workflows doesn't degrade performance too much. * make default * bump aimdo * model-patcher: force-cast tiny weights Flux 2 gets crazy stalls due to a mix of tiny and giant weights creating lopsided stream buffer rotations which creates stalls. * ops: refactor in prep for chunking * mm: delegate pin-on-the-way to aimdo Aimdo is able to chunk and slice this on the way for better CPU->GPU overlap. The main advantage is the ability to shorten the bus contention window between previous weight transfer and the next weights vbar fault. * bump aimdo * pinning updates * specify hostbuf max allocation size There are signs of virtual memory exhaustion on some linux systems when throwing 128GB for every little piece. Pass the actual to save aimdo from over-estimates * tests: update execution tests for caching The default caching changed to ram-cache so update these tests accordingly. Remove the LRU 0 test as this also falls through to RAM cache.	2026-05-20 17:03:58 -07:00
Jukka Seppänen	4d6a058bf1	feat: MediaPipe face detection (CORE-235) (#14009 ) * Initial mediapipe face detection support * Update face_geometry.py * Account for diff sized batch input * Model folder placeholder	2026-05-20 16:07:48 -07:00
Matt Miller	00940fb24e	fix(assets): preserve caller order in add_tags_to_reference + align response helper Smoke test through the real HTTP upload + tag-add path exposed two ordering bugs the unit-layer tests missed: 1. add_tags_to_reference did `to_add = sorted(want - current)` — an alphabetical pre-sort defeating the microsecond-stagger fix from the previous commit. The stagger was encoding alphabetical positions, not the caller's insertion order. Fix: build to_add by walking the already-normalized caller list and filtering against the current set, so the staggered added_at timestamps reflect what the caller actually requested. 2. get_reference_tags used .order_by(tag_name.asc()) — alphabetical. It's called by the upload response path; meanwhile list_references_page and fetch_reference_asset_and_tags were already updated to order by added_at. The mismatch meant POST /api/assets returned tags in alphabetical order but a subsequent GET returned them in insertion order. Fix: order get_reference_tags by added_at too, so all three response-path helpers agree. New tests-unit/assets_test/test_user_tag_http_smoke.py exercises the full HTTP layer: POST /api/assets to upload, POST /api/assets/{id}/tags to add a user tag (using tag names like "aaa-user-tag" that would jump to position 0 under alphabetical), GET /api/assets/{id} to verify ordering. Catches the bugs above in CI going forward. Full assets suite: 340 passed, 10 pre-existing skipped.	2026-05-19 21:10:53 -07:00
Matt Miller	7ff001d7c8	fix(assets): stagger added_at in set_reference_tags + add ordering tests Cursor-reviews follow-up on PR #13994: 1. set_reference_tags / add_tags_to_reference now apply the same microsecond stagger as batch_insert_seed_assets. Per-tag get_utc_now() calls can collide at microsecond resolution on fast machines, dropping retrieval to the tag_name alphabetical tiebreaker. Using a single base_ts + timedelta(microseconds=i) preserves insertion order for any batch. 2. Docstring on get_name_and_tags_from_asset_path corrected: only the subpath is lowercased in code; the root category is lowercase by construction in get_asset_category_and_relative_path. 3. resolve_destination_from_tags docstring now states explicitly that hybrid shapes (mix of legacy multi-tag + new slash-joined within a single call) are accepted and resolve to the same destination. 4. New TestTagRetrievalOrder class in test_asset_info.py exercises the public write paths (set_reference_tags, add_tags_to_reference, remove_tags_from_reference) and asserts the public read paths (list_references_page, fetch_reference_asset_and_tags) return tags in insertion order rather than alphabetical. Tag names are chosen to fail loudly under alphabetical regression — "checkpoints" sorts before "models", "aaa-user-tag" sorts before every path tag, etc. Full assets suite: 338 passed, 10 pre-existing skipped.	2026-05-19 21:05:54 -07:00
Matt Miller	19ba85bb2e	Merge branch 'master' into matt/asset-tags-cloud-shape	2026-05-19 20:48:47 -07:00
Matt Miller	3ffc49aa0e	fix(assets): lowercase subpath, parse slash-joined upload tags, stagger added_at Three bugs surfaced by an end-to-end smoke test of the read+write round-trip; all in this PR's scope. 1. FK violation on uppercase paths get_name_and_tags_from_asset_path was preserving case on the subpath (e.g. "diffusers/Kolors/text_encoder"). ensure_tags_exist lowercases via normalize_tags before inserting into the tags table, so the asset_reference_tags.tag_name FK to tags.name failed for any path containing uppercase letters — including the diffusers case the PR was designed to support. Fix: lowercase the slash-joined subpath in get_name_and_tags_from_asset_path to match the canonicalization ensure_tags_exist applies. Providers keyed on original-case subpaths need to normalize their lookup key to lowercase. 2. resolve_destination_from_tags rejected the new tag shape The inverse function only accepted the legacy one-tag-per-dir shape (["models", "diffusers", "Kolors", "text_encoder"]). An upload using the slash-joined shape returned by /api/assets raised "unknown model category" or "invalid path component". Fix: pre-split every entry after tags[0] on "/" so both shapes resolve identically. For models, the first expanded segment is the category and the rest are subdirs; for input/output the full expansion becomes the subdirs. 3. Within-batch tag order was lost bulk_ingest wrote every tag in a single batch with the same added_at = current_time. The retrieval ORDER BY added_at, tag_name then fell back to the tag_name tiebreaker, sorting the path-derived pair alphabetically — putting "checkpoints/..." ahead of "models" since "c" < "m". The tags[0] = root contract was lost on bulk- ingested rows. Fix: stagger added_at by microseconds per tag index within a reference so the retrieval order matches the input list order. Path-derived tags now consistently land in position-0 = root, position-1 = subpath. 
Tests - TestGetNameAndTagsFromAssetPath updated: subpath is now lowercase. - New TestResolveDestinationFromTags covers both tag shapes, the unknown-category case for slash-joined input, traversal rejection, and input/output paths. - Full suite: 333 passed, 10 pre-existing skipped.	2026-05-19 20:30:04 -07:00
Matt Miller	36f9a6fdef	feat(assets): preserve insertion order on tag retrieval The /api/assets response previously sorted tags alphabetically via .order_by(Tag.name.asc()). That breaks the structurally meaningful "root category first, then subpath" invariant the path-collapsing change relies on: alphabetical sort puts a custom user tag (or even the bare "models" root) at unpredictable positions, so positional access like tags[1] is not reliable on local. Cloud already preserves insertion order — its Ent WithTags() eager- load has no explicit ORDER BY, so Postgres returns rows in physical insertion order. Local's composite primary key on (asset_reference_id, tag_name) means SQLite walks the index in tag_name order even without an explicit ORDER BY, so just dropping the clause isn't enough. Switching to ORDER BY added_at ASC, tag_name ASC keeps the path tags inserted via set_reference_tags in their original order (microsecond-resolution timestamps disambiguate same-batch inserts; tag_name is a deterministic tiebreaker for the rare collision case). Custom tags added later via add_tags_to_reference land after the path tags in their own added_at bucket. Applies to both response-shaping queries: - list_references_page (GET /api/assets, tag_map join) - fetch_reference_asset_and_tags (GET /api/assets/{id}) Catalog/histogram queries in app/assets/database/queries/tags.py keep their alphabetical sort — those endpoints are listing all tags, not per-asset tags, and alphabetical is the right shape there.	2026-05-19 20:14:01 -07:00
Matt Miller	a0d1238829	Merge branch 'master' into matt/asset-tags-cloud-shape	2026-05-19 20:06:12 -07:00
Matt Miller	1688a5e262	Merge branch 'master' into matt/asset-tags-cloud-shape	2026-05-19 15:00:22 -07:00
Matt Miller	7ab346fc7b	chore(assets): drop unused normalize_tags import after subpath-collapse refactor normalize_tags lowercased every tag, which would have stripped case from the slash-joined subpath (e.g. "diffusers/Kolors/text_encoder" -> "diffusers/kolors/text_encoder") and broken consumer lookups keyed on the original-case path. The refactored implementation inlines a strip + dedup so the import is no longer needed.	2026-05-19 14:51:00 -07:00
Matt Miller	5b7288d700	feat(assets): collapse nested asset path into a single slash-joined tag The /api/assets response previously emitted one tag per parent directory between the root category and the filename. For nested categories like diffusers, this produced ["models", "diffusers", "Kolors", "text_encoder"] where consumers that look up a category via tags[1] would only see the top-level bucket name and miss the model-specific sub-path that uniquely identifies the component. This collapses the parent subpath into a single slash-joined tag so the result is ["models", "diffusers/Kolors/text_encoder"]. Consumers can now read tags[1] as a stable category identifier regardless of how deep the file lives in the bucket. Case is preserved on the subpath so providers keyed on the original-case path (e.g. "diffusers/Kolors/text_encoder") resolve correctly. Same shape applies uniformly: - input/foo.png -> ["input"] - output/00001.png -> ["output"] - models/checkpoints/flux.safetensors -> ["models", "checkpoints"] - models/diffusers/Kolors/text_encoder/m.sft -> ["models", "diffusers/Kolors/text_encoder"] - models/loras/my/custom/path/v1.safetensors -> ["models", "loras/my/custom/path"] Integration tests that filtered by individual subdirectory tags (`include_tags=unit-tests,scope`) updated to use the new slash-joined shape (`include_tags=unit-tests/scope`). Unit tests cover flat input, flat output, flat models, diffusers-style nested, and deep user-subpath cases.	2026-05-19 14:48:49 -07:00