mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-28 11:53:04 +08:00
Compare commits
1 Commits
unreviewed
...
feature/do
| Author | SHA1 | Date | |
|---|---|---|---|
| 2f7b77f341 |
24
.github/workflows/detect-unreviewed-merge.yml
vendored
24
.github/workflows/detect-unreviewed-merge.yml
vendored
@ -1,24 +0,0 @@
|
||||
name: Detect Unreviewed Merge
|
||||
|
||||
# SOC 2 compliance — reusable workflow lives in Comfy-Org/github-workflows,
|
||||
# tracking issues are filed in Comfy-Org/unreviewed-merges.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
|
||||
concurrency:
|
||||
group: detect-unreviewed-merge-${{ github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
|
||||
jobs:
|
||||
detect:
|
||||
uses: Comfy-Org/github-workflows/.github/workflows/detect-unreviewed-merge.yml@4d9cb6b87f953bb7cd69954280e1465fb9bd2040 # v1
|
||||
with:
|
||||
approval-mode: latest-per-reviewer
|
||||
secrets:
|
||||
UNREVIEWED_MERGES_TOKEN: ${{ secrets.UNREVIEWED_MERGES_TOKEN }}
|
||||
@ -93,13 +93,12 @@ def compute_relative_filename(file_path: str) -> str | None:
|
||||
|
||||
def get_asset_category_and_relative_path(
|
||||
file_path: str,
|
||||
) -> tuple[Literal["input", "output", "temp", "models"], str]:
|
||||
) -> tuple[Literal["input", "output", "models"], str]:
|
||||
"""Determine which root category a file path belongs to.
|
||||
|
||||
Categories:
|
||||
- 'input': under folder_paths.get_input_directory()
|
||||
- 'output': under folder_paths.get_output_directory()
|
||||
- 'temp': under folder_paths.get_temp_directory()
|
||||
- 'models': under any base path from get_comfy_models_folders()
|
||||
|
||||
Returns:
|
||||
@ -130,12 +129,7 @@ def get_asset_category_and_relative_path(
|
||||
if _check_is_within(fp_abs, output_base):
|
||||
return "output", _compute_relative(fp_abs, output_base)
|
||||
|
||||
# 3) temp
|
||||
temp_base = os.path.abspath(folder_paths.get_temp_directory())
|
||||
if _check_is_within(fp_abs, temp_base):
|
||||
return "temp", _compute_relative(fp_abs, temp_base)
|
||||
|
||||
# 4) models (check deepest matching base to avoid ambiguity)
|
||||
# 3) models (check deepest matching base to avoid ambiguity)
|
||||
best: tuple[int, str, str] | None = None # (base_len, bucket, rel_inside_bucket)
|
||||
for bucket, bases in get_comfy_models_folders():
|
||||
for b in bases:
|
||||
@ -152,7 +146,7 @@ def get_asset_category_and_relative_path(
|
||||
return "models", os.path.relpath(os.path.join(os.sep, combined), os.sep)
|
||||
|
||||
raise ValueError(
|
||||
f"Path is not within input, output, temp, or configured model bases: {file_path}"
|
||||
f"Path is not within input, output, or configured model bases: {file_path}"
|
||||
)
|
||||
|
||||
|
||||
|
||||
507
app/download_manager.py
Normal file
507
app/download_manager.py
Normal file
@ -0,0 +1,507 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Optional, TYPE_CHECKING
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
import aiohttp
|
||||
from yarl import URL
|
||||
|
||||
import folder_paths
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from server import PromptServer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALLOWED_HTTPS_HOSTS = frozenset({
|
||||
"huggingface.co",
|
||||
"cdn-lfs.huggingface.co",
|
||||
"cdn-lfs-us-1.huggingface.co",
|
||||
"cdn-lfs-eu-1.huggingface.co",
|
||||
"civitai.com",
|
||||
"api.civitai.com",
|
||||
})
|
||||
|
||||
ALLOWED_EXTENSIONS = frozenset({".safetensors", ".sft"})
|
||||
|
||||
MAX_CONCURRENT_DOWNLOADS = 3
|
||||
MAX_TERMINAL_TASKS = 50
|
||||
MAX_REDIRECTS = 10
|
||||
|
||||
DOWNLOAD_TEMP_SUFFIX = ".download_tmp"
|
||||
DOWNLOAD_META_SUFFIX = ".download_meta"
|
||||
|
||||
|
||||
class DownloadStatus(str, Enum):
|
||||
PENDING = "pending"
|
||||
DOWNLOADING = "downloading"
|
||||
PAUSED = "paused"
|
||||
COMPLETED = "completed"
|
||||
ERROR = "error"
|
||||
CANCELLED = "cancelled"
|
||||
|
||||
|
||||
ACTIVE_STATUSES = frozenset({
|
||||
DownloadStatus.PENDING,
|
||||
DownloadStatus.DOWNLOADING,
|
||||
DownloadStatus.PAUSED,
|
||||
})
|
||||
|
||||
TERMINAL_STATUSES = frozenset({
|
||||
DownloadStatus.COMPLETED,
|
||||
DownloadStatus.ERROR,
|
||||
DownloadStatus.CANCELLED,
|
||||
})
|
||||
|
||||
|
||||
@dataclass
|
||||
class DownloadTask:
|
||||
id: str
|
||||
url: str
|
||||
filename: str
|
||||
directory: str
|
||||
save_path: str
|
||||
temp_path: str
|
||||
meta_path: str
|
||||
status: DownloadStatus = DownloadStatus.PENDING
|
||||
progress: float = 0.0
|
||||
received_bytes: int = 0
|
||||
total_bytes: int = 0
|
||||
speed_bytes_per_sec: float = 0.0
|
||||
eta_seconds: float = 0.0
|
||||
error: Optional[str] = None
|
||||
created_at: float = field(default_factory=time.time)
|
||||
client_id: Optional[str] = None
|
||||
_worker: Optional[asyncio.Task] = field(default=None, repr=False)
|
||||
_stop_reason: Optional[str] = field(default=None, repr=False)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"id": self.id,
|
||||
"url": self.url,
|
||||
"filename": self.filename,
|
||||
"directory": self.directory,
|
||||
"status": self.status.value,
|
||||
"progress": self.progress,
|
||||
"received_bytes": self.received_bytes,
|
||||
"total_bytes": self.total_bytes,
|
||||
"speed_bytes_per_sec": self.speed_bytes_per_sec,
|
||||
"eta_seconds": self.eta_seconds,
|
||||
"error": self.error,
|
||||
"created_at": self.created_at,
|
||||
}
|
||||
|
||||
|
||||
class DownloadManager:
|
||||
def __init__(self, server: PromptServer):
|
||||
self.server = server
|
||||
self.tasks: dict[str, DownloadTask] = {}
|
||||
self._session: Optional[aiohttp.ClientSession] = None
|
||||
self._semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
|
||||
|
||||
async def _get_session(self) -> aiohttp.ClientSession:
|
||||
if self._session is None or self._session.closed:
|
||||
timeout = aiohttp.ClientTimeout(total=None, connect=30, sock_read=60)
|
||||
self._session = aiohttp.ClientSession(timeout=timeout)
|
||||
return self._session
|
||||
|
||||
async def close(self):
|
||||
workers = [t._worker for t in self.tasks.values() if t._worker and not t._worker.done()]
|
||||
for w in workers:
|
||||
w.cancel()
|
||||
if workers:
|
||||
await asyncio.gather(*workers, return_exceptions=True)
|
||||
if self._session and not self._session.closed:
|
||||
await self._session.close()
|
||||
|
||||
# -- Validation --
|
||||
|
||||
@staticmethod
|
||||
def _validate_url(url: str) -> Optional[str]:
|
||||
try:
|
||||
parts = urlsplit(url)
|
||||
except Exception:
|
||||
return "Invalid URL"
|
||||
|
||||
if parts.username or parts.password:
|
||||
return "Credentials in URL are not allowed"
|
||||
|
||||
host = (parts.hostname or "").lower()
|
||||
scheme = parts.scheme.lower()
|
||||
|
||||
if scheme != "https":
|
||||
return "Only HTTPS URLs are allowed"
|
||||
|
||||
if host not in ALLOWED_HTTPS_HOSTS:
|
||||
return f"Host '{host}' is not in the allowed list"
|
||||
|
||||
if parts.port not in (None, 443):
|
||||
return "Custom ports are not allowed for remote downloads"
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _validate_filename(filename: str) -> Optional[str]:
|
||||
if not filename:
|
||||
return "Filename must not be empty"
|
||||
ext = os.path.splitext(filename)[1].lower()
|
||||
if ext not in ALLOWED_EXTENSIONS:
|
||||
return f"File extension '{ext}' not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
|
||||
if os.path.sep in filename or (os.path.altsep and os.path.altsep in filename):
|
||||
return "Filename must not contain path separators"
|
||||
if ".." in filename:
|
||||
return "Filename must not contain '..'"
|
||||
for ch in filename:
|
||||
if ord(ch) < 32:
|
||||
return "Filename must not contain control characters"
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _validate_directory(directory: str) -> Optional[str]:
|
||||
if directory not in folder_paths.folder_names_and_paths:
|
||||
valid = ', '.join(sorted(folder_paths.folder_names_and_paths.keys()))
|
||||
return f"Unknown model directory '{directory}'. Valid directories: {valid}"
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _resolve_save_path(directory: str, filename: str) -> tuple[str, str, str]:
|
||||
"""Returns (save_path, temp_path, meta_path) for a download."""
|
||||
paths = folder_paths.folder_names_and_paths[directory][0]
|
||||
base_dir = paths[0]
|
||||
os.makedirs(base_dir, exist_ok=True)
|
||||
|
||||
save_path = os.path.join(base_dir, filename)
|
||||
temp_path = save_path + DOWNLOAD_TEMP_SUFFIX
|
||||
meta_path = save_path + DOWNLOAD_META_SUFFIX
|
||||
|
||||
real_save = os.path.realpath(save_path)
|
||||
real_base = os.path.realpath(base_dir)
|
||||
if os.path.commonpath([real_save, real_base]) != real_base:
|
||||
raise ValueError("Resolved path escapes the model directory")
|
||||
|
||||
return save_path, temp_path, meta_path
|
||||
|
||||
# -- Sidecar metadata for resume validation --
|
||||
|
||||
@staticmethod
|
||||
def _write_meta(meta_path: str, url: str, task_id: str):
|
||||
try:
|
||||
with open(meta_path, "w") as f:
|
||||
json.dump({"url": url, "task_id": task_id}, f)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _read_meta(meta_path: str) -> Optional[dict]:
|
||||
try:
|
||||
with open(meta_path, "r") as f:
|
||||
return json.load(f)
|
||||
except (OSError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _cleanup_files(*paths: str):
|
||||
for p in paths:
|
||||
try:
|
||||
if os.path.exists(p):
|
||||
os.remove(p)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# -- Task management --
|
||||
|
||||
def _prune_terminal_tasks(self):
|
||||
terminal = [
|
||||
(tid, t) for tid, t in self.tasks.items()
|
||||
if t.status in TERMINAL_STATUSES
|
||||
]
|
||||
if len(terminal) > MAX_TERMINAL_TASKS:
|
||||
terminal.sort(key=lambda x: x[1].created_at)
|
||||
to_remove = len(terminal) - MAX_TERMINAL_TASKS
|
||||
for tid, _ in terminal[:to_remove]:
|
||||
del self.tasks[tid]
|
||||
|
||||
async def start_download(
|
||||
self, url: str, directory: str, filename: str, client_id: Optional[str] = None
|
||||
) -> tuple[Optional[DownloadTask], Optional[str]]:
|
||||
err = self._validate_url(url)
|
||||
if err:
|
||||
return None, err
|
||||
|
||||
err = self._validate_filename(filename)
|
||||
if err:
|
||||
return None, err
|
||||
|
||||
err = self._validate_directory(directory)
|
||||
if err:
|
||||
return None, err
|
||||
|
||||
try:
|
||||
save_path, temp_path, meta_path = self._resolve_save_path(directory, filename)
|
||||
except ValueError as e:
|
||||
return None, str(e)
|
||||
|
||||
if os.path.exists(save_path):
|
||||
return None, f"File already exists: {directory}/{filename}"
|
||||
|
||||
# Reject duplicate active download by URL
|
||||
for task in self.tasks.values():
|
||||
if task.url == url and task.status in ACTIVE_STATUSES:
|
||||
return None, f"Download already in progress for this URL (id: {task.id})"
|
||||
|
||||
# Reject duplicate active download by destination path (#4)
|
||||
for task in self.tasks.values():
|
||||
if task.save_path == save_path and task.status in ACTIVE_STATUSES:
|
||||
return None, f"Download already in progress for {directory}/{filename} (id: {task.id})"
|
||||
|
||||
# Clean stale temp/meta if no active task owns them (#9)
|
||||
existing_meta = self._read_meta(meta_path)
|
||||
if existing_meta:
|
||||
owning_task = self.tasks.get(existing_meta.get("task_id", ""))
|
||||
if not owning_task or owning_task.status in TERMINAL_STATUSES:
|
||||
if existing_meta.get("url") != url:
|
||||
self._cleanup_files(temp_path, meta_path)
|
||||
|
||||
task = DownloadTask(
|
||||
id=uuid.uuid4().hex[:12],
|
||||
url=url,
|
||||
filename=filename,
|
||||
directory=directory,
|
||||
save_path=save_path,
|
||||
temp_path=temp_path,
|
||||
meta_path=meta_path,
|
||||
client_id=client_id,
|
||||
)
|
||||
self.tasks[task.id] = task
|
||||
self._prune_terminal_tasks()
|
||||
|
||||
task._worker = asyncio.create_task(self._run_download(task))
|
||||
return task, None
|
||||
|
||||
# -- Redirect-safe fetch (#1, #2, #3) --
|
||||
|
||||
async def _fetch_with_validated_redirects(
|
||||
self, session: aiohttp.ClientSession, url: str, headers: dict
|
||||
) -> aiohttp.ClientResponse:
|
||||
"""Follow redirects manually, validating each hop against the allowlist."""
|
||||
current_url = url
|
||||
for _ in range(MAX_REDIRECTS + 1):
|
||||
resp = await session.get(current_url, headers=headers, allow_redirects=False)
|
||||
if resp.status not in (301, 302, 303, 307, 308):
|
||||
return resp
|
||||
|
||||
location = resp.headers.get("Location")
|
||||
await resp.release()
|
||||
if not location:
|
||||
raise ValueError("Redirect without Location header")
|
||||
|
||||
resolved = URL(current_url).join(URL(location))
|
||||
current_url = str(resolved)
|
||||
|
||||
# Validate the redirect target host
|
||||
parts = urlsplit(current_url)
|
||||
host = (parts.hostname or "").lower()
|
||||
scheme = parts.scheme.lower()
|
||||
|
||||
if scheme != "https":
|
||||
raise ValueError(f"Redirect to non-HTTPS URL: {current_url}")
|
||||
if host not in ALLOWED_HTTPS_HOSTS:
|
||||
# Allow CDN hosts that HuggingFace/CivitAI commonly redirect to
|
||||
raise ValueError(f"Redirect to disallowed host: {host}")
|
||||
|
||||
# 303 means GET with no Range
|
||||
if resp.status == 303:
|
||||
headers = {k: v for k, v in headers.items() if k.lower() != "range"}
|
||||
|
||||
raise ValueError(f"Too many redirects (>{MAX_REDIRECTS})")
|
||||
|
||||
# -- Download worker --
|
||||
|
||||
async def _run_download(self, task: DownloadTask):
|
||||
try:
|
||||
async with self._semaphore:
|
||||
await self._run_download_inner(task)
|
||||
except asyncio.CancelledError:
|
||||
if task._stop_reason == "pause":
|
||||
task.status = DownloadStatus.PAUSED
|
||||
task.speed_bytes_per_sec = 0
|
||||
task.eta_seconds = 0
|
||||
await self._send_progress(task)
|
||||
else:
|
||||
task.status = DownloadStatus.CANCELLED
|
||||
await self._send_progress(task)
|
||||
self._cleanup_files(task.temp_path, task.meta_path)
|
||||
except Exception as e:
|
||||
task.status = DownloadStatus.ERROR
|
||||
task.error = str(e)
|
||||
await self._send_progress(task)
|
||||
logger.exception("Download error for %s", task.url)
|
||||
|
||||
async def _run_download_inner(self, task: DownloadTask):
|
||||
session = await self._get_session()
|
||||
headers = {}
|
||||
|
||||
# Resume support with sidecar validation (#9)
|
||||
if os.path.exists(task.temp_path):
|
||||
meta = self._read_meta(task.meta_path)
|
||||
if meta and meta.get("url") == task.url:
|
||||
existing_size = os.path.getsize(task.temp_path)
|
||||
if existing_size > 0:
|
||||
headers["Range"] = f"bytes={existing_size}-"
|
||||
task.received_bytes = existing_size
|
||||
else:
|
||||
self._cleanup_files(task.temp_path, task.meta_path)
|
||||
|
||||
self._write_meta(task.meta_path, task.url, task.id)
|
||||
task.status = DownloadStatus.DOWNLOADING
|
||||
await self._send_progress(task)
|
||||
|
||||
resp = await self._fetch_with_validated_redirects(session, task.url, headers)
|
||||
try:
|
||||
if resp.status == 416:
|
||||
content_range = resp.headers.get("Content-Range", "")
|
||||
if content_range:
|
||||
total_str = content_range.split("/")[-1]
|
||||
if total_str != "*":
|
||||
total = int(total_str)
|
||||
if task.received_bytes >= total:
|
||||
if not os.path.exists(task.save_path):
|
||||
os.rename(task.temp_path, task.save_path)
|
||||
self._cleanup_files(task.meta_path)
|
||||
task.status = DownloadStatus.COMPLETED
|
||||
task.progress = 1.0
|
||||
task.total_bytes = total
|
||||
await self._send_progress(task)
|
||||
return
|
||||
raise ValueError(f"HTTP 416 Range Not Satisfiable")
|
||||
|
||||
if resp.status not in (200, 206):
|
||||
task.status = DownloadStatus.ERROR
|
||||
task.error = f"HTTP {resp.status}"
|
||||
await self._send_progress(task)
|
||||
return
|
||||
|
||||
if resp.status == 200:
|
||||
task.received_bytes = 0
|
||||
|
||||
content_length = resp.content_length
|
||||
if resp.status == 206 and content_length:
|
||||
task.total_bytes = task.received_bytes + content_length
|
||||
elif resp.status == 200 and content_length:
|
||||
task.total_bytes = content_length
|
||||
|
||||
mode = "ab" if resp.status == 206 else "wb"
|
||||
speed_window_start = time.monotonic()
|
||||
speed_window_bytes = 0
|
||||
last_progress_time = 0.0
|
||||
|
||||
with open(task.temp_path, mode) as f:
|
||||
async for chunk in resp.content.iter_chunked(1024 * 64):
|
||||
f.write(chunk)
|
||||
task.received_bytes += len(chunk)
|
||||
speed_window_bytes += len(chunk)
|
||||
|
||||
now = time.monotonic()
|
||||
elapsed = now - speed_window_start
|
||||
if elapsed > 0.5:
|
||||
task.speed_bytes_per_sec = speed_window_bytes / elapsed
|
||||
if task.total_bytes > 0 and task.speed_bytes_per_sec > 0:
|
||||
remaining = task.total_bytes - task.received_bytes
|
||||
task.eta_seconds = remaining / task.speed_bytes_per_sec
|
||||
speed_window_start = now
|
||||
speed_window_bytes = 0
|
||||
|
||||
if task.total_bytes > 0:
|
||||
task.progress = task.received_bytes / task.total_bytes
|
||||
|
||||
if now - last_progress_time >= 0.25:
|
||||
await self._send_progress(task)
|
||||
last_progress_time = now
|
||||
finally:
|
||||
resp.release()
|
||||
|
||||
# Final cancel check before committing (#7)
|
||||
if task._stop_reason is not None:
|
||||
raise asyncio.CancelledError()
|
||||
|
||||
# Re-check destination before finalizing (#10)
|
||||
if os.path.exists(task.save_path):
|
||||
task.status = DownloadStatus.ERROR
|
||||
task.error = f"Destination file appeared during download: {task.directory}/{task.filename}"
|
||||
await self._send_progress(task)
|
||||
return
|
||||
|
||||
os.replace(task.temp_path, task.save_path)
|
||||
self._cleanup_files(task.meta_path)
|
||||
task.status = DownloadStatus.COMPLETED
|
||||
task.progress = 1.0
|
||||
task.speed_bytes_per_sec = 0
|
||||
task.eta_seconds = 0
|
||||
await self._send_progress(task)
|
||||
logger.info("Download complete: %s/%s", task.directory, task.filename)
|
||||
|
||||
# -- Progress (#8, #14) --
|
||||
|
||||
async def _send_progress(self, task: DownloadTask):
|
||||
try:
|
||||
self.server.send_sync("download_progress", task.to_dict(), task.client_id)
|
||||
except Exception:
|
||||
logger.exception("Failed to send download progress event")
|
||||
|
||||
# -- Control operations (#5, #6, #13) --
|
||||
|
||||
def pause_download(self, task_id: str) -> Optional[str]:
|
||||
task = self.tasks.get(task_id)
|
||||
if not task:
|
||||
return "Download not found"
|
||||
if task.status not in (DownloadStatus.PENDING, DownloadStatus.DOWNLOADING):
|
||||
return f"Cannot pause download in state '{task.status.value}'"
|
||||
task._stop_reason = "pause"
|
||||
if task._worker and not task._worker.done():
|
||||
task._worker.cancel()
|
||||
return None
|
||||
|
||||
def resume_download(self, task_id: str) -> Optional[str]:
|
||||
task = self.tasks.get(task_id)
|
||||
if not task:
|
||||
return "Download not found"
|
||||
if task.status != DownloadStatus.PAUSED:
|
||||
return f"Cannot resume download in state '{task.status.value}'"
|
||||
task._stop_reason = None
|
||||
task.status = DownloadStatus.PENDING
|
||||
task._worker = asyncio.create_task(self._run_download(task))
|
||||
return None
|
||||
|
||||
def cancel_download(self, task_id: str) -> Optional[str]:
|
||||
task = self.tasks.get(task_id)
|
||||
if not task:
|
||||
return "Download not found"
|
||||
if task.status in TERMINAL_STATUSES:
|
||||
return f"Cannot cancel download in state '{task.status.value}'"
|
||||
task._stop_reason = "cancel"
|
||||
if task._worker and not task._worker.done():
|
||||
task._worker.cancel()
|
||||
else:
|
||||
task.status = DownloadStatus.CANCELLED
|
||||
self._cleanup_files(task.temp_path, task.meta_path)
|
||||
return None
|
||||
|
||||
# -- Query --
|
||||
|
||||
def get_all_tasks(self, client_id: Optional[str] = None) -> list[dict]:
|
||||
tasks = self.tasks.values()
|
||||
if client_id is not None:
|
||||
tasks = [t for t in tasks if t.client_id == client_id]
|
||||
return [t.to_dict() for t in tasks]
|
||||
|
||||
def get_task(self, task_id: str) -> Optional[dict]:
|
||||
task = self.tasks.get(task_id)
|
||||
return task.to_dict() if task else None
|
||||
@ -224,6 +224,8 @@ parser.add_argument("--user-directory", type=is_valid_directory, default=None, h
|
||||
|
||||
parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")
|
||||
|
||||
parser.add_argument("--enable-download-api", action="store_true", help="Enable the model download API. When set, ComfyUI exposes endpoints that allow downloading model files directly into the models directory. Only HTTPS downloads from allowed hosts (huggingface.co, civitai.com) are permitted.")
|
||||
|
||||
parser.add_argument(
|
||||
"--comfy-api-base",
|
||||
type=str,
|
||||
|
||||
@ -928,7 +928,6 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
weight = state_dict.pop(weight_key, None)
|
||||
if weight is None:
|
||||
logging.warning(f"Missing weight for layer {layer_name}")
|
||||
self.weight = None
|
||||
return
|
||||
|
||||
manually_loaded_keys = [weight_key]
|
||||
@ -1035,9 +1034,6 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if self.bias is not None:
|
||||
sd["{}bias".format(prefix)] = self.bias
|
||||
|
||||
if self.weight is None:
|
||||
return sd
|
||||
|
||||
if isinstance(self.weight, QuantizedTensor):
|
||||
sd_out = self.weight.state_dict("{}weight".format(prefix))
|
||||
for k in sd_out:
|
||||
|
||||
33
comfy/sd.py
33
comfy/sd.py
@ -61,7 +61,6 @@ import comfy.text_encoders.newbie
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
@ -426,13 +425,13 @@ class CLIP:
|
||||
def get_key_patches(self):
|
||||
return self.patcher.get_key_patches()
|
||||
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None, presence_penalty=0.0):
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None):
|
||||
self.cond_stage_model.reset_clip_options()
|
||||
|
||||
self.load_model(tokens)
|
||||
self.cond_stage_model.set_clip_options({"layer": None})
|
||||
self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
|
||||
return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty)
|
||||
return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed)
|
||||
|
||||
def decode(self, token_ids, skip_special_tokens=True):
|
||||
return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
@ -1229,11 +1228,6 @@ class TEModel(Enum):
|
||||
QWEN3_8B = 20
|
||||
QWEN3_06B = 21
|
||||
GEMMA_3_4B_VISION = 22
|
||||
QWEN35_08B = 23
|
||||
QWEN35_2B = 24
|
||||
QWEN35_4B = 25
|
||||
QWEN35_9B = 26
|
||||
QWEN35_27B = 27
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@ -1273,17 +1267,6 @@ def detect_te_model(sd):
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
|
||||
weight = sd['model.language_model.layers.0.input_layernorm.weight']
|
||||
if weight.shape[0] == 1024:
|
||||
return TEModel.QWEN35_08B
|
||||
if weight.shape[0] == 2560:
|
||||
return TEModel.QWEN35_4B
|
||||
if weight.shape[0] == 4096:
|
||||
return TEModel.QWEN35_9B
|
||||
if weight.shape[0] == 5120:
|
||||
return TEModel.QWEN35_27B
|
||||
return TEModel.QWEN35_2B
|
||||
if "model.layers.0.post_attention_layernorm.weight" in sd:
|
||||
weight = sd['model.layers.0.post_attention_layernorm.weight']
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
@ -1316,12 +1299,11 @@ def t5xxl_detect(clip_data):
|
||||
return {}
|
||||
|
||||
def llama_detect(clip_data):
|
||||
weight_names = ["model.layers.0.self_attn.k_proj.weight", "model.layers.0.linear_attn.in_proj_a.weight"]
|
||||
weight_name = "model.layers.0.self_attn.k_proj.weight"
|
||||
|
||||
for sd in clip_data:
|
||||
for weight_name in weight_names:
|
||||
if weight_name in sd:
|
||||
return comfy.text_encoders.hunyuan_video.llama_detect(sd)
|
||||
if weight_name in sd:
|
||||
return comfy.text_encoders.hunyuan_video.llama_detect(sd)
|
||||
|
||||
return {}
|
||||
|
||||
@ -1449,11 +1431,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
elif te_model == TEModel.JINA_CLIP_2:
|
||||
clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
|
||||
clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
|
||||
elif te_model in (TEModel.QWEN35_08B, TEModel.QWEN35_2B, TEModel.QWEN35_4B, TEModel.QWEN35_9B, TEModel.QWEN35_27B):
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
qwen35_type = {TEModel.QWEN35_08B: "qwen35_08b", TEModel.QWEN35_2B: "qwen35_2b", TEModel.QWEN35_4B: "qwen35_4b", TEModel.QWEN35_9B: "qwen35_9b", TEModel.QWEN35_27B: "qwen35_27b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
|
||||
elif te_model == TEModel.QWEN3_06B:
|
||||
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
|
||||
|
||||
@ -308,14 +308,14 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
def load_sd(self, sd):
|
||||
return self.transformer.load_state_dict(sd, strict=False, assign=getattr(self, "can_assign_sd", False))
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
|
||||
if isinstance(tokens, dict):
|
||||
tokens_only = next(iter(tokens.values())) # todo: get this better?
|
||||
else:
|
||||
tokens_only = tokens
|
||||
tokens_only = [[t[0] for t in b] for b in tokens_only]
|
||||
embeds = self.process_tokens(tokens_only, device=self.execution_device)[0]
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=presence_penalty)
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed)
|
||||
|
||||
def parse_parentheses(string):
|
||||
result = []
|
||||
@ -740,5 +740,5 @@ class SD1ClipModel(torch.nn.Module):
|
||||
def load_sd(self, sd):
|
||||
return getattr(self, self.clip).load_sd(sd)
|
||||
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None, presence_penalty=0.0):
|
||||
return getattr(self, self.clip).generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty)
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None):
|
||||
return getattr(self, self.clip).generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed)
|
||||
|
||||
@ -224,7 +224,7 @@ class Qwen3_8BConfig:
|
||||
k_norm = "gemma3"
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
lm_head: bool = True
|
||||
lm_head: bool = False
|
||||
stop_tokens = [151643, 151645]
|
||||
|
||||
@dataclass
|
||||
@ -655,17 +655,6 @@ class Llama2_(nn.Module):
|
||||
if config.lm_head:
|
||||
self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
def get_past_len(self, past_key_values):
|
||||
return past_key_values[0][2]
|
||||
|
||||
def compute_freqs_cis(self, position_ids, device):
|
||||
return precompute_freqs_cis(self.config.head_dim,
|
||||
position_ids,
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
device=device)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None):
|
||||
if embeds is not None:
|
||||
x = embeds
|
||||
@ -678,12 +667,17 @@ class Llama2_(nn.Module):
|
||||
seq_len = x.shape[1]
|
||||
past_len = 0
|
||||
if past_key_values is not None and len(past_key_values) > 0:
|
||||
past_len = self.get_past_len(past_key_values)
|
||||
past_len = past_key_values[0][2]
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(past_len, past_len + seq_len, device=x.device).unsqueeze(0)
|
||||
|
||||
freqs_cis = self.compute_freqs_cis(position_ids, x.device)
|
||||
freqs_cis = precompute_freqs_cis(self.config.head_dim,
|
||||
position_ids,
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
device=x.device)
|
||||
|
||||
mask = None
|
||||
if attention_mask is not None:
|
||||
@ -818,16 +812,9 @@ class BaseGenerate:
|
||||
comfy.ops.uncast_bias_weight(module, weight, None, offload_stream)
|
||||
return x
|
||||
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
|
||||
model_config = self.model.config
|
||||
past_key_values = []
|
||||
for x in range(model_config.num_hidden_layers):
|
||||
past_key_values.append((torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
|
||||
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
|
||||
return past_key_values
|
||||
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0):
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0):
|
||||
device = embeds.device
|
||||
model_config = self.model.config
|
||||
|
||||
if stop_tokens is None:
|
||||
stop_tokens = self.model.config.stop_tokens
|
||||
@ -842,8 +829,11 @@ class BaseGenerate:
|
||||
if embeds.ndim == 2:
|
||||
embeds = embeds.unsqueeze(0)
|
||||
|
||||
past_key_values = [] #kv_cache init
|
||||
max_cache_len = embeds.shape[1] + max_length
|
||||
past_key_values = self.init_kv_cache(embeds.shape[0], max_cache_len, device, execution_dtype)
|
||||
for x in range(model_config.num_hidden_layers):
|
||||
past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
|
||||
torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(seed) if do_sample else None
|
||||
|
||||
@ -854,7 +844,7 @@ class BaseGenerate:
|
||||
for step in tqdm(range(max_length), desc="Generating tokens"):
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values)
|
||||
logits = self.logits(x)[:, -1]
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample)
|
||||
token_id = next_token[0].item()
|
||||
generated_token_ids.append(token_id)
|
||||
|
||||
@ -866,7 +856,7 @@ class BaseGenerate:
|
||||
|
||||
return generated_token_ids
|
||||
|
||||
def sample_token(self, logits, temperature, top_k, top_p, min_p, repetition_penalty, token_history, generator, do_sample=True, presence_penalty=0.0):
|
||||
def sample_token(self, logits, temperature, top_k, top_p, min_p, repetition_penalty, token_history, generator, do_sample=True):
|
||||
|
||||
if not do_sample or temperature == 0.0:
|
||||
return torch.argmax(logits, dim=-1, keepdim=True)
|
||||
@ -877,11 +867,6 @@ class BaseGenerate:
|
||||
for token_id in set(token_history):
|
||||
logits[i, token_id] *= repetition_penalty if logits[i, token_id] < 0 else 1/repetition_penalty
|
||||
|
||||
if presence_penalty is not None and presence_penalty != 0.0:
|
||||
for i in range(logits.shape[0]):
|
||||
for token_id in set(token_history):
|
||||
logits[i, token_id] -= presence_penalty
|
||||
|
||||
if temperature != 1.0:
|
||||
logits = logits / temperature
|
||||
|
||||
@ -912,9 +897,6 @@ class BaseGenerate:
|
||||
class BaseQwen3:
|
||||
def logits(self, x):
|
||||
input = x[:, -1:]
|
||||
if self.model.config.lm_head:
|
||||
return self.model.lm_head(input)
|
||||
|
||||
module = self.model.embed_tokens
|
||||
|
||||
offload_stream = None
|
||||
|
||||
@ -91,11 +91,11 @@ class Gemma3_12BModel(sd1_clip.SDClipModel):
|
||||
self.dtypes.add(dtype)
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_12B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty):
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
|
||||
tokens_only = [[t[0] for t in b] for b in tokens]
|
||||
embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
|
||||
comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106], presence_penalty=presence_penalty) # 106 is <end_of_turn>
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106]) # 106 is <end_of_turn>
|
||||
|
||||
class DualLinearProjection(torch.nn.Module):
|
||||
def __init__(self, in_dim, out_dim_video, out_dim_audio, dtype=None, device=None, operations=None):
|
||||
@ -189,8 +189,8 @@ class LTXAVTEModel(torch.nn.Module):
|
||||
|
||||
return out.to(device=out_device, dtype=torch.float), pooled, extra
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty):
|
||||
return self.gemma3_12b.generate(tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty)
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
|
||||
return self.gemma3_12b.generate(tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed)
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "model.layers.47.self_attn.q_norm.weight" in sd:
|
||||
|
||||
@ -1,833 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from dataclasses import dataclass, field
|
||||
import os
|
||||
import math
|
||||
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.qwen_vl
|
||||
|
||||
from .llama import BaseLlama, BaseGenerate, Llama2_, MLP, RMSNorm, apply_rope
|
||||
|
||||
|
||||
def _qwen35_layer_types(n):
|
||||
return [("full_attention" if (i + 1) % 4 == 0 else "linear_attention") for i in range(n)]
|
||||
|
||||
@dataclass
|
||||
class Qwen35Config:
|
||||
vocab_size: int = 248320
|
||||
hidden_size: int = 2048
|
||||
intermediate_size: int = 6144
|
||||
num_hidden_layers: int = 24
|
||||
# Full attention params
|
||||
num_attention_heads: int = 8
|
||||
num_key_value_heads: int = 2
|
||||
head_dim: int = 256
|
||||
partial_rotary_factor: float = 0.25
|
||||
# Linear attention (DeltaNet) params
|
||||
linear_num_key_heads: int = 16
|
||||
linear_num_value_heads: int = 16
|
||||
linear_key_head_dim: int = 128
|
||||
linear_value_head_dim: int = 128
|
||||
conv_kernel_size: int = 4
|
||||
# Shared params
|
||||
max_position_embeddings: int = 32768
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta: float = 10000000.0
|
||||
mrope_section: list = field(default_factory=lambda: [11, 11, 10])
|
||||
layer_types: list = field(default_factory=lambda: _qwen35_layer_types(24))
|
||||
rms_norm_add: bool = True
|
||||
mlp_activation: str = "silu"
|
||||
qkv_bias: bool = False
|
||||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
stop_tokens: list = field(default_factory=lambda: [248044, 248046])
|
||||
# These are needed for BaseLlama/BaseGenerate compatibility but unused directly
|
||||
transformer_type: str = "qwen35_2b"
|
||||
rope_dims: list = None
|
||||
rope_scale: float = None
|
||||
|
||||
QWEN35_VISION_DEFAULTS = dict(hidden_size=1024, num_heads=16, intermediate_size=4096, depth=24, patch_size=16, temporal_patch_size=2, in_channels=3, spatial_merge_size=2, num_position_embeddings=2304)
|
||||
|
||||
QWEN35_MODELS = {
|
||||
"qwen35_08b": dict(hidden_size=1024, intermediate_size=3584, vision=dict(hidden_size=768, num_heads=12, intermediate_size=3072, depth=12)),
|
||||
"qwen35_2b": dict(hidden_size=2048, intermediate_size=6144, num_hidden_layers=24, num_attention_heads=8, num_key_value_heads=2, linear_num_value_heads=16),
|
||||
"qwen35_4b": dict(hidden_size=2560, intermediate_size=9216, num_hidden_layers=32, num_attention_heads=16, num_key_value_heads=4, linear_num_value_heads=32),
|
||||
"qwen35_9b": dict(hidden_size=4096, intermediate_size=12288, num_hidden_layers=32, num_attention_heads=16, num_key_value_heads=4, linear_num_value_heads=32, lm_head=True, vision=dict(hidden_size=1152, intermediate_size=4304, depth=27)),
|
||||
"qwen35_27b": dict(hidden_size=5120, intermediate_size=17408, num_hidden_layers=64, num_attention_heads=24, num_key_value_heads=4, linear_num_value_heads=48, lm_head=True, vision=dict(hidden_size=1152, intermediate_size=4304, depth=27)),
|
||||
}
|
||||
|
||||
|
||||
def _make_config(model_type, config_dict={}):
|
||||
overrides = QWEN35_MODELS.get(model_type, {}).copy()
|
||||
overrides.pop("vision", None)
|
||||
if "num_hidden_layers" in overrides:
|
||||
overrides["layer_types"] = _qwen35_layer_types(overrides["num_hidden_layers"])
|
||||
overrides.update(config_dict)
|
||||
return Qwen35Config(**overrides)
|
||||
|
||||
|
||||
class RMSNormGated(RMSNorm):
|
||||
def forward(self, x, gate):
|
||||
return super().forward(x) * F.silu(gate.to(x.dtype))
|
||||
|
||||
def torch_chunk_gated_delta_rule(query, key, value, g, beta, chunk_size=64, initial_state=None, output_final_state=False):
|
||||
initial_dtype = query.dtype
|
||||
query = F.normalize(query, dim=-1)
|
||||
key = F.normalize(key, dim=-1)
|
||||
query, key, value, beta, g = [x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)]
|
||||
|
||||
batch_size, num_heads, sequence_length, k_head_dim = key.shape
|
||||
v_head_dim = value.shape[-1]
|
||||
pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size
|
||||
query = F.pad(query, (0, 0, 0, pad_size))
|
||||
key = F.pad(key, (0, 0, 0, pad_size))
|
||||
value = F.pad(value, (0, 0, 0, pad_size))
|
||||
beta = F.pad(beta, (0, pad_size))
|
||||
g = F.pad(g, (0, pad_size))
|
||||
total_sequence_length = sequence_length + pad_size
|
||||
scale = 1 / (query.shape[-1] ** 0.5)
|
||||
query = query * scale
|
||||
|
||||
v_beta = value * beta.unsqueeze(-1)
|
||||
k_beta = key * beta.unsqueeze(-1)
|
||||
query, key, value, k_beta, v_beta = [x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1]) for x in (query, key, value, k_beta, v_beta)]
|
||||
g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
|
||||
mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0)
|
||||
|
||||
g = g.cumsum(dim=-1)
|
||||
decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()
|
||||
attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
|
||||
for i in range(1, chunk_size):
|
||||
row = attn[..., i, :i].clone()
|
||||
sub = attn[..., :i, :i].clone()
|
||||
attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
|
||||
attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
|
||||
value = attn @ v_beta
|
||||
k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
|
||||
last_recurrent_state = (
|
||||
torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value)
|
||||
if initial_state is None
|
||||
else initial_state.to(value)
|
||||
)
|
||||
core_attn_out = torch.zeros_like(value)
|
||||
mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1)
|
||||
|
||||
for i in range(0, total_sequence_length // chunk_size):
|
||||
q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
|
||||
attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
|
||||
v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
|
||||
v_new = v_i - v_prime
|
||||
attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
|
||||
core_attn_out[:, :, i] = attn_inter + attn @ v_new
|
||||
last_recurrent_state = (
|
||||
last_recurrent_state * g[:, :, i, -1, None, None].exp()
|
||||
+ (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
|
||||
)
|
||||
|
||||
if not output_final_state:
|
||||
last_recurrent_state = None
|
||||
core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
|
||||
core_attn_out = core_attn_out[:, :, :sequence_length]
|
||||
core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
|
||||
return core_attn_out, last_recurrent_state
|
||||
|
||||
|
||||
def torch_causal_conv1d_update(x, conv_state, weight, bias=None):
|
||||
# conv_state: [B, channels, kernel_size-1], x: [B, channels, 1]
|
||||
# weight: [channels, kernel_size]
|
||||
state_len = conv_state.shape[-1]
|
||||
combined = torch.cat([conv_state, x], dim=-1).to(weight.dtype) # [B, channels, kernel_size]
|
||||
conv_state.copy_(combined[:, :, -state_len:])
|
||||
out = (combined * weight).sum(dim=-1, keepdim=True) # [B, channels, 1]
|
||||
if bias is not None:
|
||||
out = out + bias.unsqueeze(0).unsqueeze(-1)
|
||||
return F.silu(out).to(x.dtype)
|
||||
|
||||
|
||||
# GatedDeltaNet - Linear Attention Layer
|
||||
|
||||
class GatedDeltaNet(nn.Module):
|
||||
def __init__(self, config, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
hidden = config.hidden_size
|
||||
self.num_key_heads = config.linear_num_key_heads
|
||||
self.num_value_heads = config.linear_num_value_heads
|
||||
self.key_head_dim = config.linear_key_head_dim
|
||||
self.value_head_dim = config.linear_value_head_dim
|
||||
self.conv_kernel_size = config.conv_kernel_size
|
||||
|
||||
key_dim = self.num_key_heads * self.key_head_dim
|
||||
value_dim = self.num_value_heads * self.value_head_dim
|
||||
self.key_dim = key_dim
|
||||
self.value_dim = value_dim
|
||||
conv_dim = key_dim * 2 + value_dim
|
||||
|
||||
self.in_proj_qkv = ops.Linear(hidden, conv_dim, bias=False, device=device, dtype=dtype)
|
||||
self.in_proj_z = ops.Linear(hidden, value_dim, bias=False, device=device, dtype=dtype)
|
||||
self.in_proj_b = ops.Linear(hidden, self.num_value_heads, bias=False, device=device, dtype=dtype)
|
||||
self.in_proj_a = ops.Linear(hidden, self.num_value_heads, bias=False, device=device, dtype=dtype)
|
||||
self.out_proj = ops.Linear(value_dim, hidden, bias=False, device=device, dtype=dtype)
|
||||
|
||||
self.dt_bias = nn.Parameter(torch.empty(self.num_value_heads, device=device, dtype=dtype))
|
||||
self.A_log = nn.Parameter(torch.empty(self.num_value_heads, device=device, dtype=dtype))
|
||||
|
||||
self.conv1d = ops.Conv1d(in_channels=conv_dim, out_channels=conv_dim, bias=False, kernel_size=self.conv_kernel_size,
|
||||
groups=conv_dim, padding=self.conv_kernel_size - 1, device=device, dtype=dtype)
|
||||
|
||||
self.norm = RMSNormGated(self.value_head_dim, eps=config.rms_norm_eps, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, past_key_value=None, **kwargs):
|
||||
batch_size, seq_len, _ = x.shape
|
||||
|
||||
use_recurrent = (
|
||||
past_key_value is not None
|
||||
and past_key_value[2] > 0
|
||||
and seq_len == 1
|
||||
)
|
||||
|
||||
# Projections (shared)
|
||||
mixed_qkv = self.in_proj_qkv(x).transpose(1, 2) # [B, conv_dim, seq_len]
|
||||
z = self.in_proj_z(x)
|
||||
b = self.in_proj_b(x)
|
||||
a = self.in_proj_a(x)
|
||||
|
||||
# Conv1d
|
||||
if use_recurrent:
|
||||
recurrent_state, conv_state, step_index = past_key_value
|
||||
conv_weight = comfy.model_management.cast_to_device(self.conv1d.weight, mixed_qkv.device, mixed_qkv.dtype).squeeze(1)
|
||||
conv_bias = comfy.model_management.cast_to_device(self.conv1d.bias, mixed_qkv.device, mixed_qkv.dtype) if self.conv1d.bias is not None else None
|
||||
mixed_qkv = torch_causal_conv1d_update(mixed_qkv, conv_state, conv_weight, conv_bias)
|
||||
else:
|
||||
if past_key_value is not None:
|
||||
recurrent_state, conv_state, step_index = past_key_value
|
||||
conv_state_init = F.pad(mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0))
|
||||
conv_state.copy_(conv_state_init[:, :, -conv_state.shape[-1]:])
|
||||
mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
|
||||
|
||||
# Split QKV and compute beta/g
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2) # [B, seq_len, conv_dim]
|
||||
query, key, value = mixed_qkv.split([self.key_dim, self.key_dim, self.value_dim], dim=-1)
|
||||
beta = b.sigmoid()
|
||||
g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias.float())
|
||||
|
||||
# Delta rule
|
||||
if use_recurrent:
|
||||
# single-token path: work in [B, heads, dim] without seq dim
|
||||
query = query.reshape(batch_size, self.num_key_heads, self.key_head_dim)
|
||||
key = key.reshape(batch_size, self.num_key_heads, self.key_head_dim)
|
||||
value = value.reshape(batch_size, self.num_value_heads, self.value_head_dim)
|
||||
|
||||
if self.num_value_heads != self.num_key_heads:
|
||||
rep = self.num_value_heads // self.num_key_heads
|
||||
query = query.repeat_interleave(rep, dim=1)
|
||||
key = key.repeat_interleave(rep, dim=1)
|
||||
|
||||
scale = self.key_head_dim ** -0.5
|
||||
q = F.normalize(query.float(), dim=-1) * scale
|
||||
k = F.normalize(key.float(), dim=-1)
|
||||
v = value.float()
|
||||
beta_t = beta.reshape(batch_size, -1)
|
||||
g_t = g.reshape(batch_size, -1).exp()
|
||||
|
||||
# In-place state update: [B, heads, k_dim, v_dim]
|
||||
recurrent_state.mul_(g_t[:, :, None, None])
|
||||
kv_mem = torch.einsum('bhk,bhkv->bhv', k, recurrent_state)
|
||||
delta = (v - kv_mem) * beta_t[:, :, None]
|
||||
recurrent_state.add_(k.unsqueeze(-1) * delta.unsqueeze(-2))
|
||||
core_attn_out = torch.einsum('bhk,bhkv->bhv', q, recurrent_state)
|
||||
|
||||
core_attn_out = core_attn_out.to(x.dtype).unsqueeze(1)
|
||||
present_key_value = (recurrent_state, conv_state, step_index + 1)
|
||||
else:
|
||||
query = query.reshape(batch_size, seq_len, -1, self.key_head_dim)
|
||||
key = key.reshape(batch_size, seq_len, -1, self.key_head_dim)
|
||||
value = value.reshape(batch_size, seq_len, -1, self.value_head_dim)
|
||||
|
||||
if self.num_value_heads != self.num_key_heads:
|
||||
rep = self.num_value_heads // self.num_key_heads
|
||||
query = query.repeat_interleave(rep, dim=2)
|
||||
key = key.repeat_interleave(rep, dim=2)
|
||||
|
||||
core_attn_out, last_recurrent_state = torch_chunk_gated_delta_rule(
|
||||
query, key, value, g=g, beta=beta,
|
||||
initial_state=None,
|
||||
output_final_state=past_key_value is not None,
|
||||
)
|
||||
|
||||
present_key_value = None
|
||||
if past_key_value is not None:
|
||||
if last_recurrent_state is not None:
|
||||
recurrent_state.copy_(last_recurrent_state.to(recurrent_state.dtype))
|
||||
present_key_value = (recurrent_state, conv_state, step_index + seq_len)
|
||||
|
||||
# Gated norm + output projection (shared)
|
||||
core_attn_out = self.norm(core_attn_out.reshape(-1, self.value_head_dim), z.reshape(-1, self.value_head_dim))
|
||||
output = self.out_proj(core_attn_out.reshape(batch_size, seq_len, -1))
|
||||
return output, present_key_value
|
||||
|
||||
|
||||
# GatedAttention - Full Attention with output gating
|
||||
def precompute_partial_rope(head_dim, rotary_dim, position_ids, theta, device=None, mrope_section=None):
|
||||
"""Compute RoPE frequencies for partial rotary embeddings."""
|
||||
theta_numerator = torch.arange(0, rotary_dim, 2, device=device).float()
|
||||
inv_freq = 1.0 / (theta ** (theta_numerator / rotary_dim))
|
||||
|
||||
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
|
||||
if mrope_section is not None and position_ids.shape[0] == 3:
|
||||
mrope_section_2 = [s * 2 for s in mrope_section]
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section_2, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section_2, dim=-1))], dim=-1).unsqueeze(0)
|
||||
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
sin_split = sin.shape[-1] // 2
|
||||
return (cos, sin[..., :sin_split], -sin[..., sin_split:])
|
||||
|
||||
|
||||
def apply_partial_rope(xq, xk, freqs_cis, rotary_dim):
|
||||
"""Apply RoPE to only the first rotary_dim dimensions."""
|
||||
xq_rot = xq[..., :rotary_dim]
|
||||
xq_pass = xq[..., rotary_dim:]
|
||||
xk_rot = xk[..., :rotary_dim]
|
||||
xk_pass = xk[..., rotary_dim:]
|
||||
|
||||
xq_rot, xk_rot = apply_rope(xq_rot, xk_rot, freqs_cis)
|
||||
|
||||
xq = torch.cat([xq_rot, xq_pass], dim=-1)
|
||||
xk = torch.cat([xk_rot, xk_pass], dim=-1)
|
||||
return xq, xk
|
||||
|
||||
|
||||
class GatedAttention(nn.Module):
|
||||
def __init__(self, config, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
self.num_heads = config.num_attention_heads
|
||||
self.num_kv_heads = config.num_key_value_heads
|
||||
self.head_dim = config.head_dim
|
||||
self.hidden_size = config.hidden_size
|
||||
self.inner_size = self.num_heads * self.head_dim
|
||||
self.rotary_dim = int(self.head_dim * config.partial_rotary_factor)
|
||||
|
||||
# q_proj outputs 2x: query + gate
|
||||
self.q_proj = ops.Linear(config.hidden_size, self.inner_size * 2, bias=config.qkv_bias, device=device, dtype=dtype)
|
||||
self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
|
||||
self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
|
||||
self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
# QK norms with (1+weight) scaling
|
||||
self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, attention_mask=None, freqs_cis=None, optimized_attention=None, past_key_value=None):
|
||||
batch_size, seq_length, _ = x.shape
|
||||
|
||||
# Project Q (with gate), K, V
|
||||
qg = self.q_proj(x)
|
||||
# Split into query and gate: each is [B, seq, inner_size]
|
||||
qg = qg.view(batch_size, seq_length, self.num_heads, self.head_dim * 2)
|
||||
xq, gate = qg[..., :self.head_dim], qg[..., self.head_dim:]
|
||||
gate = gate.reshape(batch_size, seq_length, -1) # [B, seq, inner_size]
|
||||
|
||||
xk = self.k_proj(x)
|
||||
xv = self.v_proj(x)
|
||||
|
||||
xq = self.q_norm(xq).transpose(1, 2) # [B, heads, seq, head_dim]
|
||||
xk = self.k_norm(xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim)).transpose(1, 2)
|
||||
xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
||||
|
||||
# Apply partial RoPE
|
||||
xq, xk = apply_partial_rope(xq, xk, freqs_cis, self.rotary_dim)
|
||||
|
||||
# KV cache
|
||||
present_key_value = None
|
||||
if past_key_value is not None:
|
||||
past_key, past_value, index = past_key_value
|
||||
num_tokens = xk.shape[2]
|
||||
if past_key.shape[2] >= (index + num_tokens):
|
||||
past_key[:, :, index:index + num_tokens] = xk
|
||||
past_value[:, :, index:index + num_tokens] = xv
|
||||
xk = past_key[:, :, :index + num_tokens]
|
||||
xv = past_value[:, :, :index + num_tokens]
|
||||
present_key_value = (past_key, past_value, index + num_tokens)
|
||||
else:
|
||||
if index > 0:
|
||||
xk = torch.cat((past_key[:, :, :index], xk), dim=2)
|
||||
xv = torch.cat((past_value[:, :, :index], xv), dim=2)
|
||||
present_key_value = (xk, xv, index + num_tokens)
|
||||
|
||||
# Expand KV heads for GQA
|
||||
if self.num_heads != self.num_kv_heads:
|
||||
xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
|
||||
xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
|
||||
|
||||
output = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True)
|
||||
output = output * gate.sigmoid()
|
||||
|
||||
return self.o_proj(output), present_key_value
|
||||
|
||||
|
||||
# Hybrid Transformer Block
|
||||
class Qwen35TransformerBlock(nn.Module):
|
||||
def __init__(self, config, index, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.layer_type = config.layer_types[index]
|
||||
if self.layer_type == "linear_attention":
|
||||
self.linear_attn = GatedDeltaNet(config, device=device, dtype=dtype, ops=ops)
|
||||
else:
|
||||
self.self_attn = GatedAttention(config, device=device, dtype=dtype, ops=ops)
|
||||
self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
|
||||
self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, attention_mask=None, freqs_cis=None, optimized_attention=None, past_key_value=None):
|
||||
if self.layer_type == "linear_attention":
|
||||
h, present_key_value = self.linear_attn(self.input_layernorm(x), attention_mask=attention_mask, past_key_value=past_key_value)
|
||||
else:
|
||||
h, present_key_value = self.self_attn(self.input_layernorm(x), attention_mask=attention_mask, freqs_cis=freqs_cis, optimized_attention=optimized_attention, past_key_value=past_key_value)
|
||||
|
||||
x = x + h
|
||||
x = x + self.mlp(self.post_attention_layernorm(x))
|
||||
return x, present_key_value
|
||||
|
||||
|
||||
# Qwen35 Transformer Backbone
|
||||
class Qwen35Transformer(Llama2_):
|
||||
def __init__(self, config, device=None, dtype=None, ops=None):
|
||||
nn.Module.__init__(self)
|
||||
self.config = config
|
||||
self.vocab_size = config.vocab_size
|
||||
self.normalize_in = False
|
||||
|
||||
self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype)
|
||||
self.layers = nn.ModuleList([
|
||||
Qwen35TransformerBlock(config, index=i, device=device, dtype=dtype, ops=ops)
|
||||
for i in range(config.num_hidden_layers)
|
||||
])
|
||||
|
||||
if config.final_norm:
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
else:
|
||||
self.norm = None
|
||||
|
||||
if config.lm_head:
|
||||
self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
def get_past_len(self, past_key_values):
|
||||
for i, layer in enumerate(self.layers):
|
||||
if layer.layer_type == "full_attention":
|
||||
if len(past_key_values) > i:
|
||||
return past_key_values[i][2]
|
||||
break
|
||||
return 0
|
||||
|
||||
def compute_freqs_cis(self, position_ids, device):
|
||||
rotary_dim = int(self.config.head_dim * self.config.partial_rotary_factor)
|
||||
return precompute_partial_rope(
|
||||
self.config.head_dim, rotary_dim, position_ids,
|
||||
self.config.rope_theta, device=device,
|
||||
mrope_section=self.config.mrope_section,
|
||||
)
|
||||
|
||||
|
||||
# Vision Encoder
|
||||
class Qwen35VisionPatchEmbed(nn.Module):
|
||||
def __init__(self, config, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.patch_size = config["patch_size"]
|
||||
self.temporal_patch_size = config["temporal_patch_size"]
|
||||
self.in_channels = config["in_channels"]
|
||||
self.embed_dim = config["hidden_size"]
|
||||
kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
|
||||
self.proj = ops.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
target_dtype = self.proj.weight.dtype
|
||||
x = x.view(-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
|
||||
return self.proj(x.to(target_dtype)).view(-1, self.embed_dim)
|
||||
|
||||
|
||||
class Qwen35VisionMLP(nn.Module):
|
||||
def __init__(self, hidden_size, intermediate_size, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
self.linear_fc1 = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype)
|
||||
self.linear_fc2 = ops.Linear(intermediate_size, hidden_size, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, hidden_state):
|
||||
return self.linear_fc2(F.gelu(self.linear_fc1(hidden_state), approximate="tanh"))
|
||||
|
||||
|
||||
class Qwen35VisionRotaryEmbedding(nn.Module):
|
||||
def __init__(self, dim, theta=10000.0):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
|
||||
def forward(self, seqlen):
|
||||
seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
|
||||
freqs = torch.outer(seq, self.inv_freq)
|
||||
return freqs
|
||||
|
||||
|
||||
class Qwen35VisionAttention(nn.Module):
|
||||
def __init__(self, hidden_size, num_heads, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
self.dim = hidden_size
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = self.dim // self.num_heads
|
||||
self.qkv = ops.Linear(self.dim, self.dim * 3, bias=True, device=device, dtype=dtype)
|
||||
self.proj = ops.Linear(self.dim, self.dim, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, cu_seqlens, position_embeddings, optimized_attention=None):
|
||||
seq_length = x.shape[0]
|
||||
query_states, key_states, value_states = (
|
||||
self.qkv(x).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
|
||||
)
|
||||
query_states, key_states = apply_rope(query_states, key_states, position_embeddings)
|
||||
|
||||
# Process per-sequence attention
|
||||
lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
q_splits = torch.split(query_states, lengths, dim=0)
|
||||
k_splits = torch.split(key_states, lengths, dim=0)
|
||||
v_splits = torch.split(value_states, lengths, dim=0)
|
||||
|
||||
attn_outputs = []
|
||||
for q, k, v in zip(q_splits, k_splits, v_splits):
|
||||
q = q.transpose(0, 1).unsqueeze(0)
|
||||
k = k.transpose(0, 1).unsqueeze(0)
|
||||
v = v.transpose(0, 1).unsqueeze(0)
|
||||
attn_outputs.append(optimized_attention(q, k, v, self.num_heads, skip_reshape=True))
|
||||
|
||||
attn_output = torch.cat(attn_outputs, dim=1)
|
||||
attn_output = attn_output.reshape(seq_length, -1)
|
||||
return self.proj(attn_output)
|
||||
|
||||
|
||||
class Qwen35VisionBlock(nn.Module):
|
||||
def __init__(self, hidden_size, num_heads, intermediate_size, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
self.norm1 = ops.LayerNorm(hidden_size, eps=1e-6, device=device, dtype=dtype)
|
||||
self.norm2 = ops.LayerNorm(hidden_size, eps=1e-6, device=device, dtype=dtype)
|
||||
self.attn = Qwen35VisionAttention(hidden_size, num_heads, device=device, dtype=dtype, ops=ops)
|
||||
self.mlp = Qwen35VisionMLP(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops)
|
||||
|
||||
def forward(self, x, cu_seqlens, position_embeddings, optimized_attention=None):
|
||||
x = x + self.attn(self.norm1(x), cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
|
||||
return x + self.mlp(self.norm2(x))
|
||||
|
||||
|
||||
class Qwen35VisionPatchMerger(nn.Module):
|
||||
def __init__(self, hidden_size, spatial_merge_size, out_hidden_size, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
|
||||
merge_dim = hidden_size * (spatial_merge_size ** 2)
|
||||
self.norm = ops.LayerNorm(hidden_size, eps=1e-6, device=device, dtype=dtype)
|
||||
self.linear_fc1 = ops.Linear(merge_dim, merge_dim, device=device, dtype=dtype)
|
||||
self.linear_fc2 = ops.Linear(merge_dim, out_hidden_size, device=device, dtype=dtype)
|
||||
self.merge_dim = merge_dim
|
||||
|
||||
def forward(self, x):
|
||||
x = self.norm(x).view(-1, self.merge_dim)
|
||||
return self.linear_fc2(F.gelu(self.linear_fc1(x)))
|
||||
|
||||
|
||||
class Qwen35VisionModel(nn.Module):
|
||||
def __init__(self, config, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.spatial_merge_size = config["spatial_merge_size"]
|
||||
self.patch_size = config["patch_size"]
|
||||
self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
|
||||
|
||||
self.hidden_size = config["hidden_size"]
|
||||
self.num_heads = config["num_heads"]
|
||||
self.num_position_embeddings = config["num_position_embeddings"]
|
||||
|
||||
self.patch_embed = Qwen35VisionPatchEmbed(config, device=device, dtype=dtype, ops=ops)
|
||||
self.pos_embed = ops.Embedding(self.num_position_embeddings, self.hidden_size, device=device, dtype=dtype)
|
||||
self.num_grid_per_side = int(self.num_position_embeddings ** 0.5)
|
||||
self.rotary_pos_emb = Qwen35VisionRotaryEmbedding(self.hidden_size // self.num_heads // 2)
|
||||
self.blocks = nn.ModuleList([
|
||||
Qwen35VisionBlock(self.hidden_size, self.num_heads, config["intermediate_size"], device=device, dtype=dtype, ops=ops)
|
||||
for _ in range(config["depth"])
|
||||
])
|
||||
self.merger = Qwen35VisionPatchMerger(self.hidden_size, self.spatial_merge_size, config["out_hidden_size"], device=device, dtype=dtype, ops=ops)
|
||||
|
||||
def rot_pos_emb(self, grid_thw):
|
||||
merge_size = self.spatial_merge_size
|
||||
grid_thw_list = grid_thw.tolist()
|
||||
max_hw = max(max(h, w) for _, h, w in grid_thw_list)
|
||||
freq_table = self.rotary_pos_emb(max_hw)
|
||||
device = freq_table.device
|
||||
total_tokens = sum(int(t * h * w) for t, h, w in grid_thw_list)
|
||||
pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
|
||||
offset = 0
|
||||
for num_frames, height, width in grid_thw_list:
|
||||
num_frames, height, width = int(num_frames), int(height), int(width)
|
||||
merged_h, merged_w = height // merge_size, width // merge_size
|
||||
block_rows = torch.arange(merged_h, device=device)
|
||||
block_cols = torch.arange(merged_w, device=device)
|
||||
intra_row = torch.arange(merge_size, device=device)
|
||||
intra_col = torch.arange(merge_size, device=device)
|
||||
row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
|
||||
col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
|
||||
row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
|
||||
col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
|
||||
coords = torch.stack((row_idx, col_idx), dim=-1)
|
||||
if num_frames > 1:
|
||||
coords = coords.repeat(num_frames, 1)
|
||||
num_tokens = coords.shape[0]
|
||||
pos_ids[offset:offset + num_tokens] = coords
|
||||
offset += num_tokens
|
||||
embeddings = freq_table[pos_ids]
|
||||
embeddings = embeddings.flatten(1)
|
||||
return embeddings
|
||||
|
||||
def fast_pos_embed_interpolate(self, grid_thw):
|
||||
grid_thw_list = grid_thw.tolist()
|
||||
grid_ts = [int(row[0]) for row in grid_thw_list]
|
||||
grid_hs = [int(row[1]) for row in grid_thw_list]
|
||||
grid_ws = [int(row[2]) for row in grid_thw_list]
|
||||
device = self.pos_embed.weight.device
|
||||
idx_list = [[] for _ in range(4)]
|
||||
weight_list = [[] for _ in range(4)]
|
||||
for t, h, w in grid_thw_list:
|
||||
h, w = int(h), int(w)
|
||||
h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h, device=device)
|
||||
w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w, device=device)
|
||||
h_idxs_floor = h_idxs.int()
|
||||
w_idxs_floor = w_idxs.int()
|
||||
h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
|
||||
w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
|
||||
dh = h_idxs - h_idxs_floor
|
||||
dw = w_idxs - w_idxs_floor
|
||||
base_h = h_idxs_floor * self.num_grid_per_side
|
||||
base_h_ceil = h_idxs_ceil * self.num_grid_per_side
|
||||
indices = [
|
||||
(base_h[None].T + w_idxs_floor[None]).flatten(),
|
||||
(base_h[None].T + w_idxs_ceil[None]).flatten(),
|
||||
(base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
|
||||
(base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
|
||||
]
|
||||
weights = [
|
||||
((1 - dh)[None].T * (1 - dw)[None]).flatten(),
|
||||
((1 - dh)[None].T * dw[None]).flatten(),
|
||||
(dh[None].T * (1 - dw)[None]).flatten(),
|
||||
(dh[None].T * dw[None]).flatten(),
|
||||
]
|
||||
for j in range(4):
|
||||
idx_list[j].extend(indices[j].tolist())
|
||||
weight_list[j].extend(weights[j].tolist())
|
||||
idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=device)
|
||||
weight_tensor = torch.tensor(weight_list, dtype=self.pos_embed.weight.dtype, device=device)
|
||||
pos_embeds = self.pos_embed(idx_tensor).to(device) * weight_tensor[:, :, None]
|
||||
patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
|
||||
patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])
|
||||
patch_pos_embeds_permute = []
|
||||
merge_size = self.spatial_merge_size
|
||||
for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
|
||||
pos_embed = pos_embed.repeat(t, 1)
|
||||
pos_embed = (
|
||||
pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
|
||||
.permute(0, 1, 3, 2, 4, 5)
|
||||
.flatten(0, 4)
|
||||
)
|
||||
patch_pos_embeds_permute.append(pos_embed)
|
||||
return torch.cat(patch_pos_embeds_permute)
|
||||
|
||||
def forward(self, x, grid_thw):
|
||||
x = self.patch_embed(x)
|
||||
pos_embeds = self.fast_pos_embed_interpolate(grid_thw).to(x.device)
|
||||
x = x + pos_embeds
|
||||
rotary_pos_emb = self.rot_pos_emb(grid_thw)
|
||||
seq_len = x.shape[0]
|
||||
x = x.reshape(seq_len, -1)
|
||||
rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
|
||||
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
|
||||
cos = emb.cos().unsqueeze(-2)
|
||||
sin = emb.sin().unsqueeze(-2)
|
||||
sin_half = sin.shape[-1] // 2
|
||||
position_embeddings = (cos, sin[..., :sin_half], -sin[..., sin_half:])
|
||||
cu_seqlens = torch.repeat_interleave(
|
||||
grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
|
||||
).cumsum(dim=0, dtype=torch.int32)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
||||
optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
|
||||
for blk in self.blocks:
|
||||
x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
|
||||
merged = self.merger(x)
|
||||
return merged
|
||||
|
||||
# Model Wrapper
|
||||
class Qwen35(BaseLlama, BaseGenerate, torch.nn.Module):
|
||||
model_type = "qwen35_2b"
|
||||
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = _make_config(self.model_type, config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
self.model = Qwen35Transformer(config, device=device, dtype=dtype, ops=operations)
|
||||
vision_overrides = QWEN35_MODELS.get(self.model_type, {}).get("vision", {})
|
||||
vision_config = {**QWEN35_VISION_DEFAULTS, **vision_overrides, "out_hidden_size": config.hidden_size}
|
||||
self.visual = Qwen35VisionModel(vision_config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
def preprocess_embed(self, embed, device):
|
||||
if embed["type"] == "image":
|
||||
image, grid = comfy.text_encoders.qwen_vl.process_qwen2vl_images(embed["data"], patch_size=16)
|
||||
return self.visual(image.to(device, dtype=torch.float32), grid), grid
|
||||
return None, None
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[], past_key_values=None):
|
||||
grid = None
|
||||
position_ids = None
|
||||
offset = 0
|
||||
for e in embeds_info:
|
||||
if e.get("type") == "image":
|
||||
grid = e.get("extra", None)
|
||||
start = e.get("index")
|
||||
if position_ids is None:
|
||||
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
|
||||
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
|
||||
end = e.get("size") + start
|
||||
len_max = int(grid.max()) // 2
|
||||
start_next = len_max + start
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
position_ids[0, start:end] = start + offset
|
||||
max_d = int(grid[0][1]) // 2
|
||||
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
|
||||
max_d = int(grid[0][2]) // 2
|
||||
position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
|
||||
offset += len_max - (end - start)
|
||||
|
||||
if grid is None:
|
||||
position_ids = None
|
||||
|
||||
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids, past_key_values=past_key_values)
|
||||
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
|
||||
model_config = self.model.config
|
||||
past_key_values = []
|
||||
for i in range(model_config.num_hidden_layers):
|
||||
if model_config.layer_types[i] == "linear_attention":
|
||||
recurrent_state = torch.zeros(
|
||||
[batch, model_config.linear_num_value_heads, model_config.linear_key_head_dim, model_config.linear_value_head_dim],
|
||||
device=device, dtype=torch.float32
|
||||
)
|
||||
conv_dim = model_config.linear_num_key_heads * model_config.linear_key_head_dim * 2 + model_config.linear_num_value_heads * model_config.linear_value_head_dim
|
||||
conv_state = torch.zeros(
|
||||
[batch, conv_dim, model_config.conv_kernel_size - 1],
|
||||
device=device, dtype=execution_dtype
|
||||
)
|
||||
past_key_values.append((recurrent_state, conv_state, 0))
|
||||
else:
|
||||
past_key_values.append((
|
||||
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
|
||||
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
|
||||
0
|
||||
))
|
||||
return past_key_values
|
||||
|
||||
# Tokenizer and Text Encoder Wrappers
|
||||
|
||||
class Qwen35Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}, embedding_size=2048, embedding_key="qwen35_2b"):
|
||||
from transformers import Qwen2Tokenizer
|
||||
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen35_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_class=Qwen2Tokenizer,
|
||||
has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=248044, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}, model_type="qwen35_2b"):
|
||||
embedding_size = QWEN35_MODELS.get(model_type, {}).get("hidden_size", 2048)
|
||||
tokenizer = lambda *a, **kw: Qwen35Tokenizer(*a, **kw, embedding_size=embedding_size, embedding_key=model_type)
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=model_type, tokenizer=tokenizer)
|
||||
self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
self.llama_template_images = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
|
||||
image = kwargs.get("image", None)
|
||||
if image is not None and len(images) == 0:
|
||||
images = [image]
|
||||
|
||||
skip_template = False
|
||||
if text.startswith('<|im_start|>'):
|
||||
skip_template = True
|
||||
if prevent_empty_text and text == '':
|
||||
text = ' '
|
||||
|
||||
if skip_template:
|
||||
llama_text = text
|
||||
else:
|
||||
if llama_template is None:
|
||||
if len(images) > 0:
|
||||
llama_text = self.llama_template_images.format(text)
|
||||
else:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
if not thinking:
|
||||
llama_text += "<think>\n</think>\n"
|
||||
|
||||
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
key_name = next(iter(tokens))
|
||||
embed_count = 0
|
||||
qwen_tokens = tokens[key_name]
|
||||
for r in qwen_tokens:
|
||||
for i in range(len(r)):
|
||||
if r[i][0] == 248056: # <|image_pad|>
|
||||
if len(images) > embed_count:
|
||||
r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:]
|
||||
embed_count += 1
|
||||
return tokens
|
||||
|
||||
|
||||
class Qwen35ClipModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}, model_type="qwen35_2b"):
|
||||
class Qwen35_(Qwen35):
|
||||
pass
|
||||
Qwen35_.model_type = model_type
|
||||
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={},
|
||||
dtype=dtype, special_tokens={"pad": 248044}, layer_norm_hidden_state=False,
|
||||
model_class=Qwen35_, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class Qwen35TEModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}, model_type="qwen35_2b"):
|
||||
clip_model = lambda **kw: Qwen35ClipModel(**kw, model_type=model_type)
|
||||
super().__init__(device=device, dtype=dtype, name=model_type, clip_model=clip_model, model_options=model_options)
|
||||
|
||||
|
||||
def tokenizer(model_type="qwen35_2b"):
|
||||
class Qwen35ImageTokenizer_(Qwen35ImageTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type=model_type)
|
||||
return Qwen35ImageTokenizer_
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen35_2b"):
|
||||
class Qwen35TEModel_(Qwen35TEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options, model_type=model_type)
|
||||
return Qwen35TEModel_
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -16,6 +16,7 @@ SERVER_FEATURE_FLAGS: dict[str, Any] = {
|
||||
"extension": {"manager": {"supports_v4": True}},
|
||||
"node_replacements": True,
|
||||
"assets": args.enable_assets,
|
||||
"download_api": args.enable_download_api,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -145,20 +145,7 @@ class ReveImageCreateNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["upscale", "upscale.upscale_factor"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$factor := $lookup(widgets, "upscale.upscale_factor");
|
||||
$fmt := {"approximate": true, "note": "(base)"};
|
||||
widgets.upscale = "enabled" ? (
|
||||
$factor = 4 ? {"type": "usd", "usd": 0.0762, "format": $fmt}
|
||||
: $factor = 3 ? {"type": "usd", "usd": 0.0591, "format": $fmt}
|
||||
: {"type": "usd", "usd": 0.0457, "format": $fmt}
|
||||
) : {"type": "usd", "usd": 0.03432, "format": $fmt}
|
||||
)
|
||||
""",
|
||||
expr="""{"type":"usd","usd":0.03432,"format":{"approximate":true,"note":"(base)"}}""",
|
||||
),
|
||||
)
|
||||
|
||||
@ -238,21 +225,13 @@ class ReveImageEditNode(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "upscale", "upscale.upscale_factor"],
|
||||
widgets=["model"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$fmt := {"approximate": true, "note": "(base)"};
|
||||
$isFast := $contains(widgets.model, "fast");
|
||||
$enabled := widgets.upscale = "enabled";
|
||||
$factor := $lookup(widgets, "upscale.upscale_factor");
|
||||
$isFast
|
||||
? {"type": "usd", "usd": 0.01001, "format": $fmt}
|
||||
: $enabled ? (
|
||||
$factor = 4 ? {"type": "usd", "usd": 0.0991, "format": $fmt}
|
||||
: $factor = 3 ? {"type": "usd", "usd": 0.0819, "format": $fmt}
|
||||
: {"type": "usd", "usd": 0.0686, "format": $fmt}
|
||||
) : {"type": "usd", "usd": 0.0572, "format": $fmt}
|
||||
$base := $isFast ? 0.01001 : 0.0572;
|
||||
{"type": "usd", "usd": $base, "format": {"approximate": true, "note": "(base)"}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -348,21 +327,13 @@ class ReveImageRemixNode(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "upscale", "upscale.upscale_factor"],
|
||||
widgets=["model"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$fmt := {"approximate": true, "note": "(base)"};
|
||||
$isFast := $contains(widgets.model, "fast");
|
||||
$enabled := widgets.upscale = "enabled";
|
||||
$factor := $lookup(widgets, "upscale.upscale_factor");
|
||||
$isFast
|
||||
? {"type": "usd", "usd": 0.01001, "format": $fmt}
|
||||
: $enabled ? (
|
||||
$factor = 4 ? {"type": "usd", "usd": 0.0991, "format": $fmt}
|
||||
: $factor = 3 ? {"type": "usd", "usd": 0.0819, "format": $fmt}
|
||||
: {"type": "usd", "usd": 0.0686, "format": $fmt}
|
||||
) : {"type": "usd", "usd": 0.0572, "format": $fmt}
|
||||
$base := $isFast ? 0.01001 : 0.0572;
|
||||
{"type": "usd", "usd": $base, "format": {"approximate": true, "note": "(base)"}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
|
||||
@ -38,7 +38,6 @@ from comfy_api_nodes.util import (
|
||||
UPSCALER_MODELS_MAP = {
|
||||
"Starlight (Astra) Fast": "slf-1",
|
||||
"Starlight (Astra) Creative": "slc-1",
|
||||
"Starlight Precise 2.5": "slp-2.5",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -44,13 +44,8 @@ class NumberConvertNode(io.ComfyNode):
|
||||
def execute(cls, value) -> io.NodeOutput:
|
||||
if isinstance(value, bool):
|
||||
float_val = 1.0 if value else 0.0
|
||||
int_val = 1 if value else 0
|
||||
elif isinstance(value, int):
|
||||
elif isinstance(value, (int, float)):
|
||||
float_val = float(value)
|
||||
int_val = value
|
||||
elif isinstance(value, float):
|
||||
float_val = value
|
||||
int_val = int(value)
|
||||
elif isinstance(value, str):
|
||||
text = value.strip()
|
||||
if not text:
|
||||
@ -61,14 +56,6 @@ class NumberConvertNode(io.ComfyNode):
|
||||
raise ValueError(
|
||||
f"Cannot convert string to number: {value!r}"
|
||||
) from None
|
||||
if not math.isfinite(float_val):
|
||||
raise ValueError(
|
||||
f"Cannot convert non-finite value to number: {float_val}"
|
||||
)
|
||||
try:
|
||||
int_val = int(text)
|
||||
except ValueError:
|
||||
int_val = int(float_val)
|
||||
else:
|
||||
raise TypeError(
|
||||
f"Unsupported input type: {type(value).__name__}"
|
||||
@ -79,7 +66,7 @@ class NumberConvertNode(io.ComfyNode):
|
||||
f"Cannot convert non-finite value to number: {float_val}"
|
||||
)
|
||||
|
||||
return io.NodeOutput(float_val, int_val)
|
||||
return io.NodeOutput(float_val, int(float_val))
|
||||
|
||||
|
||||
class NumberConvertExtension(ComfyExtension):
|
||||
|
||||
@ -15,7 +15,6 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Float.Input("min_p", default=0.05, min=0.0, max=1.0, step=0.01),
|
||||
io.Float.Input("repetition_penalty", default=1.05, min=0.0, max=5.0, step=0.01),
|
||||
io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff),
|
||||
io.Float.Input("presence_penalty", optional=True, default=0.0, min=0.0, max=5.0, step=0.01),
|
||||
]
|
||||
),
|
||||
io.DynamicCombo.Option(
|
||||
@ -26,7 +25,7 @@ class TextGenerate(io.ComfyNode):
|
||||
|
||||
return io.Schema(
|
||||
node_id="TextGenerate",
|
||||
category="textgen",
|
||||
category="textgen/",
|
||||
search_aliases=["LLM", "gemma"],
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
@ -34,7 +33,6 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Image.Input("image", optional=True),
|
||||
io.Int.Input("max_length", default=256, min=1, max=2048),
|
||||
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
|
||||
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
|
||||
],
|
||||
outputs=[
|
||||
io.String.Output(display_name="generated_text"),
|
||||
@ -42,9 +40,9 @@ class TextGenerate(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None) -> io.NodeOutput:
|
||||
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking)
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1)
|
||||
|
||||
# Get sampling parameters from dynamic combo
|
||||
do_sample = sampling_mode.get("sampling_mode") == "on"
|
||||
@ -54,7 +52,6 @@ class TextGenerate(io.ComfyNode):
|
||||
min_p = sampling_mode.get("min_p", 0.0)
|
||||
seed = sampling_mode.get("seed", None)
|
||||
repetition_penalty = sampling_mode.get("repetition_penalty", 1.0)
|
||||
presence_penalty = sampling_mode.get("presence_penalty", 0.0)
|
||||
|
||||
generated_ids = clip.generate(
|
||||
tokens,
|
||||
@ -65,7 +62,6 @@ class TextGenerate(io.ComfyNode):
|
||||
top_p=top_p,
|
||||
min_p=min_p,
|
||||
repetition_penalty=repetition_penalty,
|
||||
presence_penalty=presence_penalty,
|
||||
seed=seed
|
||||
)
|
||||
|
||||
@ -160,12 +156,12 @@ class TextGenerateLTX2Prompt(TextGenerate):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None) -> io.NodeOutput:
|
||||
if image is None:
|
||||
formatted_prompt = f"<start_of_turn>system\n{LTX2_T2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
|
||||
else:
|
||||
formatted_prompt = f"<start_of_turn>system\n{LTX2_I2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\n\n<image_soft_token>\n\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
|
||||
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking)
|
||||
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image)
|
||||
|
||||
|
||||
class TextgenExtension(ComfyExtension):
|
||||
|
||||
@ -1 +1 @@
|
||||
comfyui_manager==4.1
|
||||
comfyui_manager==4.1b8
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.42.8
|
||||
comfyui-workflow-templates==0.9.38
|
||||
comfyui-workflow-templates==0.9.36
|
||||
comfyui-embedded-docs==0.4.3
|
||||
torch
|
||||
torchsde
|
||||
|
||||
84
server.py
84
server.py
@ -43,6 +43,7 @@ from app.model_manager import ModelFileManager
|
||||
from app.custom_node_manager import CustomNodeManager
|
||||
from app.subgraph_manager import SubgraphManager
|
||||
from app.node_replace_manager import NodeReplaceManager
|
||||
from app.download_manager import DownloadManager
|
||||
from typing import Optional, Union
|
||||
from api_server.routes.internal.internal_routes import InternalRoutes
|
||||
from protocol import BinaryEventTypes
|
||||
@ -205,6 +206,7 @@ class PromptServer():
|
||||
self.subgraph_manager = SubgraphManager()
|
||||
self.node_replace_manager = NodeReplaceManager()
|
||||
self.internal_routes = InternalRoutes(self)
|
||||
self.download_manager = DownloadManager(self) if args.enable_download_api else None
|
||||
self.supports = ["custom_nodes_from_web"]
|
||||
self.prompt_queue = execution.PromptQueue(self)
|
||||
self.loop = loop
|
||||
@ -1028,9 +1030,91 @@ class PromptServer():
|
||||
|
||||
return web.Response(status=200)
|
||||
|
||||
# -- Download API (gated behind --enable-download-api) --
|
||||
|
||||
def _require_download_api(handler):
|
||||
async def wrapper(request):
|
||||
if self.download_manager is None:
|
||||
return web.json_response(
|
||||
{"error": "Download API is not enabled. Start ComfyUI with --enable-download-api."},
|
||||
status=403,
|
||||
)
|
||||
return await handler(request)
|
||||
return wrapper
|
||||
|
||||
@routes.post("/download/model")
|
||||
@_require_download_api
|
||||
async def post_download_model(request):
|
||||
json_data = await request.json()
|
||||
url = json_data.get("url")
|
||||
directory = json_data.get("directory")
|
||||
filename = json_data.get("filename")
|
||||
client_id = json_data.get("client_id")
|
||||
|
||||
if not url or not directory or not filename:
|
||||
return web.json_response(
|
||||
{"error": "Missing required fields: url, directory, filename"},
|
||||
status=400,
|
||||
)
|
||||
|
||||
task, err = await self.download_manager.start_download(url, directory, filename, client_id=client_id)
|
||||
if err:
|
||||
status = 409 if "already" in err.lower() else 400
|
||||
return web.json_response({"error": err}, status=status)
|
||||
|
||||
return web.json_response(task.to_dict(), status=201)
|
||||
|
||||
@routes.get("/download/status")
|
||||
@_require_download_api
|
||||
async def get_download_status(request):
|
||||
client_id = request.rel_url.query.get("client_id")
|
||||
return web.json_response(self.download_manager.get_all_tasks(client_id=client_id))
|
||||
|
||||
@routes.get("/download/status/{task_id}")
|
||||
@_require_download_api
|
||||
async def get_download_task_status(request):
|
||||
task_id = request.match_info["task_id"]
|
||||
task_data = self.download_manager.get_task(task_id)
|
||||
if task_data is None:
|
||||
return web.json_response({"error": "Download not found"}, status=404)
|
||||
return web.json_response(task_data)
|
||||
|
||||
@routes.post("/download/pause/{task_id}")
|
||||
@_require_download_api
|
||||
async def post_download_pause(request):
|
||||
task_id = request.match_info["task_id"]
|
||||
err = self.download_manager.pause_download(task_id)
|
||||
if err:
|
||||
return web.json_response({"error": err}, status=400)
|
||||
return web.json_response({"status": "paused"})
|
||||
|
||||
@routes.post("/download/resume/{task_id}")
|
||||
@_require_download_api
|
||||
async def post_download_resume(request):
|
||||
task_id = request.match_info["task_id"]
|
||||
err = self.download_manager.resume_download(task_id)
|
||||
if err:
|
||||
return web.json_response({"error": err}, status=400)
|
||||
return web.json_response({"status": "resumed"})
|
||||
|
||||
@routes.post("/download/cancel/{task_id}")
|
||||
@_require_download_api
|
||||
async def post_download_cancel(request):
|
||||
task_id = request.match_info["task_id"]
|
||||
err = self.download_manager.cancel_download(task_id)
|
||||
if err:
|
||||
return web.json_response({"error": err}, status=400)
|
||||
return web.json_response({"status": "cancelled"})
|
||||
|
||||
async def setup(self):
|
||||
timeout = aiohttp.ClientTimeout(total=None) # no timeout
|
||||
self.client_session = aiohttp.ClientSession(timeout=timeout)
|
||||
if self.download_manager is not None:
|
||||
self.app.on_cleanup.append(self._cleanup_download_manager)
|
||||
|
||||
async def _cleanup_download_manager(self, app):
|
||||
if self.download_manager is not None:
|
||||
await self.download_manager.close()
|
||||
|
||||
def add_routes(self):
|
||||
self.user_manager.add_routes(self.routes)
|
||||
|
||||
@ -1,81 +0,0 @@
|
||||
"""Tests for path_utils – asset category resolution."""
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.assets.services.path_utils import get_asset_category_and_relative_path
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fake_dirs():
|
||||
"""Create temporary input, output, and temp directories."""
|
||||
with tempfile.TemporaryDirectory() as root:
|
||||
root_path = Path(root)
|
||||
input_dir = root_path / "input"
|
||||
output_dir = root_path / "output"
|
||||
temp_dir = root_path / "temp"
|
||||
models_dir = root_path / "models" / "checkpoints"
|
||||
for d in (input_dir, output_dir, temp_dir, models_dir):
|
||||
d.mkdir(parents=True)
|
||||
|
||||
with patch("app.assets.services.path_utils.folder_paths") as mock_fp:
|
||||
mock_fp.get_input_directory.return_value = str(input_dir)
|
||||
mock_fp.get_output_directory.return_value = str(output_dir)
|
||||
mock_fp.get_temp_directory.return_value = str(temp_dir)
|
||||
|
||||
with patch(
|
||||
"app.assets.services.path_utils.get_comfy_models_folders",
|
||||
return_value=[("checkpoints", [str(models_dir)])],
|
||||
):
|
||||
yield {
|
||||
"input": input_dir,
|
||||
"output": output_dir,
|
||||
"temp": temp_dir,
|
||||
"models": models_dir,
|
||||
}
|
||||
|
||||
|
||||
class TestGetAssetCategoryAndRelativePath:
|
||||
def test_input_file(self, fake_dirs):
|
||||
f = fake_dirs["input"] / "photo.png"
|
||||
f.touch()
|
||||
cat, rel = get_asset_category_and_relative_path(str(f))
|
||||
assert cat == "input"
|
||||
assert rel == "photo.png"
|
||||
|
||||
def test_output_file(self, fake_dirs):
|
||||
f = fake_dirs["output"] / "result.png"
|
||||
f.touch()
|
||||
cat, rel = get_asset_category_and_relative_path(str(f))
|
||||
assert cat == "output"
|
||||
assert rel == "result.png"
|
||||
|
||||
def test_temp_file(self, fake_dirs):
|
||||
"""Regression: temp files must be categorised, not raise ValueError."""
|
||||
f = fake_dirs["temp"] / "GLSLShader_output_00004_.png"
|
||||
f.touch()
|
||||
cat, rel = get_asset_category_and_relative_path(str(f))
|
||||
assert cat == "temp"
|
||||
assert rel == "GLSLShader_output_00004_.png"
|
||||
|
||||
def test_temp_file_in_subfolder(self, fake_dirs):
|
||||
sub = fake_dirs["temp"] / "sub"
|
||||
sub.mkdir()
|
||||
f = sub / "ComfyUI_temp_tczip_00004_.png"
|
||||
f.touch()
|
||||
cat, rel = get_asset_category_and_relative_path(str(f))
|
||||
assert cat == "temp"
|
||||
assert os.path.normpath(rel) == os.path.normpath("sub/ComfyUI_temp_tczip_00004_.png")
|
||||
|
||||
def test_model_file(self, fake_dirs):
|
||||
f = fake_dirs["models"] / "model.safetensors"
|
||||
f.touch()
|
||||
cat, rel = get_asset_category_and_relative_path(str(f))
|
||||
assert cat == "models"
|
||||
|
||||
def test_unknown_path_raises(self, fake_dirs):
|
||||
with pytest.raises(ValueError, match="not within"):
|
||||
get_asset_category_and_relative_path("/some/random/path.png")
|
||||
@ -90,63 +90,6 @@ class TestNumberConvertExecute:
|
||||
assert result[0] == 1000.0
|
||||
assert result[1] == 1000
|
||||
|
||||
# --- Large number precision (string input) ---
|
||||
|
||||
def test_string_large_int_above_2_53(self):
|
||||
"""Text-to-int must not lose precision for integers beyond 2^53."""
|
||||
big = 2**53 + 1 # 9007199254740993
|
||||
result = self._exec(str(big))
|
||||
assert result[1] == big
|
||||
|
||||
def test_string_large_negative_int_above_2_53(self):
|
||||
big = -(2**53 + 1)
|
||||
result = self._exec(str(big))
|
||||
assert result[1] == big
|
||||
|
||||
def test_string_very_large_int(self):
|
||||
big = 2**63 + 42
|
||||
result = self._exec(str(big))
|
||||
assert result[1] == big
|
||||
|
||||
def test_string_large_int_float_output_is_float(self):
|
||||
"""FLOAT output is still a float (may lose precision, but must be float type)."""
|
||||
result = self._exec(str(2**53 + 1))
|
||||
assert isinstance(result[0], float)
|
||||
|
||||
# --- Large number precision (int input) ---
|
||||
|
||||
def test_int_large_above_2_53(self):
|
||||
"""Native int input must preserve its value in the INT output."""
|
||||
big = 2**53 + 1
|
||||
result = self._exec(big)
|
||||
assert result[1] == big
|
||||
|
||||
def test_int_large_negative_above_2_53(self):
|
||||
big = -(2**53 + 1)
|
||||
result = self._exec(big)
|
||||
assert result[1] == big
|
||||
|
||||
def test_int_very_large(self):
|
||||
big = 2**100
|
||||
result = self._exec(big)
|
||||
assert result[1] == big
|
||||
|
||||
# --- String decimal / scientific notation fallback ---
|
||||
|
||||
def test_string_decimal_still_truncates(self):
|
||||
"""Strings with decimal points fall back to int(float(...)) truncation."""
|
||||
result = self._exec("3.7")
|
||||
assert result[1] == 3
|
||||
|
||||
def test_string_negative_decimal_truncates(self):
|
||||
result = self._exec("-2.9")
|
||||
assert result[1] == -2
|
||||
|
||||
def test_string_scientific_large(self):
|
||||
result = self._exec("1e18")
|
||||
assert result[0] == 1e18
|
||||
assert result[1] == 10**18
|
||||
|
||||
# --- STRING error paths ---
|
||||
|
||||
def test_empty_string_raises(self):
|
||||
|
||||
Reference in New Issue
Block a user