refactor: unify download item types and eliminate extension-based branching

Merge AssetDownloadItem and AssetInlineItem into SandboxDownloadItem with
an optional 'content' field. All consumers now follow a clean pipeline:
  get items → accessor.resolve_items() → AppAssetService.to_download_items() → download

Key changes:
- SandboxDownloadItem gains content: bytes | None (entities.py)
- ZipSandbox.download_items() handles both inline (base64 heredoc) and
  remote (curl) via a single pipeline — no structural branching
- AssetDownloadService.build_download_script() takes unified list
- CachedContentAccessor.resolve_items() batch-enriches items from DB
  (extension-agnostic, no 'if md' checks needed)
- AppAssetService.to_download_items() converts AssetItem → SandboxDownloadItem
- DraftAppAssetsInitializer, package_and_upload, export_bundle simplified
- file_upload/node.py switched to SandboxDownloadItem
- Deleted AssetDownloadItem and AssetInlineItem classes
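
For reference, a sketch of the unified entity described above; only the
path/url/content fields and the 'bytes | None' type are confirmed by this
diff, the defaults are assumptions:

    # core/zip_sandbox/entities.py (sketch, not the verbatim definition)
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class SandboxDownloadItem:
        path: str                     # destination relative to the download root
        url: str = ""                 # presigned URL, used when content is None
        content: bytes | None = None  # inline payload, written via base64 heredoc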
Harry · 2026-03-10 17:11:41 +08:00
parent 6ac730ec2e · commit 65e89520c0
19 changed files with 492 additions and 214 deletions

View File

@@ -13,12 +13,12 @@ from core.virtual_environment.__base.helpers import pipeline
 from ..bash.dify_cli import DifyCliConfig, DifyCliLocator
 from ..entities import DifyCli
-from .base import AsyncSandboxInitializer
+from .base import SyncSandboxInitializer
 logger = logging.getLogger(__name__)
-class DifyCliInitializer(AsyncSandboxInitializer):
+class DifyCliInitializer(SyncSandboxInitializer):
     def __init__(
         self,
         tenant_id: str,
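
Only the base-class names appear in this diff; a hypothetical sketch of the
two interfaces being swapped, inferred from the ``initialize(self, sandbox)``
signature and the "background thread" note in the next file:

    from abc import ABC, abstractmethod

    class SyncSandboxInitializer(ABC):
        # Runs inline on the sandbox-setup thread (assumed semantics).
        @abstractmethod
        def initialize(self, sandbox: "Sandbox") -> None: ...

    class AsyncSandboxInitializer(ABC):
        # Runs in a background thread (per the docstring in the next file).
        @abstractmethod
        def initialize(self, sandbox: "Sandbox") -> None: ...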

View File

@@ -1,23 +1,46 @@
+"""Async initializer that populates a draft sandbox with app asset files.
+
+Unlike ``AppAssetsInitializer`` (which downloads a pre-built ZIP for
+published assets), this initializer runs the build pipeline on the fly
+so that ``.md`` skill documents are compiled and their resolved content
+is embedded directly into the download script — avoiding the S3
+round-trip that was previously required for resolved keys.
+
+Execution order guarantee:
+    This runs as an ``AsyncSandboxInitializer`` in the background thread.
+    By the time it finishes, ``SkillManager.save_bundle()`` has been
+    called (inside ``SkillBuilder.build()``), so subsequent initializers
+    like ``DifyCliInitializer`` can safely load the bundle from Redis/S3.
+"""
 import logging
-from core.app_assets.accessor import should_mirror
+from core.app_assets.builder.base import BuildContext
+from core.app_assets.builder.file_builder import FileBuilder
+from core.app_assets.builder.pipeline import AssetBuildPipeline
+from core.app_assets.builder.skill_builder import SkillBuilder
 from core.app_assets.constants import AppAssetsAttrs
-from core.app_assets.storage import AssetPaths
 from core.sandbox.entities import AppAssets
 from core.sandbox.sandbox import Sandbox
 from core.sandbox.services import AssetDownloadService
-from core.sandbox.services.asset_download_service import AssetDownloadItem
 from core.virtual_environment.__base.helpers import pipeline
+from services.app_asset_service import AppAssetService
-from .base import AsyncSandboxInitializer
+from .base import SyncSandboxInitializer
 logger = logging.getLogger(__name__)
 _TIMEOUT = 600  # 10 minutes
-class DraftAppAssetsInitializer(AsyncSandboxInitializer):
+class DraftAppAssetsInitializer(SyncSandboxInitializer):
+    """Compile draft assets and push them into the sandbox VM.
+
+    ``.md`` (skill) files are compiled in-process and their resolved
+    content is embedded as base64 heredocs in the download script.
+    All other files are fetched from S3 via presigned URLs.
+    """
+
     def __init__(self, tenant_id: str, app_id: str, assets_id: str) -> None:
         self._tenant_id = tenant_id
         self._app_id = app_id
@@ -25,22 +48,22 @@ class DraftAppAssetsInitializer(AsyncSandboxInitializer):
     def initialize(self, sandbox: Sandbox) -> None:
         vm = sandbox.vm
-        build_id = self._assets_id
         tree = sandbox.attrs.get(AppAssetsAttrs.FILE_TREE)
-        asset_storage = AppAssetService.get_storage()
-        nodes = list(tree.walk_files())
-        if not nodes:
+        if tree.empty():
             return
-        # Inline-mirror nodes use the resolved (compiled) key; others use draft.
-        keys = [
-            AssetPaths.resolved(self._tenant_id, self._app_id, build_id, node.id)
-            if should_mirror(node.extension)
-            else AssetPaths.draft(self._tenant_id, self._app_id, node.id)
-            for node in nodes
-        ]
-        urls = asset_storage.get_download_urls(keys, _TIMEOUT)
-        items = [AssetDownloadItem(path=tree.get_path(node.id).lstrip("/"), url=url) for node, url in zip(nodes, urls)]
-        script = AssetDownloadService.build_download_script(items, AppAssets.PATH)
+        # --- 1. Run the build pipeline (SkillBuilder compiles .md inline) ---
+        accessor = AppAssetService.get_accessor(self._tenant_id, self._app_id)
+        build_pipeline = AssetBuildPipeline([SkillBuilder(accessor=accessor), FileBuilder()])
+        ctx = BuildContext(tenant_id=self._tenant_id, app_id=self._app_id, build_id=self._assets_id)
+        built_assets = build_pipeline.build_all(tree, ctx)
+        if not built_assets:
+            return
+        # --- 2. Convert to unified download items and execute ---
+        download_items = AppAssetService.to_download_items(built_assets)
+        script = AssetDownloadService.build_download_script(download_items, AppAssets.PATH)
         pipeline(vm).add(
             ["sh", "-c", script],
             error_message="Failed to download draft assets",
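
How the conversion step above might look; AssetItem's fields are not shown
in this diff, so the attribute names here are assumptions:

    # services/app_asset_service.py (hypothetical sketch)
    @staticmethod
    def to_download_items(assets: list[AssetItem]) -> list[SandboxDownloadItem]:
        # Compiled items carry their bytes inline; everything else keeps a
        # presigned URL and is fetched inside the sandbox.
        return [
            SandboxDownloadItem(
                path=asset.path.lstrip("/"),
                url=asset.url or "",
                content=asset.content,  # None for remote items
            )
            for asset in assets
        ]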

View File

@@ -1,11 +1,46 @@
+"""Shell script builder for downloading / writing assets into a sandbox VM.
+
+Generates a self-contained POSIX shell script that handles two kinds of
+``SandboxDownloadItem``:
+
+- Items with *content* — written via base64 heredoc (sequential).
+- Items with *url* — fetched via ``curl``/``wget``/``python3`` with
+  auto-detection, run as parallel background jobs.
+
+Both kinds can be mixed freely in a single call.
+"""
 from __future__ import annotations
+import base64
 import shlex
 import textwrap
-from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from core.zip_sandbox.entities import SandboxDownloadItem
+
+
-def _render_download_script(root_path: str, download_commands: str) -> str:
+def _build_inline_commands(items: list[SandboxDownloadItem], root_var: str) -> str:
+    """Generate shell commands that write base64-encoded content to files."""
+    lines: list[str] = []
+    for idx, item in enumerate(items):
+        assert item.content is not None
+        dest = f"${{{root_var}}}/{shlex.quote(item.path)}"
+        encoded = base64.b64encode(item.content).decode("ascii")
+        lines.append(f'mkdir -p "$(dirname "{dest}")"')
+        lines.append(f"base64 -d <<'_INLINE_{idx}' > \"{dest}\"")
+        lines.append(encoded)
+        lines.append(f"_INLINE_{idx}")
+    return "\n".join(lines)
+
+
+def _render_download_script(
+    root_path: str,
+    inline_commands: str,
+    download_commands: str,
+    need_downloader: bool,
+) -> str:
     python_download_cmd = (
         'python3 - "${url}" "${dest}" <<"PY"\n'
         "import sys\n"
@@ -18,59 +53,88 @@ def _render_download_script(root_path: str, download_commands: str) -> str:
         " f.write(data)\n"
         "PY"
     )
-    script = f"""
-download_root={shlex.quote(root_path)}
-if command -v curl >/dev/null 2>&1; then
-    download_cmd='curl -fsSL "${{url}}" -o "${{dest}}"'
-elif command -v wget >/dev/null 2>&1; then
-    download_cmd='wget -q "${{url}}" -O "${{dest}}"'
-elif command -v python3 >/dev/null 2>&1; then
-    download_cmd={shlex.quote(python_download_cmd)}
-else
-    echo 'No downloader found (curl/wget/python3)' >&2
-    exit 1
-fi
+    # Only emit the downloader-detection block when there are remote items.
+    if need_downloader:
+        downloader_block = f"""\
+if command -v curl >/dev/null 2>&1; then
+    download_cmd='curl -fsSL "${{url}}" -o "${{dest}}"'
+elif command -v wget >/dev/null 2>&1; then
+    download_cmd='wget -q "${{url}}" -O "${{dest}}"'
+elif command -v python3 >/dev/null 2>&1; then
+    download_cmd={shlex.quote(python_download_cmd)}
+else
+    echo 'No downloader found (curl/wget/python3)' >&2
+    exit 1
+fi
-mkdir -p "${{download_root}}"
-fail_log="$(mktemp)"
+fail_log="$(mktemp)"
-download_one() {{
-    file_path="$1"
-    url="$2"
-    dest="${{download_root}}/${{file_path}}"
-    mkdir -p "$(dirname "${{dest}}")"
-    eval "${{download_cmd}}" 2>/dev/null || echo "${{file_path}}" >> "${{fail_log}}"
-}}
+download_one() {{
+    file_path="$1"
+    url="$2"
+    dest="${{download_root}}/${{file_path}}"
+    mkdir -p "$(dirname "${{dest}}")"
+    eval "${{download_cmd}}" 2>/dev/null || echo "${{file_path}}" >> "${{fail_log}}"
+}}"""
+    else:
+        downloader_block = ""
-{download_commands}
+    # The failure-check block is only meaningful when downloads occurred.
+    if need_downloader:
+        wait_block = textwrap.dedent("""\
-wait
+            wait
+            if [ -s "${fail_log}" ]; then
+                mv "${fail_log}" "${download_root}/DOWNLOAD_FAILURES.txt"
+            else
+                rm -f "${fail_log}"
+            fi""")
+    else:
+        wait_block = ""
-if [ -s "${{fail_log}}" ]; then
-    mv "${{fail_log}}" "${{download_root}}/DOWNLOAD_FAILURES.txt"
-else
-    rm -f "${{fail_log}}"
-fi
-exit 0
-"""
-    return textwrap.dedent(script).strip()
+    script = f"""\
+download_root={shlex.quote(root_path)}
+mkdir -p "${{download_root}}"
+{downloader_block}
-@dataclass(frozen=True)
-class AssetDownloadItem:
-    path: str
-    url: str
+{inline_commands}
+{download_commands}
+{wait_block}
+exit 0"""
+    return script
 class AssetDownloadService:
     @staticmethod
-    def build_download_script(items: list[AssetDownloadItem], root_path: str) -> str:
-        # Build a portable shell script to download assets in parallel.
+    def build_download_script(
+        items: list[SandboxDownloadItem],
+        root_path: str,
+    ) -> str:
+        """Build a portable shell script to write inline assets and download remote ones.
+
+        Items with *content* are written first (sequential base64 decode),
+        then items with *url* are fetched in parallel background jobs.
+        The two kinds can be mixed freely in a single list.
+        """
+        inline = [item for item in items if item.content is not None]
+        remote = [item for item in items if item.content is None]
+        inline_commands = _build_inline_commands(inline, "download_root") if inline else ""
         commands: list[str] = []
-        for item in items:
+        for item in remote:
             path = shlex.quote(item.path)
             url = shlex.quote(item.url)
             commands.append(f"download_one {path} {url} &")
         download_commands = "\n".join(commands)
-        return _render_download_script(root_path, download_commands)
+        return _render_download_script(
+            root_path,
+            inline_commands,
+            download_commands,
+            need_downloader=bool(remote),
+        )
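
A usage sketch tying the pieces together; the paths and the URL are
invented, and the constructor defaults follow the entity sketch near the top:

    items = [
        # Inline: compiled .md content travels inside the script itself.
        SandboxDownloadItem(path="skills/SKILL.md", content=b"# compiled skill"),
        # Remote: fetched in a parallel background job via curl/wget/python3.
        SandboxDownloadItem(path="assets/logo.png", url="https://example.com/presigned"),
    ]
    script = AssetDownloadService.build_download_script(items, "/app/assets")
    # The script writes the inline file first, then downloads the remote one;
    # failed paths are collected in ${download_root}/DOWNLOAD_FAILURES.txt.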