# dify/api/core/sandbox/bash/session.py

from __future__ import annotations

import json
import logging
import mimetypes
import os
import shlex
from types import TracebackType

from core.file import File, FileTransferMethod, FileType
from core.sandbox.sandbox import Sandbox
from core.session.cli_api import CliApiSession, CliApiSessionManager, CliContext
from core.skill.entities import ToolAccessPolicy
from core.skill.entities.tool_dependencies import ToolDependencies
from core.tools.signature import sign_tool_file
from core.tools.tool_file_manager import ToolFileManager
from core.virtual_environment.__base.helpers import pipeline

from ..bash.dify_cli import DifyCliConfig
from ..entities import DifyCli
from .bash_tool import SandboxBashTool

logger = logging.getLogger(__name__)

# Timeout handed to Sandbox.wait_ready() before any bash command is run.
# NOTE(review): 10 * 60 reads as ten minutes expressed in seconds — confirm the unit.
SANDBOX_READY_TIMEOUT = 10 * 60

# Directory scanned by default when collecting sandbox-generated files.
SANDBOX_OUTPUT_DIR = "output"
# Upper bound on the number of files listed from the sandbox output directory.
MAX_OUTPUT_FILES = 50
# Per-file size ceiling for collection: 10 MiB (10 * 1024 * 1024 bytes).
MAX_OUTPUT_FILE_SIZE = 10 << 20


class SandboxBashSession:
    """Context manager that prepares a sandbox for bash tool execution.

    Entering the context waits for the sandbox to become ready, creates a CLI
    API session, prepares a node-specific tools directory when tool
    dependencies are supplied, and builds the ``SandboxBashTool`` exposed via
    :attr:`bash_tool`. Leaving the context deletes the CLI API session.
    """

    def __init__(self, *, sandbox: Sandbox, node_id: str, tools: ToolDependencies | None) -> None:
        """Store the sandbox, node id and tool dependencies; no I/O happens here.

        Args:
            sandbox: Sandbox whose VM runs the bash commands.
            node_id: Identifier of the node this session belongs to.
            tools: Tool dependencies for the node, or ``None`` when there are none.
        """
        self._sandbox = sandbox
        self._node_id = node_id
        self._tools = tools
        self._bash_tool: SandboxBashTool | None = None
        self._cli_api_session: CliApiSession | None = None
        self._tenant_id = sandbox.tenant_id
        self._user_id = sandbox.user_id
        self._app_id = sandbox.app_id
        self._assets_id = sandbox.assets_id

    def __enter__(self) -> SandboxBashSession:
        """Wait for the sandbox, create the CLI API session and build the bash tool.

        If anything fails after the CLI API session has been created, the
        session is deleted again before the exception propagates.
        """
        # Ensure sandbox initialization completes before any bash commands run.
        self._sandbox.wait_ready(timeout=SANDBOX_READY_TIMEOUT)
        cli = DifyCli(self._sandbox.id)
        self._cli_api_session = CliApiSessionManager().create(
            tenant_id=self._tenant_id,
            user_id=self._user_id,
            context=CliContext(tool_access=ToolAccessPolicy.from_dependencies(self._tools)),
        )
        try:
            if self._tools is not None and not self._tools.is_empty():
                tools_path = self._setup_node_tools_directory(cli, self._node_id, self._tools, self._cli_api_session)
            else:
                tools_path = cli.global_tools_path

            self._bash_tool = SandboxBashTool(
                sandbox=self._sandbox.vm,
                tenant_id=self._tenant_id,
                tools_path=tools_path,
            )
        except BaseException:
            # The `with` statement never calls __exit__ when __enter__ raises, so the
            # session created above would otherwise never be deleted.
            self._release_cli_api_session()
            raise
        return self

    def _setup_node_tools_directory(
        self,
        cli: DifyCli,
        node_id: str,
        tools: ToolDependencies,
        cli_api_session: CliApiSession,
    ) -> str:
        """Create the node tools directory, write the CLI config and run ``init``.

        Args:
            cli: Path provider for the Dify CLI inside the sandbox.
            node_id: Node whose tools directory is being prepared.
            tools: Tool dependencies serialized into the CLI config.
            cli_api_session: Session embedded into the CLI config.

        Returns:
            The node tools path, which is also the working directory used for ``init``.
        """
        node_tools_path = cli.node_tools_path(node_id)
        config_json = json.dumps(
            DifyCliConfig.create(session=cli_api_session, tenant_id=self._tenant_id, tool_deps=tools).model_dump(
                mode="json"
            ),
            ensure_ascii=False,
        )
        # Quoted because the path is interpolated into an `sh -c` command string below.
        config_path = shlex.quote(cli.node_config_path(node_id))

        vm = self._sandbox.vm
        # Merge mkdir + config write into a single pipeline to reduce round-trips.
        (
            pipeline(vm)
            .add(["mkdir", "-p", cli.global_tools_path], error_message="Failed to create global tools dir")
            .add(["mkdir", "-p", node_tools_path], error_message="Failed to create node tools dir")
            # Use a quoted heredoc (<<'EOF') so the shell performs no expansion on the
            # content — safe regardless of $, `, \, or quotes inside the JSON.
            .add(
                ["sh", "-c", f"cat > {config_path} << '__DIFY_CFG__'\n{config_json}\n__DIFY_CFG__"],
                error_message="Failed to write CLI config",
            )
            .execute(raise_on_error=True)
        )

        pipeline(vm, cwd=node_tools_path).add(
            [cli.bin_path, "init"], error_message="Failed to initialize Dify CLI"
        ).execute(raise_on_error=True)

        logger.info(
            "Node %s tools initialized, path=%s, tool_count=%d", node_id, node_tools_path, len(tools.references)
        )
        return node_tools_path

    def _release_cli_api_session(self) -> None:
        """Delete the CLI API session if one exists; failures are logged, never raised."""
        try:
            if self._cli_api_session is not None:
                CliApiSessionManager().delete(self._cli_api_session.id)
                logger.debug("Cleaned up SandboxSession session_id=%s", self._cli_api_session.id)
                self._cli_api_session = None
        except Exception:
            logger.exception("Failed to cleanup SandboxSession")

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> bool:
        """Delete the CLI API session; always returns ``False`` so exceptions propagate."""
        self._release_cli_api_session()
        return False

    @property
    def bash_tool(self) -> SandboxBashTool:
        """The bash tool built in ``__enter__``.

        Raises:
            RuntimeError: If the tool has not been built yet.
        """
        if self._bash_tool is None:
            raise RuntimeError("SandboxSession is not initialized")
        return self._bash_tool

    def collect_output_files(self, output_dir: str = SANDBOX_OUTPUT_DIR) -> list[File]:
        """
        Collect files from sandbox output directory and save them as ToolFiles.

        Scans the specified output directory in sandbox, downloads each file,
        saves it as a ToolFile, and returns a list of File objects. The File
        objects will have valid tool_file_id that can be referenced by subsequent
        nodes via structured output.

        Files larger than ``MAX_OUTPUT_FILE_SIZE`` are skipped, and a file that
        fails to download or save is logged and skipped without aborting the
        rest of the collection.

        Args:
            output_dir: Directory path in sandbox to scan for output files.
                       Defaults to "output" (relative to workspace).

        Returns:
            List of File objects representing the collected files. Empty when
            the directory cannot be listed.
        """
        vm = self._sandbox.vm
        collected_files: list[File] = []

        try:
            file_states = vm.list_files(output_dir, limit=MAX_OUTPUT_FILES)
        except Exception as exc:
            # Output directory may not exist if no files were generated
            logger.debug("Failed to list sandbox output files in %s: %s", output_dir, exc)
            return collected_files

        tool_file_manager = ToolFileManager()

        for file_state in file_states:
            # Skip files that are too large
            if file_state.size > MAX_OUTPUT_FILE_SIZE:
                logger.warning(
                    "Skipping sandbox output file %s: size %d exceeds limit %d",
                    file_state.path,
                    file_state.size,
                    MAX_OUTPUT_FILE_SIZE,
                )
                continue

            try:
                file_obj = self._store_output_file(tool_file_manager, file_state.path)
            except Exception as exc:
                # Best effort: one bad file must not prevent collecting the others.
                logger.warning("Failed to collect sandbox output file %s: %s", file_state.path, exc)
                continue

            collected_files.append(file_obj)
            logger.info(
                "Collected sandbox output file: %s -> tool_file_id=%s",
                file_state.path,
                file_obj.related_id,
            )

        logger.info(
            "Collected %d files from sandbox output directory %s",
            len(collected_files),
            output_dir,
        )
        return collected_files

    def _store_output_file(self, tool_file_manager: ToolFileManager, path: str) -> File:
        """Download one sandbox file, persist it as a ToolFile and wrap it in a File."""
        # The path is passed to download_file exactly as reported by list_files
        # (the original comment gives "output/file.png" as an example).
        file_binary = self._sandbox.vm.download_file(path).getvalue()

        # Determine mime type from extension
        filename = os.path.basename(path)
        mime_type, _ = mimetypes.guess_type(filename)
        if not mime_type:
            mime_type = "application/octet-stream"

        # Save as ToolFile
        tool_file = tool_file_manager.create_file_by_raw(
            user_id=self._user_id,
            tenant_id=self._tenant_id,
            conversation_id=None,
            file_binary=file_binary,
            mimetype=mime_type,
            filename=filename,
        )

        # splitext() returns "" for dotfiles (".bashrc") and "." for a trailing dot
        # ("report."); neither is a usable extension, so fall back to ".bin".
        extension = os.path.splitext(filename)[1]
        if len(extension) <= 1:
            extension = ".bin"
        url = sign_tool_file(tool_file.id, extension)

        # Create File object with tool_file_id as related_id
        return File(
            id=tool_file.id,  # Use tool_file_id as the File id for easy reference
            tenant_id=self._tenant_id,
            type=_get_file_type_from_mime(mime_type),
            transfer_method=FileTransferMethod.TOOL_FILE,
            filename=filename,
            extension=extension,
            mime_type=mime_type,
            size=len(file_binary),
            related_id=tool_file.id,
            url=url,
            storage_key=tool_file.file_key,
        )


def _get_file_type_from_mime(mime_type: str) -> FileType:
    """Map a mime type string to the matching FileType.

    ``image/``, ``video/`` and ``audio/`` prefixes select their media type;
    any mime type containing "text" or "pdf" is treated as a document; all
    remaining values map to ``FileType.CUSTOM``.
    """
    media_prefixes = (
        ("image/", FileType.IMAGE),
        ("video/", FileType.VIDEO),
        ("audio/", FileType.AUDIO),
    )
    for prefix, media_type in media_prefixes:
        if mime_type.startswith(prefix):
            return media_type

    # Substring match (not prefix) so e.g. "application/pdf" counts as a document.
    if any(marker in mime_type for marker in ("text", "pdf")):
        return FileType.DOCUMENT
    return FileType.CUSTOM