# dify/api/dify_graph/nodes/http_request/entities.py

import codecs
import mimetypes
from collections.abc import Sequence
from dataclasses import dataclass
from email.message import Message
from typing import Any, Literal

import charset_normalizer
import httpx
from pydantic import BaseModel, Field, ValidationInfo, field_validator

from dify_graph.nodes.base import BaseNodeData

# String key "http_request_config". NOTE(review): presumably used elsewhere to look up
# or filter the HTTP request node configuration — confirm against the call sites.
HTTP_REQUEST_CONFIG_FILTER_KEY = "http_request_config"


class HttpRequestNodeAuthorizationConfig(BaseModel):
    """Credential settings stored in ``HttpRequestNodeAuthorization.config``."""

    # Authorization scheme; restricted to "basic", "bearer" or "custom".
    type: Literal["basic", "bearer", "custom"]
    # Required credential string.
    api_key: str
    # Defaults to "". NOTE(review): presumably the header name used by the
    # "custom" scheme — confirm against the code that builds the request.
    header: str = ""


class HttpRequestNodeAuthorization(BaseModel):
    """
    Authorization settings of an HTTP request node.

    ``type`` selects between no authorization ("no-auth") and API-key
    authorization ("api-key"); ``config`` carries the API-key settings.
    """

    type: Literal["no-auth", "api-key"]
    config: HttpRequestNodeAuthorizationConfig | None = None

    @field_validator("config", mode="before")
    @classmethod
    def check_config(cls, v: Any, values: ValidationInfo):
        """
        Check config, if type is no-auth, config should be None, otherwise it should be a dict.

        This is a "before" validator, so ``v`` is the raw input for ``config``.

        Returns:
            ``None`` when ``type`` is "no-auth" (any supplied config is discarded),
            otherwise ``v`` unchanged.

        Raises:
            ValueError: if ``type`` is not "no-auth" and ``v`` is empty or not a dict.
        """
        # ``values.data`` only contains fields that validated successfully. When
        # ``type`` itself was rejected it is absent, so use ``.get`` to let pydantic
        # report a ValidationError instead of leaking a KeyError from this validator.
        if values.data.get("type") == "no-auth":
            return None

        if not v or not isinstance(v, dict):
            raise ValueError("config should be a dict")

        return v


class BodyData(BaseModel):
    """One entry of an HTTP request body (see ``HttpRequestNodeBody.data``)."""

    # Entry name; defaults to "".
    key: str = ""
    # Entry kind; restricted to "file" or "text".
    type: Literal["file", "text"]
    # Defaults to "". NOTE(review): presumably the text payload of a "text" entry — confirm.
    value: str = ""
    # Defaults to an empty list. NOTE(review): presumably a reference/selector
    # identifying the file of a "file" entry — confirm against the executor.
    file: Sequence[str] = Field(default_factory=list)


class HttpRequestNodeBody(BaseModel):
    """Body of an HTTP request: the body type plus its list of data entries."""

    type: Literal["none", "form-data", "x-www-form-urlencoded", "raw-text", "json", "binary"]
    data: Sequence[BodyData] = Field(default_factory=list)

    @field_validator("data", mode="before")
    @classmethod
    def check_data(cls, v: Any):
        """
        Normalize the raw ``data`` input before field validation.

        For compatibility, an unset/empty body becomes an empty list and a
        non-empty plain string is wrapped into a single text entry. Any other
        value is passed through untouched.
        """
        # A non-empty string is the legacy form of a single text body.
        if isinstance(v, str) and v:
            return [BodyData(key="", type="text", value=v)]
        # Every falsy input (None, "", [], ...) collapses to a fresh empty list.
        return v if v else []


class HttpRequestNodeTimeout(BaseModel):
    """
    Optional connect/read/write timeout values for an HTTP request.

    Every value defaults to ``None``.
    NOTE(review): the unit is presumably seconds — confirm against the executor.
    """

    connect: int | None = None
    read: int | None = None
    write: int | None = None


@dataclass(frozen=True, slots=True)
class HttpRequestNodeConfig:
    max_connect_timeout: int
    max_read_timeout: int
    max_write_timeout: int
    max_binary_size: int
    max_text_size: int
    ssl_verify: bool
    ssrf_default_max_retries: int

    def default_timeout(self) -> "HttpRequestNodeTimeout":
        return HttpRequestNodeTimeout(
            connect=self.max_connect_timeout,
            read=self.max_read_timeout,
            write=self.max_write_timeout,
        )


class HttpRequestNodeData(BaseNodeData):
    """
    HTTP Request Node Data.

    Declares the request method, URL, authorization, headers, query params,
    and the optional body, timeout and SSL-verification override of an HTTP
    request node.
    """

    # HTTP method; both the lowercase and the uppercase spelling are accepted.
    method: Literal[
        "get",
        "post",
        "put",
        "patch",
        "delete",
        "head",
        "options",
        "GET",
        "POST",
        "PUT",
        "PATCH",
        "DELETE",
        "HEAD",
        "OPTIONS",
    ]
    url: str
    authorization: HttpRequestNodeAuthorization
    # Headers and params are stored as plain strings.
    # NOTE(review): the expected text format is not visible here — confirm with the parser.
    headers: str
    params: str
    # Optional parts; each defaults to None when not provided.
    body: HttpRequestNodeBody | None = None
    timeout: HttpRequestNodeTimeout | None = None
    ssl_verify: bool | None = None


class Response:
    """
    Convenience wrapper around an ``httpx.Response``.

    Keeps a plain ``dict`` copy of the response headers and exposes helpers
    for file detection, text decoding and size reporting. Header lookups in
    this class use lowercase header names.
    """

    headers: dict[str, str]
    response: httpx.Response
    _cached_text: str | None

    def __init__(self, response: httpx.Response):
        self.response = response
        self.headers = dict(response.headers)
        # Filled lazily by the ``text`` property.
        self._cached_text = None

    @property
    def is_file(self):
        """
        Determine if the response contains a file by checking:
        1. Content-Disposition header (RFC 6266)
        2. Content characteristics
        3. MIME type analysis

        Returns:
            ``True`` when the response should be treated as a file, ``False`` otherwise.
        """
        # Drop parameters such as "; charset=utf-8" and normalize case.
        content_type = self.content_type.split(";")[0].strip().lower()
        parsed_content_disposition = self.parsed_content_disposition

        # Check if it's explicitly marked as an attachment
        if parsed_content_disposition:
            disp_type = parsed_content_disposition.get_content_disposition()  # Returns 'attachment', 'inline', or None
            filename = parsed_content_disposition.get_filename()  # Returns filename if present, None otherwise
            if disp_type == "attachment" or filename is not None:
                return True

        # For 'text/' types, only 'csv' should be downloaded as file
        if content_type.startswith("text/") and "csv" not in content_type:
            return False

        # For application types, try to detect if it's a text-based format
        if content_type.startswith("application/"):
            # Common text-based application types
            if any(
                text_type in content_type
                for text_type in ("json", "xml", "javascript", "x-www-form-urlencoded", "yaml", "graphql")
            ):
                return False

            # Try to detect if content is text-based by sampling first few bytes
            content = self.response.content
            sample_size = 1024
            content_sample = content[:sample_size]
            try:
                # Decode incrementally: when the sample is only a prefix of a longer
                # body, the cut may land inside a multi-byte UTF-8 character. With
                # ``final=False`` such an incomplete trailing sequence is tolerated,
                # while genuinely invalid bytes still raise UnicodeDecodeError.
                codecs.getincrementaldecoder("utf-8")().decode(
                    content_sample, final=len(content) <= sample_size
                )
            except UnicodeDecodeError:
                # If we can't decode as UTF-8, likely a binary file
                return True

            # If we can decode as UTF-8 and find common text patterns, likely not a file
            text_markers = (b"{", b"[", b"<", b"function", b"var ", b"const ", b"let ")
            if any(marker in content_sample for marker in text_markers):
                return False

        # For other types, use MIME type analysis
        main_type, _ = mimetypes.guess_type("dummy" + (mimetypes.guess_extension(content_type) or ""))
        if main_type:
            return main_type.split("/")[0] in ("application", "image", "audio", "video")

        # For unknown types, check if it's a media type
        return any(media_type in content_type for media_type in ("image/", "audio/", "video/"))

    @property
    def content_type(self) -> str:
        """Raw ``content-type`` header value, or "" when the header is absent."""
        return self.headers.get("content-type", "")

    @property
    def text(self) -> str:
        """
        Get response text with robust encoding detection.

        Uses charset_normalizer for better encoding detection than httpx's default,
        which helps handle Chinese and other non-ASCII characters properly.
        The decoded text is cached on the instance after the first access.
        """
        # Check cache first (getattr keeps instances created without __init__ working)
        cached = getattr(self, "_cached_text", None)
        if cached is not None:
            return cached

        raw = self.response.content

        # Try charset_normalizer for robust encoding detection first
        detected_encoding = charset_normalizer.from_bytes(raw).best()
        if detected_encoding and detected_encoding.encoding:
            try:
                text = raw.decode(detected_encoding.encoding)
            except (UnicodeDecodeError, TypeError, LookupError):
                # Fallback to httpx's encoding detection if charset_normalizer fails
                pass
            else:
                self._cached_text = text
                return text

        # Fallback to httpx's built-in encoding detection
        text = self.response.text
        self._cached_text = text
        return text

    @property
    def content(self) -> bytes:
        """Raw response body bytes."""
        return self.response.content

    @property
    def status_code(self) -> int:
        """HTTP status code of the wrapped response."""
        return self.response.status_code

    @property
    def size(self) -> int:
        """Length of the response body in bytes."""
        return len(self.content)

    @property
    def readable_size(self) -> str:
        """Body size formatted as bytes, KB or MB (1024-based, two decimals for KB/MB)."""
        size = self.size
        if size < 1024:
            return f"{size} bytes"
        elif size < 1024 * 1024:
            return f"{(size / 1024):.2f} KB"
        else:
            return f"{(size / 1024 / 1024):.2f} MB"

    @property
    def parsed_content_disposition(self) -> Message | None:
        """
        Parse the ``content-disposition`` header.

        Returns:
            An ``email.message.Message`` carrying the header so its disposition
            type and filename can be queried, or ``None`` when the header is
            missing or empty.
        """
        content_disposition = self.headers.get("content-disposition", "")
        if content_disposition:
            msg = Message()
            msg["content-disposition"] = content_disposition
            return msg
        return None