Merge branch 'main' into fix/chore-fix

Yeuoly
2025-01-08 20:36:22 +08:00
203 changed files with 3469 additions and 2327 deletions

View File

@@ -2,14 +2,18 @@ import csv
import io
import json
import logging
import operator
import os
import tempfile
from typing import cast
from collections.abc import Mapping, Sequence
from typing import Any, cast
import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.table import Table
from docx.text.paragraph import Paragraph
from configs import dify_config
from core.file import File, FileTransferMethod, file_manager
@@ -78,6 +82,23 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
                process_data=process_data,
            )
    @classmethod
    def _extract_variable_selector_to_variable_mapping(
        cls,
        *,
        graph_config: Mapping[str, Any],
        node_id: str,
        node_data: DocumentExtractorNodeData,
    ) -> Mapping[str, Sequence[str]]:
        """
        Extract variable selector to variable mapping
        :param graph_config: graph config
        :param node_id: node id
        :param node_data: node data
        :return:
        """
        return {node_id + ".files": node_data.variable_selector}
def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
"""Extract text from a file based on its MIME type."""
@@ -189,35 +210,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
        doc_file = io.BytesIO(file_content)
        doc = docx.Document(doc_file)
        text = []
        # Process paragraphs
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text.append(paragraph.text)
        # Process tables
        for table in doc.tables:
            # Table header
            try:
                # table maybe cause errors so ignore it.
                if len(table.rows) > 0 and table.rows[0].cells is not None:
        # Keep track of paragraph and table positions
        content_items: list[tuple[int, str, Table | Paragraph]] = []
        # Process paragraphs and tables
        for i, paragraph in enumerate(doc.paragraphs):
            if paragraph.text.strip():
                content_items.append((i, "paragraph", paragraph))
        for i, table in enumerate(doc.tables):
            content_items.append((i, "table", table))
        # Sort content items based on their original position
        content_items.sort(key=operator.itemgetter(0))
        # Process sorted content
        for _, item_type, item in content_items:
            if item_type == "paragraph":
                if isinstance(item, Table):
                    continue
                text.append(item.text)
            elif item_type == "table":
                # Process tables
                if not isinstance(item, Table):
                    continue
                try:
                    # Check if any cell in the table has text
                    has_content = False
                    for row in table.rows:
                    for row in item.rows:
                        if any(cell.text.strip() for cell in row.cells):
                            has_content = True
                            break
                    if has_content:
                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
                        for row in table.rows[1:]:
                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
                        for row in item.rows[1:]:
                            # Replace newlines with <br> in each cell
                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
                        text.append(markdown_table)
            except Exception as e:
                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
                continue
                except Exception as e:
                    logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
                    continue
        return "\n".join(text)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
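To see the reworked extraction end to end, here is a rough sketch, assuming python-docx is available and the updated `_extract_text_from_doc` above is in scope; the document contents are invented for the example:

```python
import io

import docx

# Build a tiny in-memory DOCX with one paragraph and one 2x2 table.
buffer = io.BytesIO()
doc = docx.Document()
doc.add_paragraph("Quarterly summary")
table = doc.add_table(rows=2, cols=2)
table.cell(0, 0).text = "Name"
table.cell(0, 1).text = "Role"
table.cell(1, 0).text = "Ada"
table.cell(1, 1).text = "Engineer\nMathematician"  # multi-line cell
doc.save(buffer)

print(_extract_text_from_doc(buffer.getvalue()))
# The output should contain the paragraph text plus a Markdown table in which
# the in-cell newline is rewritten as <br>:
# | Name | Role |
# | --- | --- |
# | Ada | Engineer<br>Mathematician |
```

The sort on the enumeration index keeps paragraphs and tables roughly in document order, which the old two-pass version did not attempt.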

View File

@@ -68,7 +68,22 @@ class HttpRequestNodeData(BaseNodeData):
    Code Node Data.
    """
    method: Literal["get", "post", "put", "patch", "delete", "head"]
    method: Literal[
        "get",
        "post",
        "put",
        "patch",
        "delete",
        "head",
        "options",
        "GET",
        "POST",
        "PUT",
        "PATCH",
        "DELETE",
        "HEAD",
        "OPTIONS",
    ]
    url: str
    authorization: HttpRequestNodeAuthorization
    headers: str
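Since `Literal` validation is case-sensitive, the widened list above is what allows uppercase spellings through model validation. A minimal standalone sketch with a hypothetical model and an abbreviated method list (not the real `HttpRequestNodeData`):

```python
from typing import Literal

from pydantic import BaseModel, ValidationError


class NodeData(BaseModel):
    # Abbreviated stand-in for the widened Literal above.
    method: Literal["get", "post", "GET", "POST"]


NodeData(method="post")  # accepted before and after the change
NodeData(method="POST")  # accepted only because the uppercase variants are listed
try:
    NodeData(method="Post")  # mixed case is still rejected
except ValidationError:
    pass
```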

View File

@@ -37,7 +37,22 @@ BODY_TYPE_TO_CONTENT_TYPE = {
class Executor:
    method: Literal["get", "head", "post", "put", "delete", "patch"]
    method: Literal[
        "get",
        "head",
        "post",
        "put",
        "delete",
        "patch",
        "options",
        "GET",
        "POST",
        "PUT",
        "PATCH",
        "DELETE",
        "HEAD",
        "OPTIONS",
    ]
    url: str
    params: list[tuple[str, str]] | None
    content: str | bytes | None
@@ -67,12 +82,6 @@ class Executor:
                node_data.authorization.config.api_key
            ).text
        # check if node_data.url is a valid URL
        if not node_data.url:
            raise InvalidURLError("url is required")
        if not node_data.url.startswith(("http://", "https://")):
            raise InvalidURLError("url should start with http:// or https://")
        self.url: str = node_data.url
        self.method = node_data.method
        self.auth = node_data.authorization
@@ -99,6 +108,12 @@ class Executor:
    def _init_url(self):
        self.url = self.variable_pool.convert_template(self.node_data.url).text
        # check if url is a valid URL
        if not self.url:
            raise InvalidURLError("url is required")
        if not self.url.startswith(("http://", "https://")):
            raise InvalidURLError("url should start with http:// or https://")
    def _init_params(self):
        """
        Almost same as _init_headers(), difference:
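Moving the check into `_init_url` means it now runs on the converted URL rather than the raw template. A hedged sketch of why that matters, using an invented template string and an assumed conversion result:

```python
raw_url = "{{#start.base_url#}}/v1/ping"      # hypothetical node-level template
resolved = "https://api.example.com/v1/ping"  # what convert_template might yield

# The old check ran against the raw template and would reject it:
assert not raw_url.startswith(("http://", "https://"))
# The relocated check runs against the resolved value and passes:
assert resolved.startswith(("http://", "https://"))
```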
@@ -158,7 +173,10 @@ class Executor:
                    if len(data) != 1:
                        raise RequestBodyError("json body type should have exactly one item")
                    json_string = self.variable_pool.convert_template(data[0].value).text
                    json_object = json.loads(json_string, strict=False)
                    try:
                        json_object = json.loads(json_string, strict=False)
                    except json.JSONDecodeError as e:
                        raise RequestBodyError(f"Failed to parse JSON: {json_string}") from e
                    self.json = json_object
                    # self.json = self._parse_object_contains_variables(json_object)
                case "binary":
@@ -246,7 +264,22 @@ class Executor:
        """
        do http request depending on api bundle
        """
        if self.method not in {"get", "head", "post", "put", "delete", "patch"}:
        if self.method not in {
            "get",
            "head",
            "post",
            "put",
            "delete",
            "patch",
            "options",
            "GET",
            "POST",
            "PUT",
            "PATCH",
            "DELETE",
            "HEAD",
            "OPTIONS",
        }:
            raise InvalidHttpMethodError(f"Invalid http method {self.method}")
        request_args = {
@@ -263,7 +296,7 @@ class Executor:
        }
        # request_args = {k: v for k, v in request_args.items() if v is not None}
        try:
            response = getattr(ssrf_proxy, self.method)(**request_args)
            response = getattr(ssrf_proxy, self.method.lower())(**request_args)
        except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
            raise HttpRequestNodeError(str(e))
        # FIXME: fix type ignore, this maybe httpx type issue
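Lowercasing at the dispatch site matters because the widened `Literal` now lets uppercase methods reach this point, while the helper is looked up by lowercase attribute name. A small self-contained sketch of the same pattern using httpx directly; the URL is a placeholder, and ssrf_proxy is assumed to follow the same lowercase naming convention:

```python
import httpx


def dispatch(method: str, url: str) -> httpx.Response:
    # Normalize case before the attribute lookup; httpx exposes lowercase
    # helpers (get, post, ...), so "POST" would fail without .lower().
    handler = getattr(httpx, method.lower())
    return handler(url)


response = dispatch("POST", "https://example.com/api")
print(response.status_code)
```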