merge feat/plugins

2026-04-30 07:28:05 +08:00 · 2024-12-27 14:21:32 +08:00
parent e7a6bc0ec9 df5fb6dca9
commit 1cc15d1ce8
324 changed files with 15188 additions and 8023 deletions
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@ -4,7 +4,7 @@ import os
 from typing import Optional, cast

 import pandas as pd
-from openpyxl import load_workbook
+from openpyxl import load_workbook  # type: ignore

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@ -24,7 +24,6 @@ from core.rag.extractor.unstructured.unstructured_markdown_extractor import Unst
 from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
-from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
 from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
 from core.rag.extractor.word_extractor import WordExtractor
 from core.rag.models.document import Document
@ -103,12 +102,11 @@ class ExtractProcessor:
                input_file = Path(file_path)
                file_extension = input_file.suffix.lower()
                etl_type = dify_config.ETL_TYPE
-                unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
-                unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
-                assert unstructured_api_url is not None, "unstructured_api_url is required"
-                assert unstructured_api_key is not None, "unstructured_api_key is required"
                extractor: Optional[BaseExtractor] = None
                if etl_type == "Unstructured":
+                    unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
+                    unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
+
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == ".pdf":
@ -141,11 +139,7 @@ class ExtractProcessor:
                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    else:
                        # txt
-                        extractor = (
-                            UnstructuredTextExtractor(file_path, unstructured_api_url)
-                            if is_automatic
-                            else TextExtractor(file_path, autodetect_encoding=True)
-                        )
+                        extractor = TextExtractor(file_path, autodetect_encoding=True)
                else:
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@ -1,5 +1,6 @@
 import base64
 import logging
+from typing import Optional

 from bs4 import BeautifulSoup  # type: ignore

@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
        self,
        file_path: str,
        api_url: Optional[str] = None,
-        api_key: Optional[str] = None,
+        api_key: str = "",
    ):
        """Initialize with file path."""
        self._file_path = file_path
@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
        if self._api_url:
            from unstructured.partition.api import partition_via_api

-            if self._api_key is None:
-                raise ValueError("api_key is required")
-
            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.epub import partition_epub
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
            if the specified encoding fails.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@ -1,4 +1,5 @@
 import logging
+from typing import Optional

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

-    def __init__(self, file_path: str, api_url: str, api_key: str):
+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@ -267,8 +267,10 @@ class WordExtractor(BaseExtractor):
                if isinstance(element.tag, str) and element.tag.endswith("p"):  # paragraph
                    para = paragraphs.pop(0)
                    parsed_paragraph = parse_paragraph(para)
-                    if parsed_paragraph:
+                    if parsed_paragraph.strip():
                        content.append(parsed_paragraph)
+                    else:
+                        content.append("\n")
                elif isinstance(element.tag, str) and element.tag.endswith("tbl"):  # table
                    table = tables.pop(0)
                    content.append(self._table_to_markdown(table, image_map))