Merge branch 'feat/queue-based-graph-engine' into feat/rag-2

2026-05-04 01:18:05 +08:00 · 2025-09-08 14:30:43 +08:00
parent b48c266908 299141ae01
commit 23cd615489
828 changed files with 7240 additions and 2951 deletions
--- a/api/core/rag/extractor/entity/extract_setting.py
+++ b/api/core/rag/extractor/entity/extract_setting.py
@ -19,7 +19,7 @@ class NotionInfo(BaseModel):
    tenant_id: str
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data) -> None:
+    def __init__(self, **data):
        super().__init__(**data)


@ -50,5 +50,5 @@ class ExtractSetting(BaseModel):
    document_model: Optional[str] = None
    model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data) -> None:
+    def __init__(self, **data):
        super().__init__(**data)
--- a/api/core/rag/extractor/firecrawl/firecrawl_app.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py
@ -22,7 +22,6 @@ class FirecrawlApp:
            "formats": ["markdown"],
            "onlyMainContent": True,
            "timeout": 30000,
-            "integration": "dify",
        }
        if params:
            json_data.update(params)
@ -40,7 +39,7 @@ class FirecrawlApp:
    def crawl_url(self, url, params=None) -> str:
        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
        headers = self._prepare_headers()
-        json_data = {"url": url, "integration": "dify"}
+        json_data = {"url": url}
        if params:
            json_data.update(params)
        response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
@ -123,7 +122,7 @@ class FirecrawlApp:
                return response
        return response

-    def _handle_error(self, response, action) -> None:
+    def _handle_error(self, response, action):
        error_message = response.json().get("error", "Unknown error occurred")
        raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")  # type: ignore[return]

@ -138,7 +137,6 @@ class FirecrawlApp:
            "timeout": 60000,
            "ignoreInvalidURLs": False,
            "scrapeOptions": {},
-            "integration": "dify",
        }
        if params:
            json_data.update(params)
--- a/api/core/rag/extractor/helpers.py
+++ b/api/core/rag/extractor/helpers.py
@ -29,7 +29,7 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
    """
    import chardet

-    def read_and_detect(file_path: str) -> list[dict]:
+    def read_and_detect(file_path: str):
        with open(file_path, "rb") as f:
            # Read only a sample of the file for encoding detection
            # This prevents timeout on large files while still providing accurate encoding detection
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@ -335,7 +335,8 @@ class NotionExtractor(BaseExtractor):

        last_edited_time = self.get_notion_last_edited_time()
        data_source_info = document_model.data_source_info_dict
-        data_source_info["last_edited_time"] = last_edited_time
+        if data_source_info:
+            data_source_info["last_edited_time"] = last_edited_time

        db.session.query(DocumentModel).filter_by(id=document_model.id).update(
            {DocumentModel.data_source_info: json.dumps(data_source_info)}
--- a/api/core/rag/extractor/watercrawl/provider.py
+++ b/api/core/rag/extractor/watercrawl/provider.py
@ -9,7 +9,7 @@ class WaterCrawlProvider:
    def __init__(self, api_key, base_url: str | None = None):
        self.client = WaterCrawlAPIClient(api_key, base_url)

-    def crawl_url(self, url, options: Optional[dict | Any] = None) -> dict:
+    def crawl_url(self, url, options: Optional[dict | Any] = None):
        options = options or {}
        spider_options = {
            "max_depth": 1,
@ -41,7 +41,7 @@ class WaterCrawlProvider:

        return {"status": "active", "job_id": result.get("uuid")}

-    def get_crawl_status(self, crawl_request_id) -> dict:
+    def get_crawl_status(self, crawl_request_id):
        response = self.client.get_crawl_request(crawl_request_id)
        data = []
        if response["status"] in ["new", "running"]:
@ -82,11 +82,11 @@ class WaterCrawlProvider:

        return None

-    def scrape_url(self, url: str) -> dict:
+    def scrape_url(self, url: str):
        response = self.client.scrape_url(url=url, sync=True, prefetched=True)
        return self._structure_data(response)

-    def _structure_data(self, result_object: dict) -> dict:
+    def _structure_data(self, result_object: dict):
        if isinstance(result_object.get("result", {}), str):
            raise ValueError("Invalid result object. Expected a dictionary.")

--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@ -56,7 +56,7 @@ class WordExtractor(BaseExtractor):
        elif not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file or url")

-    def __del__(self) -> None:
+    def __del__(self):
        if hasattr(self, "temp_file"):
            self.temp_file.close()