Merge branch 'feat/queue-based-graph-engine' into feat/rag-2

This commit is contained in:
-LAN-
2025-09-08 14:30:43 +08:00
828 changed files with 7240 additions and 2951 deletions

View File

@ -19,7 +19,7 @@ class NotionInfo(BaseModel):
tenant_id: str
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data) -> None:
def __init__(self, **data):
super().__init__(**data)
@ -50,5 +50,5 @@ class ExtractSetting(BaseModel):
document_model: Optional[str] = None
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data) -> None:
def __init__(self, **data):
super().__init__(**data)

View File

@ -22,7 +22,6 @@ class FirecrawlApp:
"formats": ["markdown"],
"onlyMainContent": True,
"timeout": 30000,
"integration": "dify",
}
if params:
json_data.update(params)
@ -40,7 +39,7 @@ class FirecrawlApp:
def crawl_url(self, url, params=None) -> str:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
headers = self._prepare_headers()
json_data = {"url": url, "integration": "dify"}
json_data = {"url": url}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
@ -123,7 +122,7 @@ class FirecrawlApp:
return response
return response
def _handle_error(self, response, action) -> None:
def _handle_error(self, response, action):
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return]
@ -138,7 +137,6 @@ class FirecrawlApp:
"timeout": 60000,
"ignoreInvalidURLs": False,
"scrapeOptions": {},
"integration": "dify",
}
if params:
json_data.update(params)

View File

@ -29,7 +29,7 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
"""
import chardet
def read_and_detect(file_path: str) -> list[dict]:
def read_and_detect(file_path: str):
with open(file_path, "rb") as f:
# Read only a sample of the file for encoding detection
# This prevents timeout on large files while still providing accurate encoding detection

View File

@ -335,7 +335,8 @@ class NotionExtractor(BaseExtractor):
last_edited_time = self.get_notion_last_edited_time()
data_source_info = document_model.data_source_info_dict
data_source_info["last_edited_time"] = last_edited_time
if data_source_info:
data_source_info["last_edited_time"] = last_edited_time
db.session.query(DocumentModel).filter_by(id=document_model.id).update(
{DocumentModel.data_source_info: json.dumps(data_source_info)}

View File

@ -9,7 +9,7 @@ class WaterCrawlProvider:
def __init__(self, api_key, base_url: str | None = None):
self.client = WaterCrawlAPIClient(api_key, base_url)
def crawl_url(self, url, options: Optional[dict | Any] = None) -> dict:
def crawl_url(self, url, options: Optional[dict | Any] = None):
options = options or {}
spider_options = {
"max_depth": 1,
@ -41,7 +41,7 @@ class WaterCrawlProvider:
return {"status": "active", "job_id": result.get("uuid")}
def get_crawl_status(self, crawl_request_id) -> dict:
def get_crawl_status(self, crawl_request_id):
response = self.client.get_crawl_request(crawl_request_id)
data = []
if response["status"] in ["new", "running"]:
@ -82,11 +82,11 @@ class WaterCrawlProvider:
return None
def scrape_url(self, url: str) -> dict:
def scrape_url(self, url: str):
response = self.client.scrape_url(url=url, sync=True, prefetched=True)
return self._structure_data(response)
def _structure_data(self, result_object: dict) -> dict:
def _structure_data(self, result_object: dict):
if isinstance(result_object.get("result", {}), str):
raise ValueError("Invalid result object. Expected a dictionary.")

View File

@ -56,7 +56,7 @@ class WordExtractor(BaseExtractor):
elif not os.path.isfile(self.file_path):
raise ValueError(f"File path {self.file_path} is not a valid file or url")
def __del__(self) -> None:
def __del__(self):
if hasattr(self, "temp_file"):
self.temp_file.close()