mirror of https://github.com/langgenius/dify.git
synced 2026-05-04 01:18:05 +08:00
Merge branch 'feat/queue-based-graph-engine' into feat/rag-2
@@ -19,7 +19,7 @@ class NotionInfo(BaseModel):
     tenant_id: str
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data) -> None:
+    def __init__(self, **data):
         super().__init__(**data)
@@ -50,5 +50,5 @@ class ExtractSetting(BaseModel):
     document_model: Optional[str] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **data) -> None:
+    def __init__(self, **data):
         super().__init__(**data)
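
Both hunks above make the same change: the explicit `-> None` return annotation on a pydantic `__init__` override is dropped. The two spellings are identical at runtime; the annotation only matters to static checkers such as mypy, which skip unannotated function bodies by default. A minimal sketch of the pattern, assuming pydantic v2 (`Holder` is a hypothetical stand-in for NotionInfo/ExtractSetting):

    from pydantic import BaseModel, ConfigDict


    class Holder(BaseModel):
        # Same config as the diff: permit non-pydantic types as field values.
        model_config = ConfigDict(arbitrary_types_allowed=True)

        tenant_id: str

        def __init__(self, **data) -> None:
            # Behaviorally identical to "def __init__(self, **data):";
            # only type checkers see the difference.
            super().__init__(**data)


    holder = Holder(tenant_id="t-123")  # hypothetical value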
@@ -22,7 +22,6 @@ class FirecrawlApp:
             "formats": ["markdown"],
             "onlyMainContent": True,
             "timeout": 30000,
-            "integration": "dify",
         }
         if params:
             json_data.update(params)
@@ -40,7 +39,7 @@ class FirecrawlApp:
     def crawl_url(self, url, params=None) -> str:
         # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
         headers = self._prepare_headers()
-        json_data = {"url": url, "integration": "dify"}
+        json_data = {"url": url}
         if params:
             json_data.update(params)
         response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
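
With the `integration` tag no longer injected, `crawl_url` sends only `{"url": url}` merged with whatever `params` supplies, so a caller who still wants the tag must pass it explicitly. A hedged usage sketch; the constructor arguments are assumptions, not confirmed by this diff:

    app = FirecrawlApp(api_key="fc-...", base_url="https://api.firecrawl.dev")  # assumed shape

    # params is merged into the payload via json_data.update(params),
    # so the tag can be restored by the caller if needed:
    job_id = app.crawl_url("https://example.com", params={"limit": 10, "integration": "dify"})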
@@ -123,7 +122,7 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _handle_error(self, response, action) -> None:
+    def _handle_error(self, response, action):
         error_message = response.json().get("error", "Unknown error occurred")
         raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")  # type: ignore[return]
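
`_handle_error` unconditionally raises, so dropping `-> None` changes nothing observable; the `# type: ignore[return]` on the raise line is likewise only for static checkers. For a helper that never returns, annotating it `-> NoReturn` (from `typing`) is the idiomatic alternative, since it tells checkers that calls never fall through. A minimal sketch, assuming `requests`:

    from typing import NoReturn

    import requests


    def handle_error(response: requests.Response, action: str) -> NoReturn:
        # Prefer the JSON error body, falling back to a generic message.
        error_message = response.json().get("error", "Unknown error occurred")
        raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")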
@@ -138,7 +137,6 @@ class FirecrawlApp:
             "timeout": 60000,
             "ignoreInvalidURLs": False,
             "scrapeOptions": {},
-            "integration": "dify",
         }
         if params:
             json_data.update(params)
@@ -29,7 +29,7 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
     """
     import chardet
 
-    def read_and_detect(file_path: str) -> list[dict]:
+    def read_and_detect(file_path: str):
         with open(file_path, "rb") as f:
             # Read only a sample of the file for encoding detection
             # This prevents timeout on large files while still providing accurate encoding detection
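
The nested `read_and_detect` reads only a bounded sample, which is why detection stays fast on large files; `chardet.detect` accepts raw bytes and returns a dict with `encoding`, `confidence`, and `language` keys. A standalone sketch of the same idea (`detect_sample_encoding` is illustrative, and the `1024 * 1024` default is an assumption, since the hunk header truncates after `sample_size: int = 1`):

    import chardet


    def detect_sample_encoding(file_path: str, sample_size: int = 1024 * 1024) -> list[dict]:
        # A bounded read keeps detection fast on large files while the
        # sample is usually representative of the whole file's encoding.
        with open(file_path, "rb") as f:
            sample = f.read(sample_size)
        return [chardet.detect(sample)]  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}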
@@ -335,7 +335,8 @@ class NotionExtractor(BaseExtractor):
 
         last_edited_time = self.get_notion_last_edited_time()
         data_source_info = document_model.data_source_info_dict
-        data_source_info["last_edited_time"] = last_edited_time
+        if data_source_info:
+            data_source_info["last_edited_time"] = last_edited_time
 
         db.session.query(DocumentModel).filter_by(id=document_model.id).update(
             {DocumentModel.data_source_info: json.dumps(data_source_info)}
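
The new guard is the one behavioral change in this section: `data_source_info_dict` can plausibly be empty or `None`, and item assignment on `None` raises `TypeError: 'NoneType' object does not support item assignment`. A minimal reproduction of the failure the check avoids, with hypothetical values:

    data_source_info = None  # e.g. a document row with no stored source info

    if data_source_info:  # the added guard; without it the next line raises TypeError
        data_source_info["last_edited_time"] = "2024-01-01T00:00:00.000Z"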
@@ -9,7 +9,7 @@ class WaterCrawlProvider:
     def __init__(self, api_key, base_url: str | None = None):
         self.client = WaterCrawlAPIClient(api_key, base_url)
 
-    def crawl_url(self, url, options: Optional[dict | Any] = None) -> dict:
+    def crawl_url(self, url, options: Optional[dict | Any] = None):
         options = options or {}
         spider_options = {
             "max_depth": 1,
@@ -41,7 +41,7 @@ class WaterCrawlProvider:
 
         return {"status": "active", "job_id": result.get("uuid")}
 
-    def get_crawl_status(self, crawl_request_id) -> dict:
+    def get_crawl_status(self, crawl_request_id):
         response = self.client.get_crawl_request(crawl_request_id)
         data = []
         if response["status"] in ["new", "running"]:
@@ -82,11 +82,11 @@ class WaterCrawlProvider:
 
         return None
 
-    def scrape_url(self, url: str) -> dict:
+    def scrape_url(self, url: str):
         response = self.client.scrape_url(url=url, sync=True, prefetched=True)
         return self._structure_data(response)
 
-    def _structure_data(self, result_object: dict) -> dict:
+    def _structure_data(self, result_object: dict):
         if isinstance(result_object.get("result", {}), str):
             raise ValueError("Invalid result object. Expected a dictionary.")
 
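
Across the WaterCrawl hunks only return annotations are removed; the call flow through `WaterCrawlAPIClient` is untouched. A hedged usage sketch of that flow, with the API key and base URL as placeholder assumptions:

    provider = WaterCrawlProvider(api_key="wc-...", base_url="https://app.watercrawl.dev")  # assumed URL

    job = provider.crawl_url("https://example.com")        # -> {"status": "active", "job_id": ...}
    status = provider.get_crawl_status(job["job_id"])      # wraps client.get_crawl_request(...)
    page = provider.scrape_url("https://example.com")      # sync scrape, normalized by _structure_data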
@@ -56,7 +56,7 @@ class WordExtractor(BaseExtractor):
         elif not os.path.isfile(self.file_path):
             raise ValueError(f"File path {self.file_path} is not a valid file or url")
 
-    def __del__(self) -> None:
+    def __del__(self):
         if hasattr(self, "temp_file"):
             self.temp_file.close()
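
The `hasattr` check in `__del__` is load-bearing regardless of the annotation: `__del__` also runs on instances that never assigned `temp_file` (local files, or objects whose `__init__` raised first), so closing unconditionally could itself raise. A minimal sketch of the pattern (`Extractor` is hypothetical):

    import tempfile


    class Extractor:
        def __init__(self, remote: bool):
            if remote:
                # Only remote inputs are spooled to a temp file, so the
                # attribute does not exist on every instance.
                self.temp_file = tempfile.NamedTemporaryFile()

        def __del__(self):
            # Guard against instances that never created temp_file.
            if hasattr(self, "temp_file"):
                self.temp_file.close()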