mirror of
https://github.com/langgenius/dify.git
synced 2026-05-04 17:38:04 +08:00
feat(firecrawl): follow pagination when crawl status is completed (#33864)
Co-authored-by: Crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
@ -95,15 +95,11 @@ class FirecrawlApp:
|
||||
if response.status_code == 200:
|
||||
crawl_status_response = response.json()
|
||||
if crawl_status_response.get("status") == "completed":
|
||||
total = crawl_status_response.get("total", 0)
|
||||
if total == 0:
|
||||
# Normalize to avoid None bypassing the zero-guard when the API returns null.
|
||||
total = crawl_status_response.get("total") or 0
|
||||
if total <= 0:
|
||||
raise Exception("Failed to check crawl status. Error: No page found")
|
||||
data = crawl_status_response.get("data", [])
|
||||
url_data_list: list[FirecrawlDocumentData] = []
|
||||
for item in data:
|
||||
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
|
||||
url_data = self._extract_common_fields(item)
|
||||
url_data_list.append(url_data)
|
||||
url_data_list = self._collect_all_crawl_pages(crawl_status_response, headers)
|
||||
if url_data_list:
|
||||
file_key = "website_files/" + job_id + ".txt"
|
||||
try:
|
||||
@ -120,6 +116,36 @@ class FirecrawlApp:
|
||||
self._handle_error(response, "check crawl status")
|
||||
raise RuntimeError("unreachable: _handle_error always raises")
|
||||
|
||||
def _collect_all_crawl_pages(
    self, first_page: dict[str, Any], headers: dict[str, str]
) -> list[FirecrawlDocumentData]:
    """Collect crawl results from every result page by following ``next`` links.

    Starts with ``first_page`` (an already-decoded crawl status payload) and
    keeps requesting the URL stored under its ``next`` key until no further
    link is reported or the iteration cap is reached.

    The number of responses processed is capped at the ``total`` value of
    ``first_page`` to guard against infinite loops caused by a misbehaving
    server that keeps returning a ``next`` URL.
    NOTE(review): ``total`` presumably counts crawled documents rather than
    paginated responses, which would make it a loose upper bound -- confirm
    against the Firecrawl API reference.

    Args:
        first_page: Decoded JSON body of the first crawl status response.
        headers: HTTP headers sent with every follow-up request.

    Returns:
        One extracted entry per item that is a dict containing both
        ``metadata`` and ``markdown``; any other item is skipped.

    Raises:
        Exception: Raised via ``_handle_error`` when a follow-up request does
            not return HTTP 200, so that partial data inconsistent with the
            reported total is never returned.
        RuntimeError: If ``_handle_error`` unexpectedly returns instead of
            raising.
    """
    # ``or`` (rather than a ``get`` default) also normalizes an explicit null.
    max_pages: int = first_page.get("total") or 0
    url_data_list: list[FirecrawlDocumentData] = []
    current_page = first_page
    pages_processed = 0

    while True:
        # A page may carry ``"data": null``; treat it like an empty page
        # instead of failing with ``TypeError`` while iterating ``None``.
        for item in current_page.get("data") or []:
            if isinstance(item, dict) and "metadata" in item and "markdown" in item:
                url_data_list.append(self._extract_common_fields(item))

        pages_processed += 1
        next_url: str | None = current_page.get("next")
        if not next_url or pages_processed >= max_pages:
            break

        response = self._get_request(next_url, headers)
        if response.status_code != 200:
            self._handle_error(response, "fetch next crawl page")
            # Defensive: never parse an error body as a result page should
            # the helper ever return instead of raising.
            raise RuntimeError("unreachable: _handle_error always raises")
        current_page = response.json()

    return url_data_list
|
||||
|
||||
def _format_crawl_status_response(
|
||||
self,
|
||||
status: str,
|
||||
|
||||
Reference in New Issue
Block a user