feat(firecrawl): follow pagination when crawl status is completed (#33864)

Co-authored-by: Crazywoola <100913391+crazywoola@users.noreply.github.com>
Author: kurokobo
Date: 2026-03-23 22:19:32 +09:00
Committed by: GitHub
Parent: 30dd36505c
Commit: 30deeb6f1c
2 changed files with 112 additions and 8 deletions


@@ -95,15 +95,11 @@ class FirecrawlApp:
         if response.status_code == 200:
             crawl_status_response = response.json()
             if crawl_status_response.get("status") == "completed":
-                total = crawl_status_response.get("total", 0)
-                if total == 0:
+                # Normalize to avoid None bypassing the zero-guard when the API returns null.
+                total = crawl_status_response.get("total") or 0
+                if total <= 0:
                     raise Exception("Failed to check crawl status. Error: No page found")
-                data = crawl_status_response.get("data", [])
-                url_data_list: list[FirecrawlDocumentData] = []
-                for item in data:
-                    if isinstance(item, dict) and "metadata" in item and "markdown" in item:
-                        url_data = self._extract_common_fields(item)
-                        url_data_list.append(url_data)
+                url_data_list = self._collect_all_crawl_pages(crawl_status_response, headers)
                 if url_data_list:
                     file_key = "website_files/" + job_id + ".txt"
                     try:
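
The fix in the hunk above hinges on a Python subtlety: `dict.get("total", 0)` returns the default only when the key is absent, while a JSON `null` arrives as `None` and slips past an `== 0` check. A minimal standalone sketch of the difference (not part of the commit):

```python
# json.loads maps a JSON null to None, and dict.get's default applies
# only when the key is missing, not when its value is None.
payload = {"status": "completed", "total": None}

total = payload.get("total", 0)    # -> None: key exists, default unused
print(total == 0)                  # False: None bypasses the zero-guard

total = payload.get("total") or 0  # -> 0: `or` normalizes falsy None
print(total <= 0)                  # True: the guard now raises as intended
```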
@@ -120,6 +116,36 @@ class FirecrawlApp:
             self._handle_error(response, "check crawl status")
             raise RuntimeError("unreachable: _handle_error always raises")
 
+    def _collect_all_crawl_pages(
+        self, first_page: dict[str, Any], headers: dict[str, str]
+    ) -> list[FirecrawlDocumentData]:
+        """Collect all crawl result pages by following pagination links.
+
+        Raises an exception if any paginated request fails, to avoid returning
+        partial data that is inconsistent with the reported total.
+
+        The number of pages processed is capped at ``total`` (the
+        server-reported page count) to guard against infinite loops caused by
+        a misbehaving server that keeps returning a ``next`` URL.
+        """
+        total: int = first_page.get("total") or 0
+        url_data_list: list[FirecrawlDocumentData] = []
+        current_page = first_page
+        pages_processed = 0
+        while True:
+            for item in current_page.get("data", []):
+                if isinstance(item, dict) and "metadata" in item and "markdown" in item:
+                    url_data_list.append(self._extract_common_fields(item))
+            next_url: str | None = current_page.get("next")
+            pages_processed += 1
+            if not next_url or pages_processed >= total:
+                break
+            response = self._get_request(next_url, headers)
+            if response.status_code != 200:
+                self._handle_error(response, "fetch next crawl page")
+            current_page = response.json()
+        return url_data_list
+
     def _format_crawl_status_response(
         self,
         status: str,
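
For reference, here is a standalone sketch of the pagination pattern `_collect_all_crawl_pages` implements, with `requests` standing in for the app's `_get_request` helper and hypothetical names elsewhere; only the `"data"`, `"next"`, and `"total"` response keys are taken from the commit:

```python
import requests

def collect_pages(first_page: dict, headers: dict[str, str]) -> list[dict]:
    """Follow "next" links, capped by the server-reported total."""
    total = first_page.get("total") or 0
    items: list[dict] = []
    page = first_page
    pages_processed = 0
    while True:
        items.extend(page.get("data", []))
        pages_processed += 1
        next_url = page.get("next")
        # Stop when the server stops linking, or once the cap is hit,
        # so a server that keeps returning "next" cannot loop forever.
        if not next_url or pages_processed >= total:
            break
        response = requests.get(next_url, headers=headers, timeout=30)
        # Fail loudly instead of returning partial data that would be
        # inconsistent with the reported total.
        response.raise_for_status()
        page = response.json()
    return items
```

The `pages_processed >= total` cap trades completeness for termination: at most `total` pages are fetched even if the server keeps advertising a `next` URL.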