feat(firecrawl): follow pagination when crawl status is completed (#33864)

Co-authored-by: Crazywoola <100913391+crazywoola@users.noreply.github.com>
Author: kurokobo
Date: 2026-03-23 22:19:32 +09:00
Committed by: GitHub
Parent: 30dd36505c
Commit: 30deeb6f1c
2 changed files with 112 additions and 8 deletions


@@ -95,15 +95,11 @@ class FirecrawlApp:
         if response.status_code == 200:
             crawl_status_response = response.json()
             if crawl_status_response.get("status") == "completed":
-                total = crawl_status_response.get("total", 0)
-                if total == 0:
+                # Normalize to avoid None bypassing the zero-guard when the API returns null.
+                total = crawl_status_response.get("total") or 0
+                if total <= 0:
                     raise Exception("Failed to check crawl status. Error: No page found")
-                data = crawl_status_response.get("data", [])
-                url_data_list: list[FirecrawlDocumentData] = []
-                for item in data:
-                    if isinstance(item, dict) and "metadata" in item and "markdown" in item:
-                        url_data = self._extract_common_fields(item)
-                        url_data_list.append(url_data)
+                url_data_list = self._collect_all_crawl_pages(crawl_status_response, headers)
                 if url_data_list:
                     file_key = "website_files/" + job_id + ".txt"
                     try:
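
The fix in the hunk above hinges on a Python subtlety: `dict.get("total", 0)` returns the default only when the key is absent, while a JSON `null` arrives as `None` and slips past an `== 0` check. A minimal standalone sketch of the difference (not part of the commit):

```python
# json.loads maps a JSON null to None, and dict.get's default applies
# only when the key is missing, not when its value is None.
payload = {"status": "completed", "total": None}

total = payload.get("total", 0)    # -> None: key exists, default unused
print(total == 0)                  # False: None bypasses the zero-guard

total = payload.get("total") or 0  # -> 0: `or` normalizes falsy None
print(total <= 0)                  # True: the guard now raises as intended
```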
@@ -120,6 +116,36 @@ class FirecrawlApp:
             self._handle_error(response, "check crawl status")
             raise RuntimeError("unreachable: _handle_error always raises")
 
+    def _collect_all_crawl_pages(
+        self, first_page: dict[str, Any], headers: dict[str, str]
+    ) -> list[FirecrawlDocumentData]:
+        """Collect all crawl result pages by following pagination links.
+
+        Raises an exception if any paginated request fails, to avoid returning
+        partial data that is inconsistent with the reported total.
+
+        The number of pages processed is capped at ``total`` (the
+        server-reported page count) to guard against infinite loops caused by
+        a misbehaving server that keeps returning a ``next`` URL.
+        """
+        total: int = first_page.get("total") or 0
+        url_data_list: list[FirecrawlDocumentData] = []
+        current_page = first_page
+        pages_processed = 0
+        while True:
+            for item in current_page.get("data", []):
+                if isinstance(item, dict) and "metadata" in item and "markdown" in item:
+                    url_data_list.append(self._extract_common_fields(item))
+            next_url: str | None = current_page.get("next")
+            pages_processed += 1
+            if not next_url or pages_processed >= total:
+                break
+            response = self._get_request(next_url, headers)
+            if response.status_code != 200:
+                self._handle_error(response, "fetch next crawl page")
+            current_page = response.json()
+        return url_data_list
+
     def _format_crawl_status_response(
         self,
         status: str,
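
For reference, here is a standalone sketch of the pagination pattern `_collect_all_crawl_pages` implements, with `requests` standing in for the app's `_get_request` helper and hypothetical names elsewhere; only the `"data"`, `"next"`, and `"total"` response keys are taken from the commit:

```python
import requests

def collect_pages(first_page: dict, headers: dict[str, str]) -> list[dict]:
    """Follow "next" links, capped by the server-reported total."""
    total = first_page.get("total") or 0
    items: list[dict] = []
    page = first_page
    pages_processed = 0
    while True:
        items.extend(page.get("data", []))
        pages_processed += 1
        next_url = page.get("next")
        # Stop when the server stops linking, or once the cap is hit,
        # so a server that keeps returning "next" cannot loop forever.
        if not next_url or pages_processed >= total:
            break
        response = requests.get(next_url, headers=headers, timeout=30)
        # Fail loudly instead of returning partial data that would be
        # inconsistent with the reported total.
        response.raise_for_status()
        page = response.json()
    return items
```

The `pages_processed >= total` cap trades completeness for termination: at most `total` pages are fetched even if the server keeps advertising a `next` URL.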