diff --git a/api/core/rag/extractor/firecrawl/firecrawl_app.py b/api/core/rag/extractor/firecrawl/firecrawl_app.py index 789ac8557d..5d6223db06 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_app.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py @@ -25,7 +25,7 @@ class FirecrawlApp: } if params: json_data.update(params) - response = self._post_request(f"{self.base_url}/v2/scrape", json_data, headers) + response = self._post_request(self._build_url("v2/scrape"), json_data, headers) if response.status_code == 200: response_data = response.json() data = response_data["data"] @@ -42,7 +42,7 @@ class FirecrawlApp: json_data = {"url": url} if params: json_data.update(params) - response = self._post_request(f"{self.base_url}/v2/crawl", json_data, headers) + response = self._post_request(self._build_url("v2/crawl"), json_data, headers) if response.status_code == 200: # There's also another two fields in the response: "success" (bool) and "url" (str) job_id = response.json().get("id") @@ -58,7 +58,7 @@ class FirecrawlApp: if params: # Pass through provided params, including optional "sitemap": "only" | "include" | "skip" json_data.update(params) - response = self._post_request(f"{self.base_url}/v2/map", json_data, headers) + response = self._post_request(self._build_url("v2/map"), json_data, headers) if response.status_code == 200: return cast(dict[str, Any], response.json()) elif response.status_code in {402, 409, 500, 429, 408}: @@ -69,7 +69,7 @@ class FirecrawlApp: def check_crawl_status(self, job_id) -> dict[str, Any]: headers = self._prepare_headers() - response = self._get_request(f"{self.base_url}/v2/crawl/{job_id}", headers) + response = self._get_request(self._build_url(f"v2/crawl/{job_id}"), headers) if response.status_code == 200: crawl_status_response = response.json() if crawl_status_response.get("status") == "completed": @@ -120,6 +120,10 @@ class FirecrawlApp: def _prepare_headers(self) -> dict[str, Any]: return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} + def _build_url(self, path: str) -> str: + # ensure exactly one slash between base and path, regardless of user-provided base_url + return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" + def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response: for attempt in range(retries): response = httpx.post(url, headers=headers, json=data) @@ -139,7 +143,11 @@ class FirecrawlApp: return response def _handle_error(self, response, action): - error_message = response.json().get("error", "Unknown error occurred") + try: + payload = response.json() + error_message = payload.get("error") or payload.get("message") or response.text or "Unknown error occurred" + except json.JSONDecodeError: + error_message = response.text or "Unknown error occurred" raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return] def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]: @@ -160,7 +168,7 @@ class FirecrawlApp: } if params: json_data.update(params) - response = self._post_request(f"{self.base_url}/v2/search", json_data, headers) + response = self._post_request(self._build_url("v2/search"), json_data, headers) if response.status_code == 200: response_data = response.json() if not response_data.get("success"): diff --git a/api/services/auth/firecrawl/firecrawl.py b/api/services/auth/firecrawl/firecrawl.py index d455475bfc..b002706931 100644 --- a/api/services/auth/firecrawl/firecrawl.py +++ b/api/services/auth/firecrawl/firecrawl.py @@ -26,7 +26,7 @@ class FirecrawlAuth(ApiKeyAuthBase): "limit": 1, "scrapeOptions": {"onlyMainContent": True}, } - response = self._post_request(f"{self.base_url}/v1/crawl", options, headers) + response = self._post_request(self._build_url("v1/crawl"), options, headers) if response.status_code == 200: return True else: @@ -35,15 +35,17 @@ class FirecrawlAuth(ApiKeyAuthBase): def _prepare_headers(self): return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} + def _build_url(self, path: str) -> str: + # ensure exactly one slash between base and path, regardless of user-provided base_url + return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" + def _post_request(self, url, data, headers): return httpx.post(url, headers=headers, json=data) def _handle_error(self, response): - if response.status_code in {402, 409, 500}: - error_message = response.json().get("error", "Unknown error occurred") - raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") - else: - if response.text: - error_message = json.loads(response.text).get("error", "Unknown error occurred") - raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") - raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}") + try: + payload = response.json() + except json.JSONDecodeError: + payload = {} + error_message = payload.get("error") or payload.get("message") or (response.text or "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") diff --git a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py index b4ee1b91b4..4ee04ddebc 100644 --- a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py +++ b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py @@ -1,5 +1,7 @@ import os +from unittest.mock import MagicMock +import pytest from pytest_mock import MockerFixture from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp @@ -25,3 +27,35 @@ def test_firecrawl_web_extractor_crawl_mode(mocker: MockerFixture): assert job_id is not None assert isinstance(job_id, str) + + +def test_build_url_normalizes_slashes_for_crawl(mocker: MockerFixture): + api_key = "fc-" + base_urls = ["https://custom.firecrawl.dev", "https://custom.firecrawl.dev/"] + for base in base_urls: + app = FirecrawlApp(api_key=api_key, base_url=base) + mock_post = mocker.patch("httpx.post") + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"id": "job123"} + mock_post.return_value = mock_resp + app.crawl_url("https://example.com", params=None) + called_url = mock_post.call_args[0][0] + assert called_url == "https://custom.firecrawl.dev/v2/crawl" + + +def test_error_handler_handles_non_json_error_bodies(mocker: MockerFixture): + api_key = "fc-" + app = FirecrawlApp(api_key=api_key, base_url="https://custom.firecrawl.dev/") + mock_post = mocker.patch("httpx.post") + mock_resp = MagicMock() + mock_resp.status_code = 404 + mock_resp.text = "Not Found" + mock_resp.json.side_effect = Exception("Not JSON") + mock_post.return_value = mock_resp + + with pytest.raises(Exception) as excinfo: + app.scrape_url("https://example.com") + + # Should not raise a JSONDecodeError; current behavior reports status code only + assert str(excinfo.value) == "Failed to scrape URL. Status code: 404" diff --git a/api/tests/unit_tests/services/auth/test_firecrawl_auth.py b/api/tests/unit_tests/services/auth/test_firecrawl_auth.py index b5ee55706d..ab50d6a92c 100644 --- a/api/tests/unit_tests/services/auth/test_firecrawl_auth.py +++ b/api/tests/unit_tests/services/auth/test_firecrawl_auth.py @@ -1,3 +1,4 @@ +import json from unittest.mock import MagicMock, patch import httpx @@ -110,9 +111,11 @@ class TestFirecrawlAuth: @pytest.mark.parametrize( ("status_code", "response_text", "has_json_error", "expected_error_contains"), [ - (403, '{"error": "Forbidden"}', True, "Failed to authorize. Status code: 403. Error: Forbidden"), - (404, "", True, "Unexpected error occurred while trying to authorize. Status code: 404"), - (401, "Not JSON", True, "Expecting value"), # JSON decode error + (403, '{"error": "Forbidden"}', False, "Failed to authorize. Status code: 403. Error: Forbidden"), + # empty body falls back to generic message + (404, "", True, "Failed to authorize. Status code: 404. Error: Unknown error occurred"), + # non-JSON body is surfaced directly + (401, "Not JSON", True, "Failed to authorize. Status code: 401. Error: Not JSON"), ], ) @patch("services.auth.firecrawl.firecrawl.httpx.post") @@ -124,12 +127,14 @@ class TestFirecrawlAuth: mock_response.status_code = status_code mock_response.text = response_text if has_json_error: - mock_response.json.side_effect = Exception("Not JSON") + mock_response.json.side_effect = json.JSONDecodeError("Not JSON", "", 0) + else: + mock_response.json.return_value = {"error": "Forbidden"} mock_post.return_value = mock_response with pytest.raises(Exception) as exc_info: auth_instance.validate_credentials() - assert expected_error_contains in str(exc_info.value) + assert str(exc_info.value) == expected_error_contains @pytest.mark.parametrize( ("exception_type", "exception_message"), @@ -164,20 +169,21 @@ class TestFirecrawlAuth: @patch("services.auth.firecrawl.firecrawl.httpx.post") def test_should_use_custom_base_url_in_validation(self, mock_post): - """Test that custom base URL is used in validation""" + """Test that custom base URL is used in validation and normalized""" mock_response = MagicMock() mock_response.status_code = 200 mock_post.return_value = mock_response - credentials = { - "auth_type": "bearer", - "config": {"api_key": "test_api_key_123", "base_url": "https://custom.firecrawl.dev"}, - } - auth = FirecrawlAuth(credentials) - result = auth.validate_credentials() + for base in ("https://custom.firecrawl.dev", "https://custom.firecrawl.dev/"): + credentials = { + "auth_type": "bearer", + "config": {"api_key": "test_api_key_123", "base_url": base}, + } + auth = FirecrawlAuth(credentials) + result = auth.validate_credentials() - assert result is True - assert mock_post.call_args[0][0] == "https://custom.firecrawl.dev/v1/crawl" + assert result is True + assert mock_post.call_args[0][0] == "https://custom.firecrawl.dev/v1/crawl" @patch("services.auth.firecrawl.firecrawl.httpx.post") def test_should_handle_timeout_with_retry_suggestion(self, mock_post, auth_instance):