mirror of
https://github.com/langgenius/dify.git
synced 2026-03-17 12:57:51 +08:00
719 lines
30 KiB
Python
719 lines
30 KiB
Python
"""Unit tests for services.website_service.
|
|
|
|
Focuses on provider dispatching, argument validation, and provider-specific branches
|
|
without making any real network/storage/redis calls.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from datetime import UTC, datetime
|
|
from typing import Any
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
import services.website_service as website_service_module
|
|
from services.website_service import (
|
|
CrawlOptions,
|
|
WebsiteCrawlApiRequest,
|
|
WebsiteCrawlStatusApiRequest,
|
|
WebsiteService,
|
|
)
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _DummyHttpxResponse:
|
|
payload: dict[str, Any]
|
|
|
|
def json(self) -> dict[str, Any]:
|
|
return self.payload
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def stub_current_user(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(
|
|
website_service_module,
|
|
"current_user",
|
|
type("User", (), {"current_tenant_id": "tenant-1"})(),
|
|
)
|
|
|
|
|
|
def test_crawl_options_include_exclude_paths() -> None:
|
|
options = CrawlOptions(includes="a,b", excludes="x,y")
|
|
assert options.get_include_paths() == ["a", "b"]
|
|
assert options.get_exclude_paths() == ["x", "y"]
|
|
|
|
empty = CrawlOptions(includes=None, excludes=None)
|
|
assert empty.get_include_paths() == []
|
|
assert empty.get_exclude_paths() == []
|
|
|
|
|
|
def test_website_crawl_api_request_from_args_valid_and_to_crawl_request() -> None:
|
|
args = {
|
|
"provider": "firecrawl",
|
|
"url": "https://example.com",
|
|
"options": {
|
|
"limit": 2,
|
|
"crawl_sub_pages": True,
|
|
"only_main_content": True,
|
|
"includes": "a,b",
|
|
"excludes": "x",
|
|
"prompt": "hi",
|
|
"max_depth": 3,
|
|
"use_sitemap": False,
|
|
},
|
|
}
|
|
|
|
api_req = WebsiteCrawlApiRequest.from_args(args)
|
|
crawl_req = api_req.to_crawl_request()
|
|
|
|
assert crawl_req.provider == "firecrawl"
|
|
assert crawl_req.url == "https://example.com"
|
|
assert crawl_req.options.limit == 2
|
|
assert crawl_req.options.crawl_sub_pages is True
|
|
assert crawl_req.options.only_main_content is True
|
|
assert crawl_req.options.get_include_paths() == ["a", "b"]
|
|
assert crawl_req.options.get_exclude_paths() == ["x"]
|
|
assert crawl_req.options.prompt == "hi"
|
|
assert crawl_req.options.max_depth == 3
|
|
assert crawl_req.options.use_sitemap is False
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("args", "missing_msg"),
|
|
[
|
|
({}, "Provider is required"),
|
|
({"provider": "firecrawl"}, "URL is required"),
|
|
({"provider": "firecrawl", "url": "https://example.com"}, "Options are required"),
|
|
],
|
|
)
|
|
def test_website_crawl_api_request_from_args_requires_fields(args: dict, missing_msg: str) -> None:
|
|
with pytest.raises(ValueError, match=missing_msg):
|
|
WebsiteCrawlApiRequest.from_args(args)
|
|
|
|
|
|
def test_website_crawl_status_api_request_from_args_requires_fields() -> None:
|
|
with pytest.raises(ValueError, match="Provider is required"):
|
|
WebsiteCrawlStatusApiRequest.from_args({}, job_id="job-1")
|
|
|
|
with pytest.raises(ValueError, match="Job ID is required"):
|
|
WebsiteCrawlStatusApiRequest.from_args({"provider": "firecrawl"}, job_id="")
|
|
|
|
req = WebsiteCrawlStatusApiRequest.from_args({"provider": "firecrawl"}, job_id="job-1")
|
|
assert req.provider == "firecrawl"
|
|
assert req.job_id == "job-1"
|
|
|
|
|
|
def test_get_credentials_and_config_selects_plugin_id_and_key_firecrawl(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
service_instance = MagicMock(name="DatasourceProviderService-instance")
|
|
service_instance.get_datasource_credentials.return_value = {"firecrawl_api_key": "k", "base_url": "b"}
|
|
monkeypatch.setattr(website_service_module, "DatasourceProviderService", MagicMock(return_value=service_instance))
|
|
|
|
api_key, config = WebsiteService._get_credentials_and_config("tenant-1", "firecrawl")
|
|
assert api_key == "k"
|
|
assert config["base_url"] == "b"
|
|
|
|
service_instance.get_datasource_credentials.assert_called_once_with(
|
|
tenant_id="tenant-1",
|
|
provider="firecrawl",
|
|
plugin_id="langgenius/firecrawl_datasource",
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("provider", "plugin_id"),
|
|
[
|
|
("watercrawl", "watercrawl/watercrawl_datasource"),
|
|
("jinareader", "langgenius/jina_datasource"),
|
|
],
|
|
)
|
|
def test_get_credentials_and_config_selects_plugin_id_and_key_api_key(
|
|
monkeypatch: pytest.MonkeyPatch, provider: str, plugin_id: str
|
|
) -> None:
|
|
service_instance = MagicMock(name="DatasourceProviderService-instance")
|
|
service_instance.get_datasource_credentials.return_value = {"api_key": "enc-key", "base_url": "b"}
|
|
monkeypatch.setattr(website_service_module, "DatasourceProviderService", MagicMock(return_value=service_instance))
|
|
|
|
api_key, config = WebsiteService._get_credentials_and_config("tenant-1", provider)
|
|
assert api_key == "enc-key"
|
|
assert config["base_url"] == "b"
|
|
|
|
service_instance.get_datasource_credentials.assert_called_once_with(
|
|
tenant_id="tenant-1",
|
|
provider=provider,
|
|
plugin_id=plugin_id,
|
|
)
|
|
|
|
|
|
def test_get_credentials_and_config_rejects_invalid_provider() -> None:
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService._get_credentials_and_config("tenant-1", "unknown")
|
|
|
|
|
|
def test_get_credentials_and_config_hits_unreachable_guard_branch(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
class FlakyProvider:
|
|
def __init__(self) -> None:
|
|
self._eq_calls = 0
|
|
|
|
def __hash__(self) -> int:
|
|
return 1
|
|
|
|
def __eq__(self, other: object) -> bool:
|
|
if other == "firecrawl":
|
|
self._eq_calls += 1
|
|
return self._eq_calls == 1
|
|
return False
|
|
|
|
def __repr__(self) -> str:
|
|
return "FlakyProvider()"
|
|
|
|
service_instance = MagicMock(name="DatasourceProviderService-instance")
|
|
service_instance.get_datasource_credentials.return_value = {"firecrawl_api_key": "k"}
|
|
monkeypatch.setattr(website_service_module, "DatasourceProviderService", MagicMock(return_value=service_instance))
|
|
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService._get_credentials_and_config("tenant-1", FlakyProvider()) # type: ignore[arg-type]
|
|
|
|
|
|
def test_get_decrypted_api_key_requires_api_key(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(website_service_module.encrypter, "decrypt_token", MagicMock())
|
|
with pytest.raises(ValueError, match="API key not found in configuration"):
|
|
WebsiteService._get_decrypted_api_key("tenant-1", {})
|
|
|
|
|
|
def test_get_decrypted_api_key_decrypts(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
decrypt_mock = MagicMock(return_value="plain")
|
|
monkeypatch.setattr(website_service_module.encrypter, "decrypt_token", decrypt_mock)
|
|
|
|
assert WebsiteService._get_decrypted_api_key("tenant-1", {"api_key": "enc"}) == "plain"
|
|
decrypt_mock.assert_called_once_with(tenant_id="tenant-1", token="enc")
|
|
|
|
|
|
def test_document_create_args_validate_wraps_error_message() -> None:
|
|
with pytest.raises(ValueError, match=r"^Invalid arguments: Provider is required$"):
|
|
WebsiteService.document_create_args_validate({})
|
|
|
|
|
|
def test_crawl_url_dispatches_by_provider(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
api_request = WebsiteCrawlApiRequest(provider="firecrawl", url="https://example.com", options={"limit": 1})
|
|
crawl_request = api_request.to_crawl_request()
|
|
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {"base_url": "b"})))
|
|
firecrawl_mock = MagicMock(return_value={"status": "active", "job_id": "j1"})
|
|
monkeypatch.setattr(WebsiteService, "_crawl_with_firecrawl", firecrawl_mock)
|
|
|
|
result = WebsiteService.crawl_url(api_request)
|
|
|
|
assert result == {"status": "active", "job_id": "j1"}
|
|
firecrawl_mock.assert_called_once()
|
|
assert firecrawl_mock.call_args.kwargs["request"] == crawl_request
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("provider", "method_name"),
|
|
[
|
|
("watercrawl", "_crawl_with_watercrawl"),
|
|
("jinareader", "_crawl_with_jinareader"),
|
|
],
|
|
)
|
|
def test_crawl_url_dispatches_other_providers(monkeypatch: pytest.MonkeyPatch, provider: str, method_name: str) -> None:
|
|
api_request = WebsiteCrawlApiRequest(provider=provider, url="https://example.com", options={"limit": 1})
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {"base_url": "b"})))
|
|
|
|
impl_mock = MagicMock(return_value={"status": "active"})
|
|
monkeypatch.setattr(WebsiteService, method_name, impl_mock)
|
|
|
|
assert WebsiteService.crawl_url(api_request) == {"status": "active"}
|
|
impl_mock.assert_called_once()
|
|
|
|
|
|
def test_crawl_url_rejects_invalid_provider(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
api_request = WebsiteCrawlApiRequest(provider="bad", url="https://example.com", options={"limit": 1})
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {})))
|
|
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService.crawl_url(api_request)
|
|
|
|
|
|
def test_crawl_with_firecrawl_builds_params_single_page_and_sets_redis(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
firecrawl_instance = MagicMock(name="FirecrawlApp-instance")
|
|
firecrawl_instance.crawl_url.return_value = "job-1"
|
|
firecrawl_cls = MagicMock(return_value=firecrawl_instance)
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", firecrawl_cls)
|
|
|
|
redis_mock = MagicMock()
|
|
monkeypatch.setattr(website_service_module, "redis_client", redis_mock)
|
|
|
|
fixed_now = datetime(2024, 1, 1, tzinfo=UTC)
|
|
with patch.object(website_service_module.datetime, "datetime") as datetime_mock:
|
|
datetime_mock.now.return_value = fixed_now
|
|
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="firecrawl", url="https://example.com", options={"limit": 5}
|
|
).to_crawl_request()
|
|
req.options.crawl_sub_pages = False
|
|
req.options.only_main_content = True
|
|
|
|
result = WebsiteService._crawl_with_firecrawl(request=req, api_key="k", config={"base_url": "b"})
|
|
|
|
assert result == {"status": "active", "job_id": "job-1"}
|
|
|
|
firecrawl_cls.assert_called_once_with(api_key="k", base_url="b")
|
|
firecrawl_instance.crawl_url.assert_called_once()
|
|
_, params = firecrawl_instance.crawl_url.call_args.args
|
|
assert params["limit"] == 1
|
|
assert params["includePaths"] == []
|
|
assert params["excludePaths"] == []
|
|
assert params["scrapeOptions"] == {"onlyMainContent": True}
|
|
|
|
redis_mock.setex.assert_called_once()
|
|
key, ttl, value = redis_mock.setex.call_args.args
|
|
assert key == "website_crawl_job-1"
|
|
assert ttl == 3600
|
|
assert float(value) == pytest.approx(fixed_now.timestamp(), rel=0, abs=1e-6)
|
|
|
|
|
|
def test_crawl_with_firecrawl_builds_params_multi_page_including_prompt(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
firecrawl_instance = MagicMock(name="FirecrawlApp-instance")
|
|
firecrawl_instance.crawl_url.return_value = "job-2"
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
monkeypatch.setattr(website_service_module, "redis_client", MagicMock())
|
|
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="firecrawl",
|
|
url="https://example.com",
|
|
options={
|
|
"crawl_sub_pages": True,
|
|
"limit": 3,
|
|
"only_main_content": False,
|
|
"includes": "a,b",
|
|
"excludes": "x",
|
|
"prompt": "use this",
|
|
},
|
|
).to_crawl_request()
|
|
|
|
WebsiteService._crawl_with_firecrawl(request=req, api_key="k", config={"base_url": None})
|
|
_, params = firecrawl_instance.crawl_url.call_args.args
|
|
assert params["includePaths"] == ["a", "b"]
|
|
assert params["excludePaths"] == ["x"]
|
|
assert params["limit"] == 3
|
|
assert params["scrapeOptions"] == {"onlyMainContent": False}
|
|
assert params["prompt"] == "use this"
|
|
|
|
|
|
def test_crawl_with_watercrawl_passes_options_dict(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
provider_instance = MagicMock()
|
|
provider_instance.crawl_url.return_value = {"status": "active", "job_id": "w1"}
|
|
provider_cls = MagicMock(return_value=provider_instance)
|
|
monkeypatch.setattr(website_service_module, "WaterCrawlProvider", provider_cls)
|
|
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="watercrawl",
|
|
url="https://example.com",
|
|
options={
|
|
"limit": 2,
|
|
"crawl_sub_pages": True,
|
|
"only_main_content": True,
|
|
"includes": "a",
|
|
"excludes": None,
|
|
"max_depth": 5,
|
|
"use_sitemap": False,
|
|
},
|
|
).to_crawl_request()
|
|
|
|
result = WebsiteService._crawl_with_watercrawl(request=req, api_key="k", config={"base_url": "b"})
|
|
assert result == {"status": "active", "job_id": "w1"}
|
|
|
|
provider_cls.assert_called_once_with(api_key="k", base_url="b")
|
|
provider_instance.crawl_url.assert_called_once_with(
|
|
url="https://example.com",
|
|
options={
|
|
"limit": 2,
|
|
"crawl_sub_pages": True,
|
|
"only_main_content": True,
|
|
"includes": "a",
|
|
"excludes": None,
|
|
"max_depth": 5,
|
|
"use_sitemap": False,
|
|
},
|
|
)
|
|
|
|
|
|
def test_crawl_with_jinareader_single_page_success(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
get_mock = MagicMock(return_value=_DummyHttpxResponse({"code": 200, "data": {"title": "t"}}))
|
|
monkeypatch.setattr(website_service_module.httpx, "get", get_mock)
|
|
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="jinareader", url="https://example.com", options={"crawl_sub_pages": False}
|
|
).to_crawl_request()
|
|
req.options.crawl_sub_pages = False
|
|
|
|
result = WebsiteService._crawl_with_jinareader(request=req, api_key="k")
|
|
assert result == {"status": "active", "data": {"title": "t"}}
|
|
get_mock.assert_called_once()
|
|
|
|
|
|
def test_crawl_with_jinareader_single_page_failure(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(website_service_module.httpx, "get", MagicMock(return_value=_DummyHttpxResponse({"code": 500})))
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="jinareader", url="https://example.com", options={"crawl_sub_pages": False}
|
|
).to_crawl_request()
|
|
req.options.crawl_sub_pages = False
|
|
|
|
with pytest.raises(ValueError, match="Failed to crawl:"):
|
|
WebsiteService._crawl_with_jinareader(request=req, api_key="k")
|
|
|
|
|
|
def test_crawl_with_jinareader_multi_page_success(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
post_mock = MagicMock(return_value=_DummyHttpxResponse({"code": 200, "data": {"taskId": "t1"}}))
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="jinareader",
|
|
url="https://example.com",
|
|
options={"crawl_sub_pages": True, "limit": 5, "use_sitemap": True},
|
|
).to_crawl_request()
|
|
req.options.crawl_sub_pages = True
|
|
|
|
result = WebsiteService._crawl_with_jinareader(request=req, api_key="k")
|
|
assert result == {"status": "active", "job_id": "t1"}
|
|
post_mock.assert_called_once()
|
|
|
|
|
|
def test_crawl_with_jinareader_multi_page_failure(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(
|
|
website_service_module.httpx, "post", MagicMock(return_value=_DummyHttpxResponse({"code": 400}))
|
|
)
|
|
req = WebsiteCrawlApiRequest(
|
|
provider="jinareader",
|
|
url="https://example.com",
|
|
options={"crawl_sub_pages": True, "limit": 2, "use_sitemap": False},
|
|
).to_crawl_request()
|
|
req.options.crawl_sub_pages = True
|
|
|
|
with pytest.raises(ValueError, match="Failed to crawl$"):
|
|
WebsiteService._crawl_with_jinareader(request=req, api_key="k")
|
|
|
|
|
|
def test_get_crawl_status_dispatches(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {"base_url": "b"})))
|
|
firecrawl_status = MagicMock(return_value={"status": "active"})
|
|
monkeypatch.setattr(WebsiteService, "_get_firecrawl_status", firecrawl_status)
|
|
|
|
result = WebsiteService.get_crawl_status("job-1", "firecrawl")
|
|
assert result == {"status": "active"}
|
|
firecrawl_status.assert_called_once_with("job-1", "k", {"base_url": "b"})
|
|
|
|
watercrawl_status = MagicMock(return_value={"status": "active", "job_id": "w"})
|
|
monkeypatch.setattr(WebsiteService, "_get_watercrawl_status", watercrawl_status)
|
|
assert WebsiteService.get_crawl_status("job-2", "watercrawl") == {"status": "active", "job_id": "w"}
|
|
watercrawl_status.assert_called_once_with("job-2", "k", {"base_url": "b"})
|
|
|
|
jinareader_status = MagicMock(return_value={"status": "active", "job_id": "j"})
|
|
monkeypatch.setattr(WebsiteService, "_get_jinareader_status", jinareader_status)
|
|
assert WebsiteService.get_crawl_status("job-3", "jinareader") == {"status": "active", "job_id": "j"}
|
|
jinareader_status.assert_called_once_with("job-3", "k")
|
|
|
|
|
|
def test_get_crawl_status_typed_rejects_invalid_provider(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {})))
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService.get_crawl_status_typed(WebsiteCrawlStatusApiRequest(provider="bad", job_id="j"))
|
|
|
|
|
|
def test_get_firecrawl_status_adds_time_consuming_when_completed_and_cached(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
firecrawl_instance = MagicMock()
|
|
firecrawl_instance.check_crawl_status.return_value = {"status": "completed", "total": 2, "current": 2, "data": []}
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
|
|
redis_mock = MagicMock()
|
|
redis_mock.get.return_value = b"100.0"
|
|
monkeypatch.setattr(website_service_module, "redis_client", redis_mock)
|
|
|
|
with patch.object(website_service_module.datetime, "datetime") as datetime_mock:
|
|
datetime_mock.now.return_value = datetime.fromtimestamp(105.0, tz=UTC)
|
|
result = WebsiteService._get_firecrawl_status(job_id="job-1", api_key="k", config={"base_url": "b"})
|
|
|
|
assert result["status"] == "completed"
|
|
assert result["time_consuming"] == "5.00"
|
|
redis_mock.delete.assert_called_once_with("website_crawl_job-1")
|
|
|
|
|
|
def test_get_firecrawl_status_completed_without_cache_does_not_add_time(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
firecrawl_instance = MagicMock()
|
|
firecrawl_instance.check_crawl_status.return_value = {"status": "completed"}
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
|
|
redis_mock = MagicMock()
|
|
redis_mock.get.return_value = None
|
|
monkeypatch.setattr(website_service_module, "redis_client", redis_mock)
|
|
|
|
result = WebsiteService._get_firecrawl_status(job_id="job-1", api_key="k", config={"base_url": None})
|
|
assert result["status"] == "completed"
|
|
assert "time_consuming" not in result
|
|
redis_mock.delete.assert_not_called()
|
|
|
|
|
|
def test_get_watercrawl_status_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
provider_instance = MagicMock()
|
|
provider_instance.get_crawl_status.return_value = {"status": "active", "job_id": "w1"}
|
|
monkeypatch.setattr(website_service_module, "WaterCrawlProvider", MagicMock(return_value=provider_instance))
|
|
|
|
assert WebsiteService._get_watercrawl_status("job-1", "k", {"base_url": "b"}) == {
|
|
"status": "active",
|
|
"job_id": "w1",
|
|
}
|
|
provider_instance.get_crawl_status.assert_called_once_with("job-1")
|
|
|
|
|
|
def test_get_jinareader_status_active(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
post_mock = MagicMock(
|
|
return_value=_DummyHttpxResponse(
|
|
{
|
|
"data": {
|
|
"status": "active",
|
|
"urls": ["a", "b"],
|
|
"processed": {"a": {}},
|
|
"failed": {"b": {}},
|
|
"duration": 3000,
|
|
}
|
|
}
|
|
)
|
|
)
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
result = WebsiteService._get_jinareader_status("job-1", "k")
|
|
assert result["status"] == "active"
|
|
assert result["total"] == 2
|
|
assert result["current"] == 2
|
|
assert result["time_consuming"] == 3.0
|
|
assert result["data"] == []
|
|
post_mock.assert_called_once()
|
|
|
|
|
|
def test_get_jinareader_status_completed_formats_processed_items(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
status_payload = {
|
|
"data": {
|
|
"status": "completed",
|
|
"urls": ["u1"],
|
|
"processed": {"u1": {}},
|
|
"failed": {},
|
|
"duration": 1000,
|
|
}
|
|
}
|
|
processed_payload = {
|
|
"data": {
|
|
"processed": {
|
|
"u1": {
|
|
"data": {
|
|
"title": "t",
|
|
"url": "u1",
|
|
"description": "d",
|
|
"content": "md",
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
post_mock = MagicMock(side_effect=[_DummyHttpxResponse(status_payload), _DummyHttpxResponse(processed_payload)])
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
result = WebsiteService._get_jinareader_status("job-1", "k")
|
|
assert result["status"] == "completed"
|
|
assert result["data"] == [{"title": "t", "source_url": "u1", "description": "d", "markdown": "md"}]
|
|
assert post_mock.call_count == 2
|
|
|
|
|
|
def test_get_crawl_url_data_dispatches_invalid_provider() -> None:
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService.get_crawl_url_data("job-1", "bad", "https://example.com", "tenant-1")
|
|
|
|
|
|
def test_get_crawl_url_data_hits_invalid_provider_branch_when_credentials_stubbed(
|
|
monkeypatch: pytest.MonkeyPatch,
|
|
) -> None:
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {})))
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService.get_crawl_url_data("job-1", object(), "u", "tenant-1") # type: ignore[arg-type]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("provider", "method_name"),
|
|
[
|
|
("firecrawl", "_get_firecrawl_url_data"),
|
|
("watercrawl", "_get_watercrawl_url_data"),
|
|
("jinareader", "_get_jinareader_url_data"),
|
|
],
|
|
)
|
|
def test_get_crawl_url_data_dispatches(monkeypatch: pytest.MonkeyPatch, provider: str, method_name: str) -> None:
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {"base_url": "b"})))
|
|
impl_mock = MagicMock(return_value={"ok": True})
|
|
monkeypatch.setattr(WebsiteService, method_name, impl_mock)
|
|
|
|
result = WebsiteService.get_crawl_url_data("job-1", provider, "u", "tenant-1")
|
|
assert result == {"ok": True}
|
|
impl_mock.assert_called_once()
|
|
|
|
|
|
def test_get_firecrawl_url_data_reads_from_storage_when_present(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
stored_list = [{"source_url": "https://example.com", "title": "t"}]
|
|
stored = json.dumps(stored_list).encode("utf-8")
|
|
|
|
storage_mock = MagicMock()
|
|
storage_mock.exists.return_value = True
|
|
storage_mock.load_once.return_value = stored
|
|
monkeypatch.setattr(website_service_module, "storage", storage_mock)
|
|
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock())
|
|
|
|
result = WebsiteService._get_firecrawl_url_data("job-1", "https://example.com", "k", {"base_url": "b"})
|
|
assert result == {"source_url": "https://example.com", "title": "t"}
|
|
assert result is not stored_list[0]
|
|
|
|
|
|
def test_get_firecrawl_url_data_returns_none_when_storage_empty(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
storage_mock = MagicMock()
|
|
storage_mock.exists.return_value = True
|
|
storage_mock.load_once.return_value = b""
|
|
monkeypatch.setattr(website_service_module, "storage", storage_mock)
|
|
|
|
assert WebsiteService._get_firecrawl_url_data("job-1", "https://example.com", "k", {}) is None
|
|
|
|
|
|
def test_get_firecrawl_url_data_raises_when_job_not_completed(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
storage_mock = MagicMock()
|
|
storage_mock.exists.return_value = False
|
|
monkeypatch.setattr(website_service_module, "storage", storage_mock)
|
|
|
|
firecrawl_instance = MagicMock()
|
|
firecrawl_instance.check_crawl_status.return_value = {"status": "active"}
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
|
|
with pytest.raises(ValueError, match="Crawl job is not completed"):
|
|
WebsiteService._get_firecrawl_url_data("job-1", "https://example.com", "k", {"base_url": None})
|
|
|
|
|
|
def test_get_firecrawl_url_data_returns_none_when_not_found(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
storage_mock = MagicMock()
|
|
storage_mock.exists.return_value = False
|
|
monkeypatch.setattr(website_service_module, "storage", storage_mock)
|
|
|
|
firecrawl_instance = MagicMock()
|
|
firecrawl_instance.check_crawl_status.return_value = {"status": "completed", "data": [{"source_url": "x"}]}
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
|
|
assert WebsiteService._get_firecrawl_url_data("job-1", "https://example.com", "k", {"base_url": "b"}) is None
|
|
|
|
|
|
def test_get_watercrawl_url_data_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
provider_instance = MagicMock()
|
|
provider_instance.get_crawl_url_data.return_value = {"source_url": "u"}
|
|
monkeypatch.setattr(website_service_module, "WaterCrawlProvider", MagicMock(return_value=provider_instance))
|
|
|
|
result = WebsiteService._get_watercrawl_url_data("job-1", "u", "k", {"base_url": "b"})
|
|
assert result == {"source_url": "u"}
|
|
provider_instance.get_crawl_url_data.assert_called_once_with("job-1", "u")
|
|
|
|
|
|
def test_get_jinareader_url_data_without_job_id_success(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(
|
|
website_service_module.httpx,
|
|
"get",
|
|
MagicMock(return_value=_DummyHttpxResponse({"code": 200, "data": {"url": "u"}})),
|
|
)
|
|
assert WebsiteService._get_jinareader_url_data("", "u", "k") == {"url": "u"}
|
|
|
|
|
|
def test_get_jinareader_url_data_without_job_id_failure(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(website_service_module.httpx, "get", MagicMock(return_value=_DummyHttpxResponse({"code": 500})))
|
|
with pytest.raises(ValueError, match="Failed to crawl$"):
|
|
WebsiteService._get_jinareader_url_data("", "u", "k")
|
|
|
|
|
|
def test_get_jinareader_url_data_with_job_id_completed_returns_matching_item(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
status_payload = {"data": {"status": "completed", "processed": {"u1": {}}}}
|
|
processed_payload = {"data": {"processed": {"u1": {"data": {"url": "u", "title": "t"}}}}}
|
|
|
|
post_mock = MagicMock(side_effect=[_DummyHttpxResponse(status_payload), _DummyHttpxResponse(processed_payload)])
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
assert WebsiteService._get_jinareader_url_data("job-1", "u", "k") == {"url": "u", "title": "t"}
|
|
assert post_mock.call_count == 2
|
|
|
|
|
|
def test_get_jinareader_url_data_with_job_id_not_completed_raises(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
post_mock = MagicMock(return_value=_DummyHttpxResponse({"data": {"status": "active"}}))
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
with pytest.raises(ValueError, match=r"Crawl job is no\s*t completed"):
|
|
WebsiteService._get_jinareader_url_data("job-1", "u", "k")
|
|
|
|
|
|
def test_get_jinareader_url_data_with_job_id_completed_but_not_found_returns_none(
|
|
monkeypatch: pytest.MonkeyPatch,
|
|
) -> None:
|
|
status_payload = {"data": {"status": "completed", "processed": {"u1": {}}}}
|
|
processed_payload = {"data": {"processed": {"u1": {"data": {"url": "other"}}}}}
|
|
|
|
post_mock = MagicMock(side_effect=[_DummyHttpxResponse(status_payload), _DummyHttpxResponse(processed_payload)])
|
|
monkeypatch.setattr(website_service_module.httpx, "post", post_mock)
|
|
|
|
assert WebsiteService._get_jinareader_url_data("job-1", "u", "k") is None
|
|
|
|
|
|
def test_get_scrape_url_data_dispatches_and_rejects_invalid_provider(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
monkeypatch.setattr(WebsiteService, "_get_credentials_and_config", MagicMock(return_value=("k", {"base_url": "b"})))
|
|
|
|
scrape_mock = MagicMock(return_value={"data": "x"})
|
|
monkeypatch.setattr(WebsiteService, "_scrape_with_firecrawl", scrape_mock)
|
|
assert WebsiteService.get_scrape_url_data("firecrawl", "u", "tenant-1", True) == {"data": "x"}
|
|
scrape_mock.assert_called_once()
|
|
|
|
watercrawl_mock = MagicMock(return_value={"data": "y"})
|
|
monkeypatch.setattr(WebsiteService, "_scrape_with_watercrawl", watercrawl_mock)
|
|
assert WebsiteService.get_scrape_url_data("watercrawl", "u", "tenant-1", False) == {"data": "y"}
|
|
watercrawl_mock.assert_called_once()
|
|
|
|
with pytest.raises(ValueError, match="Invalid provider"):
|
|
WebsiteService.get_scrape_url_data("jinareader", "u", "tenant-1", True)
|
|
|
|
|
|
def test_scrape_with_firecrawl_calls_app(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
firecrawl_instance = MagicMock()
|
|
firecrawl_instance.scrape_url.return_value = {"markdown": "m"}
|
|
monkeypatch.setattr(website_service_module, "FirecrawlApp", MagicMock(return_value=firecrawl_instance))
|
|
|
|
result = WebsiteService._scrape_with_firecrawl(
|
|
request=website_service_module.ScrapeRequest(
|
|
provider="firecrawl",
|
|
url="u",
|
|
tenant_id="tenant-1",
|
|
only_main_content=True,
|
|
),
|
|
api_key="k",
|
|
config={"base_url": "b"},
|
|
)
|
|
assert result == {"markdown": "m"}
|
|
firecrawl_instance.scrape_url.assert_called_once_with(url="u", params={"onlyMainContent": True})
|
|
|
|
|
|
def test_scrape_with_watercrawl_calls_provider(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
provider_instance = MagicMock()
|
|
provider_instance.scrape_url.return_value = {"markdown": "m"}
|
|
monkeypatch.setattr(website_service_module, "WaterCrawlProvider", MagicMock(return_value=provider_instance))
|
|
|
|
result = WebsiteService._scrape_with_watercrawl(
|
|
request=website_service_module.ScrapeRequest(
|
|
provider="watercrawl",
|
|
url="u",
|
|
tenant_id="tenant-1",
|
|
only_main_content=False,
|
|
),
|
|
api_key="k",
|
|
config={"base_url": "b"},
|
|
)
|
|
assert result == {"markdown": "m"}
|
|
provider_instance.scrape_url.assert_called_once_with("u")
|