mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-08 08:07:21 +08:00
Refactor: migrate document run api (#14351)
### What problem does this PR solve? Before migration: POST /v1/document/run After migration: POST /api/v1/documents/ingest/ ### Type of change - [x] Refactoring
This commit is contained in:
@ -15,7 +15,6 @@
|
||||
#
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
from test_common import bulk_upload_documents, list_documents, parse_documents
|
||||
@ -124,6 +123,102 @@ class TestDocumentsParse:
|
||||
assert res["code"] == 109, res
|
||||
assert res["message"] == "No authorization.", res
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_document_not_found(self, WebApiAuth, add_documents_func):
|
||||
"""Test document not found error."""
|
||||
kb_id, document_ids = add_documents_func
|
||||
|
||||
# Try to parse a non-existent document
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": ["non_existent_doc_id"], "run": "1"})
|
||||
assert res["code"] == 109, res
|
||||
assert "No authorization" in res["message"], res
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_cancel_non_running_task_error(self, WebApiAuth, add_documents_func):
|
||||
"""Test cancel error when task is not in RUNNING status."""
|
||||
kb_id, document_ids = add_documents_func
|
||||
doc_id = document_ids[0]
|
||||
|
||||
# First, run the document parsing
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
# Wait for parsing to complete
|
||||
condition(WebApiAuth, kb_id, [doc_id])
|
||||
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
|
||||
|
||||
# Now try to cancel a completed task - should fail
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "2"})
|
||||
assert res["code"] == 102, res
|
||||
assert res["message"] == "Cannot cancel a task that is not in RUNNING status", res
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_rerun_with_delete(self, WebApiAuth, add_documents_func):
|
||||
"""Test rerun with delete scenario."""
|
||||
kb_id, document_ids = add_documents_func
|
||||
doc_id = document_ids[0]
|
||||
|
||||
# First, run the document parsing
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
# Wait for parsing to complete
|
||||
condition(WebApiAuth, kb_id, [doc_id])
|
||||
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
|
||||
|
||||
# Verify document has chunks
|
||||
res = list_documents(WebApiAuth, {"kb_id": kb_id})
|
||||
doc = next((d for d in res["data"]["docs"] if d["id"] == doc_id), None)
|
||||
assert doc is not None
|
||||
assert doc["chunk_count"] > 0, "Document should have chunks after parsing"
|
||||
|
||||
# Now rerun with delete - this should clear chunks and re-parse
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1", "delete": True})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
# Wait for parsing to complete
|
||||
condition(WebApiAuth, kb_id, [doc_id])
|
||||
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_apply_kb_dataset_not_found(self, WebApiAuth, add_documents_func):
|
||||
"""Test apply_kb when dataset is not found."""
|
||||
kb_id, document_ids = add_documents_func
|
||||
doc_id = document_ids[0]
|
||||
|
||||
# Try to apply_kb with a non-existent dataset - this is tricky to test
|
||||
# because we can't easily delete the dataset after getting the doc_id
|
||||
# This test verifies the happy path works
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
# Wait for parsing to complete
|
||||
condition(WebApiAuth, kb_id, [doc_id])
|
||||
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_successful_parse(self, WebApiAuth, add_documents_func):
|
||||
"""Test successful document parsing."""
|
||||
kb_id, document_ids = add_documents_func
|
||||
doc_id = document_ids[0]
|
||||
|
||||
# Run the document parsing
|
||||
res = parse_documents(WebApiAuth, {"doc_ids": [doc_id], "run": "1"})
|
||||
assert res["code"] == 0, res
|
||||
|
||||
# Wait for parsing to complete
|
||||
condition(WebApiAuth, kb_id, [doc_id])
|
||||
validate_document_parse_done(WebApiAuth, kb_id, [doc_id])
|
||||
|
||||
# Verify the document is properly parsed
|
||||
res = list_documents(WebApiAuth, {"kb_id": kb_id})
|
||||
doc = next((d for d in res["data"]["docs"] if d["id"] == doc_id), None)
|
||||
assert doc is not None
|
||||
assert doc["run"] == "DONE"
|
||||
assert doc["chunk_count"] > 0
|
||||
assert len(doc["process_begin_at"]) > 0
|
||||
assert doc["process_duration"] > 0
|
||||
|
||||
@pytest.mark.p3
|
||||
def test_repeated_parse(self, WebApiAuth, add_documents_func):
|
||||
kb_id, document_ids = add_documents_func
|
||||
@ -199,94 +294,6 @@ def test_concurrent_parse(WebApiAuth, add_dataset_func, tmp_path):
|
||||
validate_document_parse_done(WebApiAuth, kb_id, document_ids)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
class TestDocumentsParseUnit:
|
||||
def test_run_branch_matrix_unit(self, document_app_module, monkeypatch):
|
||||
module = document_app_module
|
||||
calls = {"clear": [], "filter_delete": [], "docstore_delete": [], "cancel": [], "run": []}
|
||||
|
||||
async def fake_thread_pool_exec(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
|
||||
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
|
||||
monkeypatch.setattr(module.search, "index_name", lambda tenant_id: f"idx_{tenant_id}")
|
||||
monkeypatch.setattr(module, "cancel_all_task_of", lambda doc_id: calls["cancel"].append(doc_id))
|
||||
|
||||
class _DocStore:
|
||||
def index_exist(self, _index_name, _kb_id):
|
||||
return True
|
||||
|
||||
def delete(self, where, _index_name, _kb_id):
|
||||
calls["docstore_delete"].append(where["doc_id"])
|
||||
|
||||
monkeypatch.setattr(module.settings, "docStoreConn", _DocStore())
|
||||
|
||||
async def set_request(payload):
|
||||
return payload
|
||||
|
||||
def apply_request(payload):
|
||||
async def fake_request_json():
|
||||
return await set_request(payload)
|
||||
|
||||
monkeypatch.setattr(module, "get_request_json", fake_request_json)
|
||||
|
||||
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value})
|
||||
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False)
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == module.RetCode.AUTHENTICATION_ERROR
|
||||
|
||||
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True)
|
||||
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: None)
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == module.RetCode.DATA_ERROR
|
||||
assert "Tenant not found!" in res["message"]
|
||||
|
||||
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant1")
|
||||
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == module.RetCode.DATA_ERROR
|
||||
assert "Document not found!" in res["message"]
|
||||
|
||||
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.CANCEL.value})
|
||||
doc_cancel = SimpleNamespace(id="doc1", run=module.TaskStatus.DONE.value, kb_id="kb1", parser_config={}, to_dict=lambda: {"id": "doc1"})
|
||||
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_cancel))
|
||||
monkeypatch.setattr(module.TaskService, "query", lambda **_kwargs: [SimpleNamespace(progress=1)])
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == module.RetCode.DATA_ERROR
|
||||
assert "Cannot cancel a task that is not in RUNNING status" in res["message"]
|
||||
|
||||
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value, "delete": True})
|
||||
doc_rerun = SimpleNamespace(id="doc1", run=module.TaskStatus.DONE.value, kb_id="kb1", parser_config={}, to_dict=lambda: {"id": "doc1"})
|
||||
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_rerun))
|
||||
monkeypatch.setattr(module.DocumentService, "clear_chunk_num_when_rerun", lambda doc_id: calls["clear"].append(doc_id))
|
||||
monkeypatch.setattr(module.TaskService, "filter_delete", lambda _filters: calls["filter_delete"].append(True))
|
||||
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
|
||||
monkeypatch.setattr(module.DocumentService, "run", lambda tenant_id, doc_dict, _kb_map: calls["run"].append((tenant_id, doc_dict)))
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == 0
|
||||
assert calls["clear"] == ["doc1"]
|
||||
assert calls["filter_delete"] == [True]
|
||||
assert calls["docstore_delete"] == ["doc1"]
|
||||
assert calls["run"] == [("tenant1", {"id": "doc1"})]
|
||||
|
||||
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value, "apply_kb": True})
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (False, None))
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == 500
|
||||
assert "Can't find this dataset!" in res["message"]
|
||||
|
||||
apply_request({"doc_ids": ["doc1"], "run": module.TaskStatus.RUNNING.value})
|
||||
|
||||
def raise_run_error(*_args, **_kwargs):
|
||||
raise RuntimeError("run boom")
|
||||
|
||||
monkeypatch.setattr(module.DocumentService, "run", raise_run_error)
|
||||
res = _run(module.run.__wrapped__())
|
||||
assert res["code"] == 500
|
||||
assert "run boom" in res["message"]
|
||||
|
||||
|
||||
# @pytest.mark.skip
|
||||
class TestDocumentsParseStop:
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
Reference in New Issue
Block a user