diff --git a/sdk/python/ragflow_sdk/modules/document.py b/sdk/python/ragflow_sdk/modules/document.py
index 6cb0e3b5e..cec8de161 100644
--- a/sdk/python/ragflow_sdk/modules/document.py
+++ b/sdk/python/ragflow_sdk/modules/document.py
@@ -41,7 +41,7 @@ class Document(Base):
         self.progress = 0.0
         self.progress_msg = ""
         self.process_begin_at = None
-        self.process_duration = 0.0
+        self.process_duation = 0.0
         self.run = "0"
         self.status = "1"
         for k in list(res_dict.keys()):
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
index dfbe6d9b9..e4ac174a6 100644
--- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
@@ -85,6 +85,7 @@ class TestCapability:
             futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(count)]
         responses = list(as_completed(futures))
         assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)


 @pytest.mark.usefixtures("clear_datasets")
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py b/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
index ec5a219e1..d79a85ba4 100644
--- a/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
@@ -93,6 +93,7 @@ class TestCapability:
             futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(count)]
         responses = list(as_completed(futures))
         assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)


 class TestDatasetsDelete:
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py b/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
index 8fb23c4ea..0af4c004c 100644
--- a/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
@@ -49,6 +49,7 @@ class TestCapability:
             futures = [executor.submit(list_datasets, api_key) for i in range(count)]
         responses = list(as_completed(futures))
         assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)


 @pytest.mark.usefixtures("add_datasets")
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
index 20c05850a..afc2cfd63 100644
--- a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
@@ -95,6 +95,7 @@ class TestCapability:
             futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(count)]
         responses = list(as_completed(futures))
         assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)


 class TestDatasetUpdate:
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/conftest.py b/test/testcases/test_http_api/test_file_management_within_dataset/conftest.py
index 44fb4169d..e5fe52bd6 100644
--- a/test/testcases/test_http_api/test_file_management_within_dataset/conftest.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/conftest.py
@@ -25,7 +25,7 @@ def add_document_func(request, api_key, add_dataset, ragflow_tmp_dir):
     document_ids = bulk_upload_documents(api_key, dataset_id, 1, ragflow_tmp_dir)

     def cleanup():
-        delete_documents(api_key, dataset_id, {"ids": document_ids})
+        delete_documents(api_key, dataset_id, {"ids": None})

     request.addfinalizer(cleanup)
     return dataset_id, document_ids[0]
@@ -37,7 +37,7 @@ def add_documents(request, api_key, add_dataset, ragflow_tmp_dir):
     document_ids = bulk_upload_documents(api_key, dataset_id, 5, ragflow_tmp_dir)

     def cleanup():
-        delete_documents(api_key, dataset_id, {"ids": document_ids})
+        delete_documents(api_key, dataset_id, {"ids": None})

     request.addfinalizer(cleanup)
     return dataset_id, document_ids
@@ -49,7 +49,7 @@ def add_documents_func(request, api_key, add_dataset_func, ragflow_tmp_dir):
     document_ids = bulk_upload_documents(api_key, dataset_id, 3, ragflow_tmp_dir)

     def cleanup():
-        delete_documents(api_key, dataset_id, {"ids": document_ids})
+        delete_documents(api_key, dataset_id, {"ids": None})

     request.addfinalizer(cleanup)
     return dataset_id, document_ids
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py
index b2f282410..b44534aa5 100644
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import pytest
 from common import INVALID_API_TOKEN, bulk_upload_documents, delete_documents, list_documents
@@ -148,9 +149,9 @@ class TestDocumentsDeletion:

 @pytest.mark.p3
 def test_concurrent_deletion(api_key, add_dataset, tmp_path):
-    documents_num = 100
+    count = 100
     dataset_id = add_dataset
-    document_ids = bulk_upload_documents(api_key, dataset_id, documents_num, tmp_path)
+    document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path)

     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
             executor.submit(
                 delete_documents,
                 api_key,
                 dataset_id,
                 {"ids": document_ids[i : i + 1]},
             )
@@ -160,10 +161,11 @@ def test_concurrent_deletion(api_key, add_dataset, tmp_path):
-            for i in range(documents_num)
+            for i in range(count)
         ]
-    responses = [f.result() for f in futures]
-    assert all(r["code"] == 0 for r in responses)
+    responses = list(as_completed(futures))
+    assert len(responses) == count, responses
+    assert all(future.result()["code"] == 0 for future in futures)


 @pytest.mark.p3
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py
index 6fd6d067d..d4a6d6406 100644
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import INVALID_API_TOKEN, list_documents
@@ -342,11 +342,13 @@ class TestDocumentsList:
     @pytest.mark.p3
     def test_concurrent_list(self, api_key, add_documents):
         dataset_id, _ = add_documents
+        count = 100
         with ThreadPoolExecutor(max_workers=5) as executor:
-            futures = [executor.submit(list_documents, api_key, dataset_id) for i in range(100)]
-        responses = [f.result() for f in futures]
-        assert all(r["code"] == 0 for r in responses)
+            futures = [executor.submit(list_documents, api_key, dataset_id) for _ in range(count)]
+        responses = list(as_completed(futures))
+        assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)

     @pytest.mark.p3
     def test_invalid_params(self, api_key, add_documents):
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py
index 4dbd0a2a0..f6bc9b768 100644
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import INVALID_API_TOKEN, bulk_upload_documents, list_documents, parse_documents
@@ -195,9 +195,9 @@ def test_concurrent_parse(api_key, add_dataset_func, tmp_path):
                 return False
         return True

-    document_num = 100
+    count = 100
     dataset_id = add_dataset_func
-    document_ids = bulk_upload_documents(api_key, dataset_id, document_num, tmp_path)
+    document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path)

     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
             executor.submit(
                 parse_documents,
                 api_key,
                 dataset_id,
                 {"document_ids": document_ids[i : i + 1]},
             )
@@ -207,11 +207,12 @@ def test_concurrent_parse(api_key, add_dataset_func, tmp_path):
-            for i in range(document_num)
+            for i in range(count)
         ]
-    responses = [f.result() for f in futures]
-    assert all(r["code"] == 0 for r in responses)
+    responses = list(as_completed(futures))
+    assert len(responses) == count, responses
+    assert all(future.result()["code"] == 0 for future in futures)

-    condition(api_key, dataset_id, document_num)
+    condition(api_key, dataset_id, count)

     validate_document_details(api_key, dataset_id, document_ids)
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py
index ca2bfedae..801b0b29b 100644
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py
@@ -213,6 +213,7 @@ class TestDocumentsUpload:
             futures = [executor.submit(upload_documents, api_key, dataset_id, fps[i : i + 1]) for i in range(count)]
         responses = list(as_completed(futures))
         assert len(responses) == count, responses
+        assert all(future.result()["code"] == 0 for future in futures)

         res = list_datasets(api_key, {"id": dataset_id})
         assert res["data"][0]["document_count"] == count
diff --git a/test/testcases/test_sdk_api/common.py b/test/testcases/test_sdk_api/common.py
index acfa04244..4a1092f97 100644
--- a/test/testcases/test_sdk_api/common.py
+++ b/test/testcases/test_sdk_api/common.py
@@ -14,10 +14,28 @@
 # limitations under the License.
 #

+from pathlib import Path
+
+from ragflow_sdk import DataSet, Document, RAGFlow
+from utils.file_utils import create_txt_file
+
+
 # DATASET MANAGEMENT
-def batch_create_datasets(client, num):
+def batch_create_datasets(client: RAGFlow, num: int) -> list[DataSet]:
     datasets = []
     for i in range(num):
         dataset = client.create_dataset(name=f"dataset_{i}")
         datasets.append(dataset)
     return datasets
+
+
+# FILE MANAGEMENT WITHIN DATASET
+def bulk_upload_documents(dataset: DataSet, num: int, tmp_path: Path) -> list[Document]:
+    document_infos = []
+    for i in range(num):
+        fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt")
+        with fp.open("rb") as f:
+            blob = f.read()
+        document_infos.append({"display_name": fp.name, "blob": blob})
+
+    return dataset.upload_documents(document_infos)
diff --git a/test/testcases/test_sdk_api/conftest.py b/test/testcases/test_sdk_api/conftest.py
index 49cf4657b..e4c07bf67 100644
--- a/test/testcases/test_sdk_api/conftest.py
+++ b/test/testcases/test_sdk_api/conftest.py
@@ -14,32 +14,109 @@
 # limitations under the License.
 #

+from pathlib import Path
+
 import pytest
 from common import (
     batch_create_datasets,
+    bulk_upload_documents,
 )
 from configs import HOST_ADDRESS, VERSION
-from ragflow_sdk import RAGFlow
+from pytest import FixtureRequest
+from ragflow_sdk import DataSet, RAGFlow
+from utils import wait_for
+from utils.file_utils import (
+    create_docx_file,
+    create_eml_file,
+    create_excel_file,
+    create_html_file,
+    create_image_file,
+    create_json_file,
+    create_md_file,
+    create_pdf_file,
+    create_ppt_file,
+    create_txt_file,
+)
+
+
+@wait_for(30, 1, "Document parsing timeout")
+def condition(_dataset: DataSet):
+    documents = _dataset.list_documents(page_size=1000)
+    for document in documents:
+        if document.run != "DONE":
+            return False
+    return True
+
+
+@pytest.fixture
+def generate_test_files(request, tmp_path):
+    file_creators = {
+        "docx": (tmp_path / "ragflow_test.docx", create_docx_file),
+        "excel": (tmp_path / "ragflow_test.xlsx", create_excel_file),
+        "ppt": (tmp_path / "ragflow_test.pptx", create_ppt_file),
+        "image": (tmp_path / "ragflow_test.png", create_image_file),
+        "pdf": (tmp_path / "ragflow_test.pdf", create_pdf_file),
+        "txt": (tmp_path / "ragflow_test.txt", create_txt_file),
+        "md": (tmp_path / "ragflow_test.md", create_md_file),
+        "json": (tmp_path / "ragflow_test.json", create_json_file),
+        "eml": (tmp_path / "ragflow_test.eml", create_eml_file),
+        "html": (tmp_path / "ragflow_test.html", create_html_file),
+    }
+
+    files = {}
+    for file_type, (file_path, creator_func) in file_creators.items():
+        if request.param in ["", file_type]:
+            creator_func(file_path)
+            files[file_type] = file_path
+    return files
+
+
+@pytest.fixture(scope="class")
+def ragflow_tmp_dir(request, tmp_path_factory) -> Path:
+    class_name = request.cls.__name__
+    return tmp_path_factory.mktemp(class_name)


 @pytest.fixture(scope="session")
-def client(token):
+def client(token) -> RAGFlow:
     return RAGFlow(api_key=token, base_url=HOST_ADDRESS, version=VERSION)


 @pytest.fixture(scope="function")
-def clear_datasets(request, client):
+def clear_datasets(request: FixtureRequest, client: RAGFlow):
     def cleanup():
         client.delete_datasets(ids=None)

     request.addfinalizer(cleanup)


+@pytest.fixture(scope="class")
+def add_dataset(request: FixtureRequest, client: RAGFlow):
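+    # Class-scoped fixture: every test in a class shares this one dataset, and
+    # the finalizer wipes all datasets once the class has finished.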
+    def cleanup():
+        client.delete_datasets(ids=None)
+
+    request.addfinalizer(cleanup)
+
+    datasets = batch_create_datasets(client, 1)
+    return datasets[0]
+
+
 @pytest.fixture(scope="function")
-def add_dataset_func(request, client):
+def add_dataset_func(request: FixtureRequest, client: RAGFlow) -> DataSet:
     def cleanup():
         client.delete_datasets(ids=None)

     request.addfinalizer(cleanup)
-    return batch_create_datasets(client, 1)[0]
+
+    return batch_create_datasets(client, 1)[0]
+
+
+@pytest.fixture(scope="class")
+def add_document(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir):
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir)
+
+    def cleanup():
+        dataset.delete_documents(ids=None)
+
+    request.addfinalizer(cleanup)
+    return dataset, documents[0]
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py
new file mode 100644
index 000000000..32be9683a
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py
@@ -0,0 +1,57 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import pytest
+from common import bulk_upload_documents
+from pytest import FixtureRequest
+from ragflow_sdk import DataSet, Document
+
+
+@pytest.fixture(scope="function")
+def add_document_func(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, Document]:
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir)
+
+    def cleanup():
+        dataset.delete_documents(ids=None)
+
+    request.addfinalizer(cleanup)
+    return dataset, documents[0]
+
+
+@pytest.fixture(scope="class")
+def add_documents(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]:
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, 5, ragflow_tmp_dir)
+
+    def cleanup():
+        dataset.delete_documents(ids=None)
+
+    request.addfinalizer(cleanup)
+    return dataset, documents
+
+
+@pytest.fixture(scope="function")
+def add_documents_func(request: FixtureRequest, add_dataset_func: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]:
+    dataset = add_dataset_func
+    documents = bulk_upload_documents(dataset, 3, ragflow_tmp_dir)
+
+    def cleanup():
+        dataset.delete_documents(ids=None)
+
+    request.addfinalizer(cleanup)
+    return dataset, documents
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_delete_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_delete_documents.py
new file mode 100644
index 000000000..c90d4294e
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_delete_documents.py
@@ -0,0 +1,118 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import pytest
+from common import bulk_upload_documents
+
+
+class TestDocumentsDeletion:
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "payload, expected_message, remaining",
+        [
+            ({"ids": None}, "", 0),
+            ({"ids": []}, "", 0),
+            ({"ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", 3),
+            ({"ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", 3),
+            ("not json", "must be a mapping", 3),
+            (lambda r: {"ids": r[:1]}, "", 2),
+            (lambda r: {"ids": r}, "", 0),
+        ],
+    )
+    def test_basic_scenarios(
+        self,
+        add_documents_func,
+        payload,
+        expected_message,
+        remaining,
+    ):
+        dataset, documents = add_documents_func
+        if callable(payload):
+            payload = payload([document.id for document in documents])
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.delete_documents(**payload)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            dataset.delete_documents(**payload)
+
+        documents = dataset.list_documents()
+        assert len(documents) == remaining, str(documents)
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "payload",
+        [
+            lambda r: {"ids": ["invalid_id"] + r},
+            lambda r: {"ids": r[:1] + ["invalid_id"] + r[1:3]},
+            lambda r: {"ids": r + ["invalid_id"]},
+        ],
+    )
+    def test_delete_partial_invalid_id(self, add_documents_func, payload):
+        dataset, documents = add_documents_func
+        payload = payload([document.id for document in documents])
+
+        with pytest.raises(Exception) as excinfo:
+            dataset.delete_documents(**payload)
+        assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value)
+
+        documents = dataset.list_documents()
+        assert len(documents) == 0, str(documents)
+
+    @pytest.mark.p2
+    def test_repeated_deletion(self, add_documents_func):
+        dataset, documents = add_documents_func
+        document_ids = [document.id for document in documents]
+        dataset.delete_documents(ids=document_ids)
+        with pytest.raises(Exception) as excinfo:
+            dataset.delete_documents(ids=document_ids)
+        assert "Documents not found" in str(excinfo.value), str(excinfo.value)
+
+    @pytest.mark.p2
+    def test_duplicate_deletion(self, add_documents_func):
+        dataset, documents = add_documents_func
+        document_ids = [document.id for document in documents]
+        dataset.delete_documents(ids=document_ids + document_ids)
+        assert len(dataset.list_documents()) == 0, str(dataset.list_documents())
+
+
+@pytest.mark.p3
+def test_concurrent_deletion(add_dataset, tmp_path):
+    count = 100
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, count, tmp_path)
+
+    def delete_doc(doc_id):
+        dataset.delete_documents(ids=[doc_id])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(delete_doc, doc.id) for doc in documents]
+
+    responses = list(as_completed(futures))
+    assert len(responses) == count, responses
+
+
+@pytest.mark.p3
+def test_delete_1k(add_dataset, tmp_path):
+    count = 1_000
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, count, tmp_path)
+    assert len(dataset.list_documents(page_size=count * 2)) == count
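+
+    # Delete all 1,000 documents in a single call, then verify the dataset is empty.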
+    dataset.delete_documents(ids=[doc.id for doc in documents])
+    assert len(dataset.list_documents()) == 0
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_download_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_download_document.py
new file mode 100644
index 000000000..3c9169fbb
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_download_document.py
@@ -0,0 +1,89 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import pytest
+from common import bulk_upload_documents
+from utils import compare_by_hash
+
+
+@pytest.mark.p1
+@pytest.mark.parametrize(
+    "generate_test_files",
+    [
+        "docx",
+        "excel",
+        "ppt",
+        "image",
+        "pdf",
+        "txt",
+        "md",
+        "json",
+        "eml",
+        "html",
+    ],
+    indirect=True,
+)
+def test_file_type_validation(add_dataset, generate_test_files, request):
+    dataset = add_dataset
+    fp = generate_test_files[request.node.callspec.params["generate_test_files"]]
+    with fp.open("rb") as f:
+        blob = f.read()
+
+    documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+
+    for document in documents:
+        with fp.with_stem("ragflow_test_download").open("wb") as f:
+            f.write(document.download())
+
+        assert compare_by_hash(fp, fp.with_stem("ragflow_test_download"))
+
+
+class TestDocumentDownload:
+    @pytest.mark.p3
+    def test_same_file_repeat(self, add_documents, tmp_path, ragflow_tmp_dir):
+        num = 5
+        _, documents = add_documents
+
+        for i in range(num):
+            download_path = tmp_path / f"ragflow_test_download_{i}.txt"
+            with download_path.open("wb") as f:
+                f.write(documents[0].download())
+            assert compare_by_hash(ragflow_tmp_dir / "ragflow_test_upload_0.txt", download_path), f"Downloaded file {i} does not match original"
+
+
+@pytest.mark.p3
+def test_concurrent_download(add_dataset, tmp_path):
+    count = 20
+    dataset = add_dataset
+    documents = bulk_upload_documents(dataset, count, tmp_path)
+
+    def download_doc(document, i):
+        download_path = tmp_path / f"ragflow_test_download_{i}.txt"
+        with download_path.open("wb") as f:
+            f.write(document.download())
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(download_doc, documents[i], i) for i in range(count)]
+    responses = list(as_completed(futures))
+    assert len(responses) == count, responses
+
+    for i in range(count):
+        assert compare_by_hash(
+            tmp_path / f"ragflow_test_upload_{i}.txt",
+            tmp_path / f"ragflow_test_download_{i}.txt",
+        )
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py
new file mode 100644
index 000000000..189093da2
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py
@@ -0,0 +1,247 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import pytest
+
+
+class TestDocumentsList:
+    @pytest.mark.p1
+    def test_default(self, add_documents):
+        dataset, _ = add_documents
+        documents = dataset.list_documents()
+        assert len(documents) == 5, str(documents)
+
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "params, expected_page_size, expected_message",
+        [
+            ({"page": None, "page_size": 2}, 2, "not instance of"),
+            ({"page": 0, "page_size": 2}, 2, ""),
+            ({"page": 2, "page_size": 2}, 2, ""),
+            ({"page": 3, "page_size": 2}, 1, ""),
+            ({"page": "3", "page_size": 2}, 1, "not instance of"),
+            pytest.param(
+                {"page": -1, "page_size": 2},
+                0,
+                "Invalid page number",
+                marks=pytest.mark.skip(reason="issues/5851"),
+            ),
+            pytest.param(
+                {"page": "a", "page_size": 2},
+                0,
+                "Invalid page value",
+                marks=pytest.mark.skip(reason="issues/5851"),
+            ),
+        ],
+    )
+    def test_page(self, add_documents, params, expected_page_size, expected_message):
+        dataset, _ = add_documents
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            documents = dataset.list_documents(**params)
+            assert len(documents) == expected_page_size, str(documents)
+
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "params, expected_page_size, expected_message",
+        [
+            ({"page_size": None}, 5, "not instance of"),
+            ({"page_size": 0}, 0, ""),
+            ({"page_size": 1}, 1, ""),
+            ({"page_size": 6}, 5, ""),
+            ({"page_size": "1"}, 1, "not instance of"),
+            pytest.param(
+                {"page_size": -1},
+                0,
+                "Invalid page size",
+                marks=pytest.mark.skip(reason="issues/5851"),
+            ),
+            pytest.param(
+                {"page_size": "a"},
+                0,
+                "Invalid page size value",
+                marks=pytest.mark.skip(reason="issues/5851"),
+            ),
+        ],
+    )
+    def test_page_size(self, add_documents, params, expected_page_size, expected_message):
+        dataset, _ = add_documents
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            documents = dataset.list_documents(**params)
+            assert len(documents) == expected_page_size, str(documents)
+
+    @pytest.mark.p3
+    @pytest.mark.parametrize(
+        "params, expected_message",
+        [
+            ({"orderby": None}, "not instance of"),
+            ({"orderby": "create_time"}, ""),
+            ({"orderby": "update_time"}, ""),
+            pytest.param({"orderby": "name", "desc": "False"}, "", marks=pytest.mark.skip(reason="issues/5851")),
+            pytest.param({"orderby": "unknown"}, "orderby should be create_time or update_time", marks=pytest.mark.skip(reason="issues/5851")),
+        ],
+    )
+    def test_orderby(self, add_documents, params, expected_message):
+        dataset, _ = add_documents
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            dataset.list_documents(**params)
+
+    @pytest.mark.p3
+    @pytest.mark.parametrize(
+        "params, expected_message",
+        [
+            ({"desc": None}, "not instance of"),
+            ({"desc": "true"}, "not instance of"),
+            ({"desc": "True"}, "not instance of"),
+            ({"desc": True}, ""),
+            pytest.param({"desc": "false"}, "", marks=pytest.mark.skip(reason="issues/5851")),
+            ({"desc": "False"}, "not instance of"),
+            ({"desc": False}, ""),
+            ({"desc": "False", "orderby": "update_time"}, "not instance of"),
+            pytest.param({"desc": "unknown"}, "desc should be true or false", marks=pytest.mark.skip(reason="issues/5851")),
+        ],
+    )
+    def test_desc(self, add_documents, params, expected_message):
+        dataset, _ = add_documents
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            dataset.list_documents(**params)
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "params, expected_num",
+        [
+            ({"keywords": None}, 5),
+            ({"keywords": ""}, 5),
+            ({"keywords": "0"}, 1),
+            ({"keywords": "ragflow_test_upload"}, 5),
+            ({"keywords": "unknown"}, 0),
+        ],
+    )
+    def test_keywords(self, add_documents, params, expected_num):
+        dataset, _ = add_documents
+        documents = dataset.list_documents(**params)
+        assert len(documents) == expected_num, str(documents)
+
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "params, expected_num, expected_message",
+        [
+            ({"name": None}, 5, ""),
+            ({"name": ""}, 5, ""),
+            ({"name": "ragflow_test_upload_0.txt"}, 1, ""),
+            ({"name": "unknown.txt"}, 0, "You don't own the document unknown.txt"),
+        ],
+    )
+    def test_name(self, add_documents, params, expected_num, expected_message):
+        dataset, _ = add_documents
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            documents = dataset.list_documents(**params)
+            assert len(documents) == expected_num, str(documents)
+            if params["name"] not in [None, ""]:
+                assert documents[0].name == params["name"], str(documents)
+
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "document_id, expected_num, expected_message",
+        [
+            (None, 5, ""),
+            ("", 5, ""),
+            (lambda docs: docs[0].id, 1, ""),
+            ("unknown.txt", 0, "You don't own the document unknown.txt"),
+        ],
+    )
+    def test_id(self, add_documents, document_id, expected_num, expected_message):
+        dataset, documents = add_documents
+        if callable(document_id):
+            params = {"id": document_id(documents)}
+        else:
+            params = {"id": document_id}
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            documents = dataset.list_documents(**params)
+            assert len(documents) == expected_num, str(documents)
+            if params["id"] not in [None, ""]:
+                assert documents[0].id == params["id"], str(documents)
+
+    @pytest.mark.p3
+    @pytest.mark.parametrize(
+        "document_id, name, expected_num, expected_message",
+        [
+            (lambda docs: docs[0].id, "ragflow_test_upload_0.txt", 1, ""),
+            (lambda docs: docs[0].id, "ragflow_test_upload_1.txt", 0, ""),
+            (lambda docs: docs[0].id, "unknown", 0, "You don't own the document unknown"),
+            ("invalid_id", "ragflow_test_upload_0.txt", 0, "You don't own the document invalid_id"),
+        ],
+    )
+    def test_name_and_id(self, add_documents, document_id, name, expected_num, expected_message):
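+        # Filter by id and name together: a match requires both to agree, and an
+        # unknown value for either one is reported as an unowned document.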
+        dataset, documents = add_documents
+        params = {"id": document_id(documents) if callable(document_id) else document_id, "name": name}
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.list_documents(**params)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            documents = dataset.list_documents(**params)
+            assert len(documents) == expected_num, str(documents)
+
+    @pytest.mark.p3
+    def test_concurrent_list(self, add_documents):
+        dataset, _ = add_documents
+        count = 100
+
+        def list_docs():
+            return dataset.list_documents()
+
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = [executor.submit(list_docs) for _ in range(count)]
+        responses = list(as_completed(futures))
+        assert len(responses) == count, responses
+        for future in futures:
+            docs = future.result()
+            assert len(docs) == 5, str(docs)
+
+    @pytest.mark.p3
+    def test_invalid_params(self, add_documents):
+        dataset, _ = add_documents
+        params = {"a": "b"}
+        with pytest.raises(TypeError) as excinfo:
+            dataset.list_documents(**params)
+        assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_parse_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_parse_documents.py
new file mode 100644
index 000000000..77c9fed03
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_parse_documents.py
@@ -0,0 +1,145 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import pytest
+from common import bulk_upload_documents
+from ragflow_sdk import DataSet
+from utils import wait_for
+
+
+@wait_for(30, 1, "Document parsing timeout")
+def condition(_dataset: DataSet, _document_ids=None):
+    documents = _dataset.list_documents(page_size=1000)
+
+    if _document_ids is None:
+        for document in documents:
+            if document.run != "DONE":
+                return False
+        return True
+
+    target_ids = set(_document_ids)
+    for document in documents:
+        if document.id in target_ids:
+            if document.run != "DONE":
+                return False
+    return True
+
+
+def validate_document_details(dataset, document_ids):
+    documents = dataset.list_documents(page_size=1000)
+    for document in documents:
+        if document.id in document_ids:
+            assert document.run == "DONE"
+            assert len(document.process_begin_at) > 0
+            assert document.process_duation > 0
+            assert document.progress > 0
+            assert "Task done" in document.progress_msg
+
+
+class TestDocumentsParse:
+    @pytest.mark.parametrize(
+        "payload, expected_message",
+        [
+            pytest.param(None, "AttributeError", marks=pytest.mark.skip),
+            pytest.param({"document_ids": []}, "`document_ids` is required", marks=pytest.mark.p1),
+            pytest.param({"document_ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", marks=pytest.mark.p3),
+            pytest.param({"document_ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", marks=pytest.mark.p3),
+            pytest.param("not json", "AttributeError", marks=pytest.mark.skip),
+            pytest.param(lambda r: {"document_ids": r[:1]}, "", marks=pytest.mark.p1),
+            pytest.param(lambda r: {"document_ids": r}, "", marks=pytest.mark.p1),
+        ],
+    )
+    def test_basic_scenarios(self, add_documents_func, payload, expected_message):
+        dataset, documents = add_documents_func
+        if callable(payload):
+            payload = payload([doc.id for doc in documents])
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                dataset.async_parse_documents(**payload)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            dataset.async_parse_documents(**payload)
+            condition(dataset, payload["document_ids"])
+            validate_document_details(dataset, payload["document_ids"])
+
+    @pytest.mark.parametrize(
+        "payload",
+        [
+            pytest.param(lambda r: {"document_ids": ["invalid_id"] + r}, marks=pytest.mark.p3),
+            pytest.param(lambda r: {"document_ids": r[:1] + ["invalid_id"] + r[1:3]}, marks=pytest.mark.p1),
+            pytest.param(lambda r: {"document_ids": r + ["invalid_id"]}, marks=pytest.mark.p3),
+        ],
+    )
+    def test_parse_partial_invalid_document_id(self, add_documents_func, payload):
+        dataset, documents = add_documents_func
+        document_ids = [doc.id for doc in documents]
+        payload = payload(document_ids)
+
+        with pytest.raises(Exception) as excinfo:
+            dataset.async_parse_documents(**payload)
+        assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value)
+
+        condition(dataset, document_ids)
+        validate_document_details(dataset, document_ids)
+
+    @pytest.mark.p3
+    def test_repeated_parse(self, add_documents_func):
+        dataset, documents = add_documents_func
+        document_ids = [doc.id for doc in documents]
+        dataset.async_parse_documents(document_ids=document_ids)
+        condition(dataset, document_ids)
+        dataset.async_parse_documents(document_ids=document_ids)
+
+    @pytest.mark.p3
+    def test_duplicate_parse(self, add_documents_func):
+        dataset, documents = add_documents_func
+        document_ids = [doc.id for doc in documents]
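+        # The same ids submitted twice in one request should still parse cleanly.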
+        dataset.async_parse_documents(document_ids=document_ids + document_ids)
+        condition(dataset, document_ids)
+        validate_document_details(dataset, document_ids)
+
+
+@pytest.mark.p3
+def test_parse_100_files(add_dataset_func, tmp_path):
+    dataset = add_dataset_func
+    documents = bulk_upload_documents(dataset, 100, tmp_path)
+    document_ids = [doc.id for doc in documents]
+
+    dataset.async_parse_documents(document_ids=document_ids)
+    condition(dataset, document_ids)
+    validate_document_details(dataset, document_ids)
+
+
+@pytest.mark.p3
+def test_concurrent_parse(add_dataset_func, tmp_path):
+    count = 100
+    dataset = add_dataset_func
+    documents = bulk_upload_documents(dataset, count, tmp_path)
+    document_ids = [doc.id for doc in documents]
+
+    def parse_doc(doc_id):
+        dataset.async_parse_documents(document_ids=[doc_id])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(parse_doc, doc.id) for doc in documents]
+
+    responses = list(as_completed(futures))
+    assert len(responses) == count, responses
+
+    condition(dataset, document_ids)
+    validate_document_details(dataset, document_ids)
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_stop_parse_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_stop_parse_documents.py
new file mode 100644
index 000000000..9b561881e
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_stop_parse_documents.py
@@ -0,0 +1,41 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+
+
+def validate_document_parse_done(dataset, document_ids):
+    documents = dataset.list_documents(page_size=1000)
+    for document in documents:
+        if document.id in document_ids:
+            assert document.run == "DONE"
+            assert len(document.process_begin_at) > 0
+            assert document.process_duation > 0
+            assert document.progress > 0
+            assert "Task done" in document.progress_msg
+
+
+def validate_document_parse_cancel(dataset, document_ids):
+    documents = dataset.list_documents(page_size=1000)
+    for document in documents:
+        assert document.run == "CANCEL"
+        assert len(document.process_begin_at) > 0
+        assert document.progress == 0.0
+
+
+@pytest.mark.skip
+class TestDocumentsParseStop:
+    pass
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
new file mode 100644
index 000000000..83615dcdb
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
@@ -0,0 +1,411 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from operator import attrgetter
+
+import pytest
+from configs import DOCUMENT_NAME_LIMIT
+from ragflow_sdk import DataSet
+
+
+class TestDocumentsUpdated:
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "name, expected_message",
+        [
+            ("new_name.txt", ""),
+            (f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt", "The name should be less than 128 bytes"),
+            (0, "AttributeError"),
+            (None, "AttributeError"),
+            ("", "The extension of file can't be changed"),
+            ("ragflow_test_upload_0", "The extension of file can't be changed"),
+            ("ragflow_test_upload_1.txt", "Duplicated document name in the same dataset"),
+            ("RAGFLOW_TEST_UPLOAD_1.TXT", ""),
+        ],
+    )
+    def test_name(self, add_documents, name, expected_message):
+        dataset, documents = add_documents
+        document = documents[0]
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                document.update({"name": name})
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            document.update({"name": name})
+            updated_doc = dataset.list_documents(id=document.id)[0]
+            assert updated_doc.name == name, str(updated_doc)
+
+    @pytest.mark.p3
+    @pytest.mark.parametrize(
+        "meta_fields, expected_message",
+        [
+            ({"test": "test"}, ""),
+            ("test", "meta_fields must be a dictionary"),
+        ],
+    )
+    def test_meta_fields(self, add_documents, meta_fields, expected_message):
+        _, documents = add_documents
+        document = documents[0]
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                document.update({"meta_fields": meta_fields})
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            document.update({"meta_fields": meta_fields})
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "chunk_method, expected_message",
+        [
+            ("naive", ""),
+            ("manual", ""),
+            ("qa", ""),
+            ("table", ""),
+            ("paper", ""),
+            ("book", ""),
+            ("laws", ""),
+            ("presentation", ""),
+            ("picture", ""),
+            ("one", ""),
+            ("knowledge_graph", ""),
+            ("email", ""),
+            ("tag", ""),
+            ("", "`chunk_method` doesn't exist"),
+            ("other_chunk_method", "`chunk_method` other_chunk_method doesn't exist"),
+        ],
+    )
+    def test_chunk_method(self, add_documents, chunk_method, expected_message):
+        dataset, documents = add_documents
+        document = documents[0]
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                document.update({"chunk_method": chunk_method})
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            document.update({"chunk_method": chunk_method})
+            updated_doc = dataset.list_documents(id=document.id)[0]
+            assert updated_doc.chunk_method == chunk_method, str(updated_doc)
+
+    @pytest.mark.p3
+    @pytest.mark.parametrize(
+        "payload, expected_message",
+        [
+            ({"chunk_count": 1}, "Can't change `chunk_count`"),
+            pytest.param(
+                {"create_date": "Fri, 14 Mar 2025 16:53:42 GMT"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"create_time": 1},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"created_by": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"dataset_id": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"id": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"location": "ragflow_test.txt"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"process_begin_at": 1},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"process_duation": 1.0},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            ({"progress": 1.0}, "Can't change `progress`"),
+            pytest.param(
+                {"progress_msg": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"run": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"size": 1},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"source_type": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"thumbnail": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            ({"token_count": 1}, "Can't change `token_count`"),
+            pytest.param(
+                {"type": "ragflow_test"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"update_date": "Fri, 14 Mar 2025 16:33:17 GMT"},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+            pytest.param(
+                {"update_time": 1},
+                "The input parameters are invalid",
+                marks=pytest.mark.skip(reason="issues/6104"),
+            ),
+        ],
+    )
+    def test_invalid_field(self, add_documents, payload, expected_message):
+        _, documents = add_documents
+        document = documents[0]
+
+        with pytest.raises(Exception) as excinfo:
+            document.update(payload)
+        assert expected_message in str(excinfo.value), str(excinfo.value)
+
+
+class TestUpdateDocumentParserConfig:
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "chunk_method, parser_config, expected_message",
+        [
+            ("naive", {}, ""),
+            (
+                "naive",
+                {
+                    "chunk_token_num": 128,
+                    "layout_recognize": "DeepDOC",
+                    "html4excel": False,
+                    "delimiter": r"\n",
+                    "task_page_size": 12,
+                    "raptor": {"use_raptor": False},
+                },
+                "",
+            ),
+            pytest.param(
+                "naive",
+                {"chunk_token_num": -1},
+                "chunk_token_num should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"chunk_token_num": 0},
+                "chunk_token_num should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"chunk_token_num": 100000000},
+                "chunk_token_num should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"chunk_token_num": 3.14},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"chunk_token_num": "1024"},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            ("naive", {"layout_recognize": "DeepDOC"}, ""),
+            ("naive", {"layout_recognize": "Naive"}, ""),
+            ("naive", {"html4excel": True}, ""),
+            ("naive", {"html4excel": False}, ""),
+            pytest.param(
+                "naive",
+                {"html4excel": 1},
+                "html4excel should be True or False",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            ("naive", {"delimiter": ""}, ""),
+            ("naive", {"delimiter": "`##`"}, ""),
+            pytest.param(
+                "naive",
+                {"delimiter": 1},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"task_page_size": -1},
+                "task_page_size should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"task_page_size": 0},
+                "task_page_size should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"task_page_size": 100000000},
+                "task_page_size should be in range from 1 to 100000000",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"task_page_size": 3.14},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"task_page_size": "1024"},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            ("naive", {"raptor": {"use_raptor": True}}, ""),
+            ("naive", {"raptor": {"use_raptor": False}}, ""),
+            pytest.param(
+                "naive",
+                {"invalid_key": "invalid_value"},
+                "Abnormal 'parser_config'. Invalid key: invalid_key",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_keywords": -1},
+                "auto_keywords should be in range from 0 to 32",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_keywords": 32},
+                "auto_keywords should be in range from 0 to 32",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_keywords": 3.14},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_keywords": "1024"},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_questions": -1},
+                "auto_questions should be in range from 0 to 10",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_questions": 10},
+                "auto_questions should be in range from 0 to 10",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_questions": 3.14},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"auto_questions": "1024"},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"topn_tags": -1},
+                "topn_tags should be in range from 0 to 10",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"topn_tags": 10},
+                "topn_tags should be in range from 0 to 10",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"topn_tags": 3.14},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+            pytest.param(
+                "naive",
+                {"topn_tags": "1024"},
+                "",
+                marks=pytest.mark.skip(reason="issues/6098"),
+            ),
+        ],
+    )
+    def test_parser_config(self, client, add_documents, chunk_method, parser_config, expected_message):
+        dataset, documents = add_documents
+        document = documents[0]
+
+        update_data = {"chunk_method": chunk_method, "parser_config": parser_config}
+
+        if expected_message:
+            with pytest.raises(Exception) as excinfo:
+                document.update(update_data)
+            assert expected_message in str(excinfo.value), str(excinfo.value)
+        else:
+            document.update(update_data)
+            updated_doc = dataset.list_documents(id=document.id)[0]
+            if parser_config:
+                for k, v in parser_config.items():
+                    if isinstance(v, dict):
+                        for kk, vv in v.items():
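+                            # Nested settings (e.g. raptor.use_raptor) are checked
+                            # field by field via dotted attrgetter paths.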
+                            assert attrgetter(f"{k}.{kk}")(updated_doc.parser_config) == vv, str(updated_doc)
+                    else:
+                        assert attrgetter(k)(updated_doc.parser_config) == v, str(updated_doc)
+            else:
+                expected_config = DataSet.ParserConfig(
+                    client,
+                    {
+                        "chunk_token_num": 128,
+                        "delimiter": r"\n",
+                        "html4excel": False,
+                        "layout_recognize": "DeepDOC",
+                        "raptor": {"use_raptor": False},
+                    },
+                )
+                assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_upload_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_upload_documents.py
new file mode 100644
index 000000000..aed84e3fe
--- /dev/null
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_upload_documents.py
@@ -0,0 +1,210 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import string
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import pytest
+from configs import DOCUMENT_NAME_LIMIT
+from utils.file_utils import create_txt_file
+
+
+class TestDocumentsUpload:
+    @pytest.mark.p1
+    def test_valid_single_upload(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = create_txt_file(tmp_path / "ragflow_test.txt")
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        for document in documents:
+            assert document.dataset_id == dataset.id, str(document)
+            assert document.name == fp.name, str(document)
+
+    @pytest.mark.p1
+    @pytest.mark.parametrize(
+        "generate_test_files",
+        [
+            "docx",
+            "excel",
+            "ppt",
+            "image",
+            "pdf",
+            "txt",
+            "md",
+            "json",
+            "eml",
+            "html",
+        ],
+        indirect=True,
+    )
+    def test_file_type_validation(self, add_dataset_func, generate_test_files, request):
+        dataset = add_dataset_func
+        fp = generate_test_files[request.node.callspec.params["generate_test_files"]]
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        for document in documents:
+            assert document.dataset_id == dataset.id, str(document)
+            assert document.name == fp.name, str(document)
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "file_type",
+        ["exe", "unknown"],
+    )
+    def test_unsupported_file_type(self, add_dataset_func, tmp_path, file_type):
+        dataset = add_dataset_func
+        fp = tmp_path / f"ragflow_test.{file_type}"
+        fp.touch()
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        with pytest.raises(Exception) as excinfo:
+            dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        assert str(excinfo.value) == f"ragflow_test.{file_type}: This type of file has not been supported yet!", str(excinfo.value)
+
+    @pytest.mark.p2
+    def test_missing_file(self, add_dataset_func):
+        dataset = add_dataset_func
+        with pytest.raises(Exception) as excinfo:
+            dataset.upload_documents([])
+        assert str(excinfo.value) == "No file part!", str(excinfo.value)
+
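+    # Zero-byte uploads are accepted; the stored document size must be 0.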
+    @pytest.mark.p3
+    def test_empty_file(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = tmp_path / "empty.txt"
+        fp.touch()
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        for document in documents:
+            assert document.size == 0, str(document)
+
+    @pytest.mark.p3
+    def test_filename_empty(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = create_txt_file(tmp_path / "ragflow_test.txt")
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        with pytest.raises(Exception) as excinfo:
+            dataset.upload_documents([{"display_name": "", "blob": blob}])
+        assert str(excinfo.value) == "No file selected!", str(excinfo.value)
+
+    @pytest.mark.p2
+    def test_filename_exceeds_max_length(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt")
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        with pytest.raises(Exception) as excinfo:
+            dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        assert str(excinfo.value) == "File name should be less than 128 bytes.", str(excinfo.value)
+
+    @pytest.mark.p2
+    def test_duplicate_files(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = create_txt_file(tmp_path / "ragflow_test.txt")
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}, {"display_name": fp.name, "blob": blob}])
+
+        assert len(documents) == 2, str(documents)
+        for i, document in enumerate(documents):
+            assert document.dataset_id == dataset.id, str(document)
+            expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}"
+            assert document.name == expected_name, str(document)
+
+    @pytest.mark.p2
+    def test_same_file_repeat(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        fp = create_txt_file(tmp_path / "ragflow_test.txt")
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        for i in range(3):
+            documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+            assert len(documents) == 1, str(documents)
+            document = documents[0]
+            assert document.dataset_id == dataset.id, str(document)
+            expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}"
+            assert document.name == expected_name, str(document)
+
+    @pytest.mark.p3
+    def test_filename_special_characters(self, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        illegal_chars = '<>:"/\\|?*'
+        translation_table = str.maketrans({char: "_" for char in illegal_chars})
+        safe_filename = string.punctuation.translate(translation_table)
+        fp = tmp_path / f"{safe_filename}.txt"
+        fp.write_text("Sample text content")
+
+        with fp.open("rb") as f:
+            blob = f.read()
+
+        documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+        assert len(documents) == 1, str(documents)
+        document = documents[0]
+        assert document.dataset_id == dataset.id, str(document)
+        assert document.name == fp.name, str(document)
+
+    @pytest.mark.p1
+    def test_multiple_files(self, client, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        expected_document_count = 20
+        document_infos = []
+        for i in range(expected_document_count):
+            fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt")
+            with fp.open("rb") as f:
+                blob = f.read()
+            document_infos.append({"display_name": fp.name, "blob": blob})
+        documents = dataset.upload_documents(document_infos)
+        assert len(documents) == expected_document_count, str(documents)
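+
+        # Cross-check the server-side document_count, not just the upload response.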
+        retrieved_dataset = client.get_dataset(name=dataset.name)
+        assert retrieved_dataset.document_count == expected_document_count, str(retrieved_dataset)
+
+    @pytest.mark.p3
+    def test_concurrent_upload(self, client, add_dataset_func, tmp_path):
+        dataset = add_dataset_func
+        count = 20
+        fps = [create_txt_file(tmp_path / f"ragflow_test_{i}.txt") for i in range(count)]
+
+        def upload_file(fp):
+            with fp.open("rb") as f:
+                blob = f.read()
+            return dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
+
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = [executor.submit(upload_file, fp) for fp in fps]
+        responses = list(as_completed(futures))
+        assert len(responses) == count, responses
+
+        retrieved_dataset = client.get_dataset(name=dataset.name)
+        assert retrieved_dataset.document_count == count, str(retrieved_dataset)