merge main

2026-05-03 08:58:09 +08:00 · 2025-07-28 16:00:38 +08:00
parent acae51c309 15757110cf
commit a95cf6f8b0
344 changed files with 2476 additions and 1275 deletions
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -972,7 +972,7 @@ class DocumentRetryApi(DocumentResource):
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception:
-                logging.exception(f"Failed to retry document, document id: {document_id}")
+                logging.exception("Failed to retry document, document id: %s", document_id)
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)
--- a/api/controllers/console/datasets/datasets_segments.py
+++ b/api/controllers/console/datasets/datasets_segments.py
@@ -1,6 +1,5 @@
 import uuid

-import pandas as pd
 from flask import request
 from flask_login import current_user
 from flask_restful import Resource, marshal, reqparse
@@ -14,8 +13,6 @@ from controllers.console.datasets.error import (
    ChildChunkDeleteIndexError,
    ChildChunkIndexingError,
    InvalidActionError,
-    NoFileUploadedError,
-    TooManyFilesError,
 )
 from controllers.console.wraps import (
    account_initialization_required,
@@ -32,6 +29,7 @@ from extensions.ext_redis import redis_client
 from fields.segment_fields import child_chunk_fields, segment_fields
 from libs.login import login_required
 from models.dataset import ChildChunk, DocumentSegment
+from models.model import UploadFile
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
 from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
@@ -184,7 +182,7 @@ class DatasetDocumentSegmentApi(Resource):
                raise ProviderNotInitializeError(ex.description)
        segment_ids = request.args.getlist("segment_id")

-        document_indexing_cache_key = "document_{}_indexing".format(document.id)
+        document_indexing_cache_key = f"document_{document.id}_indexing"
        cache_result = redis_client.get(document_indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later")
@@ -365,37 +363,28 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")
-        # get file from request
-        file = request.files["file"]
-        # check file
-        if "file" not in request.files:
-            raise NoFileUploadedError()

-        if len(request.files) > 1:
-            raise TooManyFilesError()
+        parser = reqparse.RequestParser()
+        parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+        upload_file_id = args["upload_file_id"]
+
+        upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
+        if not upload_file:
+            raise NotFound("UploadFile not found.")
+
        # check file type
-        if not file.filename or not file.filename.lower().endswith(".csv"):
+        if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
            raise ValueError("Invalid file type. Only CSV files are allowed")

        try:
-            # Skip the first row
-            df = pd.read_csv(file)
-            result = []
-            for index, row in df.iterrows():
-                if document.doc_form == "qa_model":
-                    data = {"content": row.iloc[0], "answer": row.iloc[1]}
-                else:
-                    data = {"content": row.iloc[0]}
-                result.append(data)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
            # async job
            job_id = str(uuid.uuid4())
-            indexing_cache_key = "segment_batch_import_{}".format(str(job_id))
+            indexing_cache_key = f"segment_batch_import_{str(job_id)}"
            # send batch add segments task
            redis_client.setnx(indexing_cache_key, "waiting")
            batch_create_segment_to_index_task.delay(
-                str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
+                str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
            )
        except Exception as e:
            return {"error": str(e)}, 500
@@ -406,7 +395,7 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
    @account_initialization_required
    def get(self, job_id):
        job_id = str(job_id)
-        indexing_cache_key = "segment_batch_import_{}".format(job_id)
+        indexing_cache_key = f"segment_batch_import_{job_id}"
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is None:
            raise ValueError("The job does not exist.")