merge main

This commit is contained in:
zxhlyh
2025-07-28 16:00:38 +08:00
344 changed files with 2476 additions and 1275 deletions

View File

@@ -972,7 +972,7 @@ class DocumentRetryApi(DocumentResource):
raise DocumentAlreadyFinishedError()
retry_documents.append(document)
except Exception:
logging.exception(f"Failed to retry document, document id: {document_id}")
logging.exception("Failed to retry document, document id: %s", document_id)
continue
# retry document
DocumentService.retry_document(dataset_id, retry_documents)

View File

@@ -1,6 +1,5 @@
import uuid
import pandas as pd
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
@@ -14,8 +13,6 @@ from controllers.console.datasets.error import (
ChildChunkDeleteIndexError,
ChildChunkIndexingError,
InvalidActionError,
NoFileUploadedError,
TooManyFilesError,
)
from controllers.console.wraps import (
account_initialization_required,
@@ -32,6 +29,7 @@ from extensions.ext_redis import redis_client
from fields.segment_fields import child_chunk_fields, segment_fields
from libs.login import login_required
from models.dataset import ChildChunk, DocumentSegment
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService, SegmentService
from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
@@ -184,7 +182,7 @@ class DatasetDocumentSegmentApi(Resource):
raise ProviderNotInitializeError(ex.description)
segment_ids = request.args.getlist("segment_id")
document_indexing_cache_key = "document_{}_indexing".format(document.id)
document_indexing_cache_key = f"document_{document.id}_indexing"
cache_result = redis_client.get(document_indexing_cache_key)
if cache_result is not None:
raise InvalidActionError("Document is being indexed, please try again later")
@@ -365,37 +363,28 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
document = DocumentService.get_document(dataset_id, document_id)
if not document:
raise NotFound("Document not found.")
# get file from request
file = request.files["file"]
# check file
if "file" not in request.files:
raise NoFileUploadedError()
if len(request.files) > 1:
raise TooManyFilesError()
parser = reqparse.RequestParser()
parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
args = parser.parse_args()
upload_file_id = args["upload_file_id"]
upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
if not upload_file:
raise NotFound("UploadFile not found.")
# check file type
if not file.filename or not file.filename.lower().endswith(".csv"):
if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
raise ValueError("Invalid file type. Only CSV files are allowed")
try:
# Skip the first row
df = pd.read_csv(file)
result = []
for index, row in df.iterrows():
if document.doc_form == "qa_model":
data = {"content": row.iloc[0], "answer": row.iloc[1]}
else:
data = {"content": row.iloc[0]}
result.append(data)
if len(result) == 0:
raise ValueError("The CSV file is empty.")
# async job
job_id = str(uuid.uuid4())
indexing_cache_key = "segment_batch_import_{}".format(str(job_id))
indexing_cache_key = f"segment_batch_import_{str(job_id)}"
# send batch add segments task
redis_client.setnx(indexing_cache_key, "waiting")
batch_create_segment_to_index_task.delay(
str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
)
except Exception as e:
return {"error": str(e)}, 500
@@ -406,7 +395,7 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
@account_initialization_required
def get(self, job_id):
job_id = str(job_id)
indexing_cache_key = "segment_batch_import_{}".format(job_id)
indexing_cache_key = f"segment_batch_import_{job_id}"
cache_result = redis_client.get(indexing_cache_key)
if cache_result is None:
raise ValueError("The job does not exist.")