Merge branch 'deploy/dev' into feat/knowledgebase-summaryIndex

This commit is contained in:
FFXN
2026-01-28 17:52:26 +08:00
committed by GitHub
10 changed files with 1371 additions and 14 deletions

View File

@ -41,7 +41,7 @@ from fields.document_fields import (
from libs.datetime_utils import naive_utc_now
from libs.login import current_account_with_tenant, login_required
from models import DatasetProcessRule, Document, DocumentSegment, UploadFile
from models.dataset import DocumentPipelineExecutionLog
from models.dataset import DocumentPipelineExecutionLog, DocumentSegmentSummary
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig, ProcessRule, RetrievalModel
from services.file_service import FileService

View File

@ -32,7 +32,7 @@ from extensions.ext_redis import redis_client
from fields.segment_fields import child_chunk_fields, segment_fields
from libs.helper import escape_like_pattern
from libs.login import current_account_with_tenant, login_required
from models.dataset import ChildChunk, DocumentSegment
from models.dataset import ChildChunk, DocumentSegment, DocumentSegmentSummary
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService, SegmentService
from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs

View File

@ -62,6 +62,21 @@ class DocumentExtractorNode(Node[DocumentExtractorNodeData]):
inputs = {"variable_selector": variable_selector}
process_data = {"documents": value if isinstance(value, list) else [value]}
# Ensure storage_key is loaded for File objects
files_to_check = value if isinstance(value, list) else [value]
files_needing_storage_key = [
f for f in files_to_check if isinstance(f, File) and not f.storage_key and f.related_id
]
if files_needing_storage_key:
from sqlalchemy.orm import Session
from extensions.ext_database import db
from factories.file_factory import StorageKeyLoader
with Session(bind=db.engine) as session:
storage_key_loader = StorageKeyLoader(session, tenant_id=self.tenant_id)
storage_key_loader.load_storage_keys(files_needing_storage_key)
try:
if isinstance(value, list):
extracted_text_list = list(map(_extract_text_from_file, value))
@ -415,6 +430,16 @@ def _download_file_content(file: File) -> bytes:
response.raise_for_status()
return response.content
else:
# Check if storage_key is set
if not file.storage_key:
raise FileDownloadError(f"File storage_key is missing for file: {file.filename}")
# Check if file exists before downloading
from extensions.ext_storage import storage
if not storage.exists(file.storage_key):
raise FileDownloadError(f"File not found in storage: {file.storage_key}")
return file_manager.download(file)
except Exception as e:
raise FileDownloadError(f"Error downloading file: {str(e)}") from e

View File

@ -0,0 +1,69 @@
"""add SummaryIndex feature
Revision ID: 562dcce7d77c
Revises: 03ea244985ce
Create Date: 2026-01-12 13:58:40.584802
"""
from alembic import op
import models as models
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '562dcce7d77c'
down_revision = '03ea244985ce'
branch_labels = None
depends_on = None
def upgrade():
    """Apply the SummaryIndex schema: create document_segment_summary and
    add summary-related columns to datasets and documents."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'document_segment_summary',
        sa.Column('id', models.types.StringUUID(), nullable=False),
        sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
        sa.Column('document_id', models.types.StringUUID(), nullable=False),
        sa.Column('chunk_id', models.types.StringUUID(), nullable=False),
        sa.Column('summary_content', models.types.LongText(), nullable=True),
        sa.Column('summary_index_node_id', sa.String(length=255), nullable=True),
        sa.Column('summary_index_node_hash', sa.String(length=255), nullable=True),
        # New rows start in the 'generating' state until the summary job finishes.
        sa.Column('status', sa.String(length=32), server_default=sa.text("'generating'"), nullable=False),
        sa.Column('error', models.types.LongText(), nullable=True),
        sa.Column('enabled', sa.Boolean(), server_default=sa.text('true'), nullable=False),
        sa.Column('disabled_at', sa.DateTime(), nullable=True),
        sa.Column('disabled_by', models.types.StringUUID(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='document_segment_summary_pkey'),
    )
    # One non-unique secondary index per frequently-filtered column.
    with op.batch_alter_table('document_segment_summary', schema=None) as batch_op:
        for indexed_column in ('chunk_id', 'dataset_id', 'document_id', 'status'):
            batch_op.create_index(
                f'document_segment_summary_{indexed_column}_idx',
                [indexed_column],
                unique=False,
            )
    with op.batch_alter_table('datasets', schema=None) as batch_op:
        batch_op.add_column(sa.Column('summary_index_setting', models.types.AdjustedJSON(), nullable=True))
    with op.batch_alter_table('documents', schema=None) as batch_op:
        batch_op.add_column(sa.Column('need_summary', sa.Boolean(), server_default=sa.text('false'), nullable=True))
    # ### end Alembic commands ###
def downgrade():
    """Revert the SummaryIndex schema changes in the reverse order of upgrade()."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('documents', schema=None) as batch_op:
        batch_op.drop_column('need_summary')
    with op.batch_alter_table('datasets', schema=None) as batch_op:
        batch_op.drop_column('summary_index_setting')
    # Drop the secondary indexes first, then the table itself.
    with op.batch_alter_table('document_segment_summary', schema=None) as batch_op:
        for indexed_column in ('status', 'document_id', 'dataset_id', 'chunk_id'):
            batch_op.drop_index(f'document_segment_summary_{indexed_column}_idx')
    op.drop_table('document_segment_summary')
    # ### end Alembic commands ###