Merge branch 'deploy/dev' into feat/knowledgebase-summaryIndex

This commit is contained in:
FFXN
2026-01-28 17:52:26 +08:00
committed by GitHub
10 changed files with 1371 additions and 14 deletions

View File

@ -41,7 +41,7 @@ from fields.document_fields import (
from libs.datetime_utils import naive_utc_now
from libs.login import current_account_with_tenant, login_required
from models import DatasetProcessRule, Document, DocumentSegment, UploadFile
from models.dataset import DocumentPipelineExecutionLog
from models.dataset import DocumentPipelineExecutionLog, DocumentSegmentSummary
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig, ProcessRule, RetrievalModel
from services.file_service import FileService

View File

@ -32,7 +32,7 @@ from extensions.ext_redis import redis_client
from fields.segment_fields import child_chunk_fields, segment_fields
from libs.helper import escape_like_pattern
from libs.login import current_account_with_tenant, login_required
from models.dataset import ChildChunk, DocumentSegment
from models.dataset import ChildChunk, DocumentSegment, DocumentSegmentSummary
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService, SegmentService
from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs

View File

@ -62,6 +62,21 @@ class DocumentExtractorNode(Node[DocumentExtractorNodeData]):
inputs = {"variable_selector": variable_selector}
process_data = {"documents": value if isinstance(value, list) else [value]}
# Ensure storage_key is loaded for File objects
files_to_check = value if isinstance(value, list) else [value]
files_needing_storage_key = [
f for f in files_to_check if isinstance(f, File) and not f.storage_key and f.related_id
]
if files_needing_storage_key:
from sqlalchemy.orm import Session
from extensions.ext_database import db
from factories.file_factory import StorageKeyLoader
with Session(bind=db.engine) as session:
storage_key_loader = StorageKeyLoader(session, tenant_id=self.tenant_id)
storage_key_loader.load_storage_keys(files_needing_storage_key)
try:
if isinstance(value, list):
extracted_text_list = list(map(_extract_text_from_file, value))
@ -415,6 +430,16 @@ def _download_file_content(file: File) -> bytes:
response.raise_for_status()
return response.content
else:
# Check if storage_key is set
if not file.storage_key:
raise FileDownloadError(f"File storage_key is missing for file: {file.filename}")
# Check if file exists before downloading
from extensions.ext_storage import storage
if not storage.exists(file.storage_key):
raise FileDownloadError(f"File not found in storage: {file.storage_key}")
return file_manager.download(file)
except Exception as e:
raise FileDownloadError(f"Error downloading file: {str(e)}") from e

View File

@ -0,0 +1,69 @@
"""add SummaryIndex feature
Revision ID: 562dcce7d77c
Revises: 03ea244985ce
Create Date: 2026-01-12 13:58:40.584802
"""
from alembic import op
import models as models
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '562dcce7d77c'
down_revision = '03ea244985ce'
branch_labels = None
depends_on = None
def upgrade():
    """Apply the SummaryIndex schema: create document_segment_summary and
    add summary-related columns to datasets and documents."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'document_segment_summary',
        sa.Column('id', models.types.StringUUID(), nullable=False),
        sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
        sa.Column('document_id', models.types.StringUUID(), nullable=False),
        sa.Column('chunk_id', models.types.StringUUID(), nullable=False),
        sa.Column('summary_content', models.types.LongText(), nullable=True),
        sa.Column('summary_index_node_id', sa.String(length=255), nullable=True),
        sa.Column('summary_index_node_hash', sa.String(length=255), nullable=True),
        # New rows start in the 'generating' state until the summary job finishes.
        sa.Column('status', sa.String(length=32), server_default=sa.text("'generating'"), nullable=False),
        sa.Column('error', models.types.LongText(), nullable=True),
        sa.Column('enabled', sa.Boolean(), server_default=sa.text('true'), nullable=False),
        sa.Column('disabled_at', sa.DateTime(), nullable=True),
        sa.Column('disabled_by', models.types.StringUUID(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='document_segment_summary_pkey'),
    )
    # One non-unique secondary index per frequently-filtered column.
    with op.batch_alter_table('document_segment_summary', schema=None) as batch_op:
        for indexed_column in ('chunk_id', 'dataset_id', 'document_id', 'status'):
            batch_op.create_index(
                f'document_segment_summary_{indexed_column}_idx',
                [indexed_column],
                unique=False,
            )
    with op.batch_alter_table('datasets', schema=None) as batch_op:
        batch_op.add_column(sa.Column('summary_index_setting', models.types.AdjustedJSON(), nullable=True))
    with op.batch_alter_table('documents', schema=None) as batch_op:
        batch_op.add_column(sa.Column('need_summary', sa.Boolean(), server_default=sa.text('false'), nullable=True))
    # ### end Alembic commands ###
def downgrade():
    """Revert the SummaryIndex schema changes in the reverse order of upgrade()."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('documents', schema=None) as batch_op:
        batch_op.drop_column('need_summary')
    with op.batch_alter_table('datasets', schema=None) as batch_op:
        batch_op.drop_column('summary_index_setting')
    # Drop the secondary indexes first, then the table itself.
    with op.batch_alter_table('document_segment_summary', schema=None) as batch_op:
        for indexed_column in ('status', 'document_id', 'dataset_id', 'chunk_id'):
            batch_op.drop_index(f'document_segment_summary_{indexed_column}_idx')
    op.drop_table('document_segment_summary')
    # ### end Alembic commands ###