Mirror of https://github.com/langgenius/dify.git (synced 2026-04-30 23:48:04 +08:00)

Commit: merge feat/plugins
@@ -6,11 +6,14 @@ from flask import Flask, current_app
from core.rag.data_post_processor.data_post_processor import DataPostProcessor
from core.rag.datasource.keyword.keyword_factory import Keyword
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.embedding.retrieval import RetrievalSegments
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.models.document import Document
from core.rag.rerank.rerank_type import RerankMode
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from models.dataset import Dataset
from models.dataset import ChildChunk, Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
from services.external_knowledge_service import ExternalDatasetService

default_retrieval_model = {
@@ -248,3 +251,89 @@ class RetrievalService:
    @staticmethod
    def escape_query_for_search(query: str) -> str:
        return query.replace('"', '\\"')

    @staticmethod
    def format_retrieval_documents(documents: list[Document]) -> list[RetrievalSegments]:
        records = []
        include_segment_ids = []
        segment_child_map = {}
        for document in documents:
            document_id = document.metadata.get("document_id")
            dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()
            if dataset_document:
                if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                    child_index_node_id = document.metadata.get("doc_id")
                    result = (
                        db.session.query(ChildChunk, DocumentSegment)
                        .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
                        .filter(
                            ChildChunk.index_node_id == child_index_node_id,
                            DocumentSegment.dataset_id == dataset_document.dataset_id,
                            DocumentSegment.enabled == True,
                            DocumentSegment.status == "completed",
                        )
                        .first()
                    )
                    if result:
                        child_chunk, segment = result
                        if not segment:
                            continue
                        if segment.id not in include_segment_ids:
                            include_segment_ids.append(segment.id)
                            child_chunk_detail = {
                                "id": child_chunk.id,
                                "content": child_chunk.content,
                                "position": child_chunk.position,
                                "score": document.metadata.get("score", 0.0),
                            }
                            map_detail = {
                                "max_score": document.metadata.get("score", 0.0),
                                "child_chunks": [child_chunk_detail],
                            }
                            segment_child_map[segment.id] = map_detail
                            record = {
                                "segment": segment,
                            }
                            records.append(record)
                        else:
                            child_chunk_detail = {
                                "id": child_chunk.id,
                                "content": child_chunk.content,
                                "position": child_chunk.position,
                                "score": document.metadata.get("score", 0.0),
                            }
                            segment_child_map[segment.id]["child_chunks"].append(child_chunk_detail)
                            segment_child_map[segment.id]["max_score"] = max(
                                segment_child_map[segment.id]["max_score"], document.metadata.get("score", 0.0)
                            )
                    else:
                        continue
                else:
                    index_node_id = document.metadata["doc_id"]

                    segment = (
                        db.session.query(DocumentSegment)
                        .filter(
                            DocumentSegment.dataset_id == dataset_document.dataset_id,
                            DocumentSegment.enabled == True,
                            DocumentSegment.status == "completed",
                            DocumentSegment.index_node_id == index_node_id,
                        )
                        .first()
                    )

                    if not segment:
                        continue
                    include_segment_ids.append(segment.id)
                    record = {
                        "segment": segment,
                        "score": document.metadata.get("score", None),
                    }

                    records.append(record)
        for record in records:
            if record["segment"].id in segment_child_map:
                record["child_chunks"] = segment_child_map[record["segment"].id].get("child_chunks", None)
                record["score"] = segment_child_map[record["segment"].id]["max_score"]

        return [RetrievalSegments(**record) for record in records]
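Note (illustrative, not part of the commit): format_retrieval_documents groups child-chunk hits under their parent DocumentSegment and keeps the highest child score as the segment score. A minimal sketch of the intended call pattern, assuming scored_docs are Documents returned by RetrievalService.retrieve with document_id, doc_id and score in their metadata:

# Hedged sketch; `scored_docs` is assumed to come from RetrievalService.retrieve().
from core.rag.datasource.retrieval_service import RetrievalService

records = RetrievalService.format_retrieval_documents(scored_docs)
for record in records:
    # record.segment is the parent DocumentSegment; record.child_chunks holds the
    # matching child chunks (parent-child datasets only); record.score is the best child score.
    print(record.segment.id, record.score, len(record.child_chunks or []))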
@@ -7,7 +7,7 @@ from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import ChildChunk, Dataset, DocumentSegment


class DatasetDocumentStore:
@@ -60,7 +60,7 @@ class DatasetDocumentStore:

        return output

    def add_documents(self, docs: Sequence[Document], allow_update: bool = True) -> None:
    def add_documents(self, docs: Sequence[Document], allow_update: bool = True, save_child: bool = False) -> None:
        max_position = (
            db.session.query(func.max(DocumentSegment.position))
            .filter(DocumentSegment.document_id == self._document_id)
@@ -120,13 +120,55 @@ class DatasetDocumentStore:
                    segment_document.answer = doc.metadata.pop("answer", "")

                db.session.add(segment_document)
                db.session.flush()
                if save_child:
                    if doc.children:
                        for postion, child in enumerate(doc.children, start=1):
                            child_segment = ChildChunk(
                                tenant_id=self._dataset.tenant_id,
                                dataset_id=self._dataset.id,
                                document_id=self._document_id,
                                segment_id=segment_document.id,
                                position=postion,
                                index_node_id=child.metadata.get("doc_id"),
                                index_node_hash=child.metadata.get("doc_hash"),
                                content=child.page_content,
                                word_count=len(child.page_content),
                                type="automatic",
                                created_by=self._user_id,
                            )
                            db.session.add(child_segment)
            else:
                segment_document.content = doc.page_content
                if doc.metadata.get("answer"):
                    segment_document.answer = doc.metadata.pop("answer", "")
                segment_document.index_node_hash = doc.metadata["doc_hash"]
                segment_document.index_node_hash = doc.metadata.get("doc_hash")
                segment_document.word_count = len(doc.page_content)
                segment_document.tokens = tokens
                if save_child and doc.children:
                    # delete the existing child chunks
                    db.session.query(ChildChunk).filter(
                        ChildChunk.tenant_id == self._dataset.tenant_id,
                        ChildChunk.dataset_id == self._dataset.id,
                        ChildChunk.document_id == self._document_id,
                        ChildChunk.segment_id == segment_document.id,
                    ).delete()
                    # add new child chunks
                    for position, child in enumerate(doc.children, start=1):
                        child_segment = ChildChunk(
                            tenant_id=self._dataset.tenant_id,
                            dataset_id=self._dataset.id,
                            document_id=self._document_id,
                            segment_id=segment_document.id,
                            position=position,
                            index_node_id=child.metadata.get("doc_id"),
                            index_node_hash=child.metadata.get("doc_hash"),
                            content=child.page_content,
                            word_count=len(child.page_content),
                            type="automatic",
                            created_by=self._user_id,
                        )
                        db.session.add(child_segment)

            db.session.commit()
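Note (illustrative, not part of the commit): with save_child=True, add_documents persists each document's ChildDocument entries as ChildChunk rows tied to the parent segment; on updates it deletes the existing child chunks for that segment before re-inserting them, so re-indexing does not accumulate stale rows. A rough usage sketch, assuming store is an already-constructed DatasetDocumentStore and docs are Documents whose children were populated by the parent-child processor:

# Hedged sketch; `store` and `docs` are assumed to exist as described above.
store.add_documents(docs, allow_update=True, save_child=True)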
api/core/rag/embedding/retrieval.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from typing import Optional

from pydantic import BaseModel

from models.dataset import DocumentSegment


class RetrievalChildChunk(BaseModel):
    """Retrieval segments."""

    id: str
    content: str
    score: float
    position: int


class RetrievalSegments(BaseModel):
    """Retrieval segments."""

    model_config = {"arbitrary_types_allowed": True}
    segment: DocumentSegment
    child_chunks: Optional[list[RetrievalChildChunk]] = None
    score: Optional[float] = None
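Note (illustrative, not part of the commit): both classes are plain pydantic containers, so they can be constructed directly; segment must be a DocumentSegment ORM object, which is why arbitrary_types_allowed is set. A small construction sketch:

# Hedged sketch; `segment` is assumed to be a DocumentSegment loaded elsewhere.
chunk = RetrievalChildChunk(id="child-1", content="child text", score=0.83, position=1)
result = RetrievalSegments(segment=segment, child_chunks=[chunk], score=0.83)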
@@ -4,7 +4,7 @@ import os
from typing import Optional, cast

import pandas as pd
from openpyxl import load_workbook
from openpyxl import load_workbook  # type: ignore

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -24,7 +24,6 @@ from core.rag.extractor.unstructured.unstructured_markdown_extractor import Unst
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
@@ -103,12 +102,11 @@ class ExtractProcessor:
            input_file = Path(file_path)
            file_extension = input_file.suffix.lower()
            etl_type = dify_config.ETL_TYPE
            unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
            unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
            assert unstructured_api_url is not None, "unstructured_api_url is required"
            assert unstructured_api_key is not None, "unstructured_api_key is required"
            extractor: Optional[BaseExtractor] = None
            if etl_type == "Unstructured":
                unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
                unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""

                if file_extension in {".xlsx", ".xls"}:
                    extractor = ExcelExtractor(file_path)
                elif file_extension == ".pdf":
@@ -141,11 +139,7 @@ class ExtractProcessor:
                    extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
                else:
                    # txt
                    extractor = (
                        UnstructuredTextExtractor(file_path, unstructured_api_url)
                        if is_automatic
                        else TextExtractor(file_path, autodetect_encoding=True)
                    )
                    extractor = TextExtractor(file_path, autodetect_encoding=True)
            else:
                if file_extension in {".xlsx", ".xls"}:
                    extractor = ExcelExtractor(file_path)
@@ -1,5 +1,6 @@
import base64
import logging
from typing import Optional

from bs4 import BeautifulSoup  # type: ignore

@@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
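Note (illustrative, not part of the commit): across the Unstructured-based extractors in this diff, api_url becomes optional and api_key defaults to an empty string, so an extractor can be constructed without an Unstructured API endpoint; as the epub extractor in the next hunk shows, a missing api_url falls back to local partitioning. A hedged sketch of both construction styles, with illustrative paths and credentials:

# Hedged sketch; file path, URL and key are illustrative values only.
local_extractor = UnstructuredEmailExtractor("/tmp/mail.eml")  # no API endpoint configured
api_extractor = UnstructuredEmailExtractor(
    "/tmp/mail.eml", api_url="https://unstructured.example.com", api_key="example-key"
)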
@@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
        self,
        file_path: str,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        api_key: str = "",
    ):
        """Initialize with file path."""
        self._file_path = file_path
@@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            if self._api_key is None:
                raise ValueError("api_key is required")

            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.epub import partition_epub
@@ -1,4 +1,5 @@
import logging
from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
            if the specified encoding fails.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
@@ -1,4 +1,5 @@
import logging
from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
@@ -1,4 +1,5 @@
import logging
from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
@@ -1,4 +1,5 @@
import logging
from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
@@ -1,4 +1,5 @@
import logging
from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
@@ -267,8 +267,10 @@ class WordExtractor(BaseExtractor):
            if isinstance(element.tag, str) and element.tag.endswith("p"):  # paragraph
                para = paragraphs.pop(0)
                parsed_paragraph = parse_paragraph(para)
                if parsed_paragraph:
                if parsed_paragraph.strip():
                    content.append(parsed_paragraph)
                else:
                    content.append("\n")
            elif isinstance(element.tag, str) and element.tag.endswith("tbl"):  # table
                table = tables.pop(0)
                content.append(self._table_to_markdown(table, image_map))
@@ -1,8 +1,7 @@
from enum import Enum


class IndexType(Enum):
class IndexType(str, Enum):
    PARAGRAPH_INDEX = "text_model"
    QA_INDEX = "qa_model"
    PARENT_CHILD_INDEX = "parent_child_index"
    SUMMARY_INDEX = "summary_index"
    PARENT_CHILD_INDEX = "hierarchical_model"
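Note (illustrative, not part of the commit): switching to a str-mixin enum matters because doc_form is stored as a plain string; str-enum members compare equal to their values, which is what lets the factory below drop the explicit .value comparisons. A small illustration of the behavior difference:

# Behavior of a str-mixin enum (standard Python, shown for clarity):
IndexType.PARAGRAPH_INDEX == "text_model"             # True with class IndexType(str, Enum)
IndexType.PARENT_CHILD_INDEX == "hierarchical_model"  # True
# With a plain Enum, both comparisons are False and only IndexType.X.value matches the string.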
@@ -27,10 +27,10 @@ class BaseIndexProcessor(ABC):
        raise NotImplementedError

    @abstractmethod
    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        raise NotImplementedError

    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        raise NotImplementedError

    @abstractmethod
@@ -45,26 +45,29 @@ class BaseIndexProcessor(ABC):
    ) -> list[Document]:
        raise NotImplementedError

    def _get_splitter(self, processing_rule: dict, embedding_model_instance: Optional[ModelInstance]) -> TextSplitter:
    def _get_splitter(
        self,
        processing_rule_mode: str,
        max_tokens: int,
        chunk_overlap: int,
        separator: str,
        embedding_model_instance: Optional[ModelInstance],
    ) -> TextSplitter:
        """
        Get the NodeParser object according to the processing rule.
        """
        character_splitter: TextSplitter
        if processing_rule["mode"] == "custom":
        if processing_rule_mode in ["custom", "hierarchical"]:
            # The user-defined segmentation rule
            rules = processing_rule["rules"]
            segmentation = rules["segmentation"]
            max_segmentation_tokens_length = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
            if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
            if max_tokens < 50 or max_tokens > max_segmentation_tokens_length:
                raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")

            separator = segmentation["separator"]
            if separator:
                separator = separator.replace("\\n", "\n")

            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=segmentation["max_tokens"],
                chunk_overlap=segmentation.get("chunk_overlap", 0) or 0,
                chunk_size=max_tokens,
                chunk_overlap=chunk_overlap,
                fixed_separator=separator,
                separators=["\n\n", "。", ". ", " ", ""],
                embedding_model_instance=embedding_model_instance,
@@ -78,4 +81,4 @@ class BaseIndexProcessor(ABC):
                embedding_model_instance=embedding_model_instance,
            )

        return character_splitter
        return character_splitter  # type: ignore
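Note (illustrative, not part of the commit): _get_splitter now takes the mode and segmentation numbers as explicit arguments instead of the whole processing_rule dict, so the same helper can build both the parent splitter (from rules.segmentation) and the child splitter (from rules.subchunk_segmentation). A hedged sketch of the two call sites, mirroring the processors later in this diff; rules and mode are assumed to be parsed upstream:

# Hedged sketch; `rules` is a parsed Rule and `mode` is process_rule["mode"].
parent_splitter = self._get_splitter(
    processing_rule_mode=mode,
    max_tokens=rules.segmentation.max_tokens,
    chunk_overlap=rules.segmentation.chunk_overlap,
    separator=rules.segmentation.separator,
    embedding_model_instance=embedding_model_instance,
)
child_splitter = self._get_splitter(
    processing_rule_mode=mode,
    max_tokens=rules.subchunk_segmentation.max_tokens,
    chunk_overlap=rules.subchunk_segmentation.chunk_overlap,
    separator=rules.subchunk_segmentation.separator,
    embedding_model_instance=embedding_model_instance,
)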
@@ -3,6 +3,7 @@
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
from core.rag.index_processor.processor.parent_child_index_processor import ParentChildIndexProcessor
from core.rag.index_processor.processor.qa_index_processor import QAIndexProcessor


@@ -18,9 +19,11 @@ class IndexProcessorFactory:
        if not self._index_type:
            raise ValueError("Index type must be specified.")

        if self._index_type == IndexType.PARAGRAPH_INDEX.value:
        if self._index_type == IndexType.PARAGRAPH_INDEX:
            return ParagraphIndexProcessor()
        elif self._index_type == IndexType.QA_INDEX.value:
        elif self._index_type == IndexType.QA_INDEX:
            return QAIndexProcessor()
        elif self._index_type == IndexType.PARENT_CHILD_INDEX:
            return ParentChildIndexProcessor()
        else:
            raise ValueError(f"Index type {self._index_type} is not supported.")
@@ -13,21 +13,40 @@ from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import Document
from core.tools.utils.text_processing_utils import remove_leading_symbols
from libs import helper
from models.dataset import Dataset
from models.dataset import Dataset, DatasetProcessRule
from services.entities.knowledge_entities.knowledge_entities import Rule


class ParagraphIndexProcessor(BaseIndexProcessor):
    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
        text_docs = ExtractProcessor.extract(
            extract_setting=extract_setting, is_automatic=kwargs.get("process_rule_mode") == "automatic"
            extract_setting=extract_setting,
            is_automatic=(
                kwargs.get("process_rule_mode") == "automatic" or kwargs.get("process_rule_mode") == "hierarchical"
            ),
        )

        return text_docs

    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
        process_rule = kwargs.get("process_rule")
        if not process_rule:
            raise ValueError("No process rule found.")
        if process_rule.get("mode") == "automatic":
            automatic_rule = DatasetProcessRule.AUTOMATIC_RULES
            rules = Rule(**automatic_rule)
        else:
            if not process_rule.get("rules"):
                raise ValueError("No rules found in process rule.")
            rules = Rule(**process_rule.get("rules"))
        # Split the text documents into nodes.
        if not rules.segmentation:
            raise ValueError("No segmentation found in rules.")
        splitter = self._get_splitter(
            processing_rule=kwargs.get("process_rule", {}),
            processing_rule_mode=process_rule.get("mode"),
            max_tokens=rules.segmentation.max_tokens,
            chunk_overlap=rules.segmentation.chunk_overlap,
            separator=rules.segmentation.separator,
            embedding_model_instance=kwargs.get("embedding_model_instance"),
        )
        all_documents = []
@@ -53,15 +72,19 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
            all_documents.extend(split_documents)
        return all_documents

    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            vector.create(documents)
        if with_keywords:
            keywords_list = kwargs.get("keywords_list")
            keyword = Keyword(dataset)
            keyword.create(documents)
            if keywords_list and len(keywords_list) > 0:
                keyword.add_texts(documents, keywords_list=keywords_list)
            else:
                keyword.add_texts(documents)

    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            if node_ids:
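Note (illustrative, not part of the commit): load now accepts extra keyword arguments; when a keywords_list is supplied it is forwarded to Keyword.add_texts so pre-computed keywords are reused instead of being re-extracted. A hedged call sketch, assuming keywords_list is one keyword list per document (that alignment is an assumption, not shown in the diff):

# Hedged sketch; `dataset` and `documents` are assumed to be prepared upstream.
processor = ParagraphIndexProcessor()
processor.load(
    dataset,
    documents,
    with_keywords=True,
    keywords_list=[["pricing", "refund"], ["setup", "api key"]],  # assumed: one list per document
)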
@@ -0,0 +1,195 @@
"""Paragraph index processor."""

import uuid
from typing import Optional

from core.model_manager import ModelInstance
from core.rag.cleaner.clean_processor import CleanProcessor
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from libs import helper
from models.dataset import ChildChunk, Dataset, DocumentSegment
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule


class ParentChildIndexProcessor(BaseIndexProcessor):
    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
        text_docs = ExtractProcessor.extract(
            extract_setting=extract_setting,
            is_automatic=(
                kwargs.get("process_rule_mode") == "automatic" or kwargs.get("process_rule_mode") == "hierarchical"
            ),
        )

        return text_docs

    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
        process_rule = kwargs.get("process_rule")
        if not process_rule:
            raise ValueError("No process rule found.")
        if not process_rule.get("rules"):
            raise ValueError("No rules found in process rule.")
        rules = Rule(**process_rule.get("rules"))
        all_documents = []  # type: ignore
        if rules.parent_mode == ParentMode.PARAGRAPH:
            # Split the text documents into nodes.
            splitter = self._get_splitter(
                processing_rule_mode=process_rule.get("mode"),
                max_tokens=rules.segmentation.max_tokens,
                chunk_overlap=rules.segmentation.chunk_overlap,
                separator=rules.segmentation.separator,
                embedding_model_instance=kwargs.get("embedding_model_instance"),
            )
            for document in documents:
                # document clean
                document_text = CleanProcessor.clean(document.page_content, process_rule)
                document.page_content = document_text
                # parse document to nodes
                document_nodes = splitter.split_documents([document])
                split_documents = []
                for document_node in document_nodes:
                    if document_node.page_content.strip():
                        doc_id = str(uuid.uuid4())
                        hash = helper.generate_text_hash(document_node.page_content)
                        document_node.metadata["doc_id"] = doc_id
                        document_node.metadata["doc_hash"] = hash
                        # delete Splitter character
                        page_content = document_node.page_content
                        if page_content.startswith(".") or page_content.startswith("。"):
                            page_content = page_content[1:].strip()
                        else:
                            page_content = page_content
                        if len(page_content) > 0:
                            document_node.page_content = page_content
                            # parse document to child nodes
                            child_nodes = self._split_child_nodes(
                                document_node, rules, process_rule.get("mode"), kwargs.get("embedding_model_instance")
                            )
                            document_node.children = child_nodes
                            split_documents.append(document_node)
                all_documents.extend(split_documents)
        elif rules.parent_mode == ParentMode.FULL_DOC:
            page_content = "\n".join([document.page_content for document in documents])
            document = Document(page_content=page_content, metadata=documents[0].metadata)
            # parse document to child nodes
            child_nodes = self._split_child_nodes(
                document, rules, process_rule.get("mode"), kwargs.get("embedding_model_instance")
            )
            document.children = child_nodes
            doc_id = str(uuid.uuid4())
            hash = helper.generate_text_hash(document.page_content)
            document.metadata["doc_id"] = doc_id
            document.metadata["doc_hash"] = hash
            all_documents.append(document)

        return all_documents

    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            for document in documents:
                child_documents = document.children
                if child_documents:
                    formatted_child_documents = [
                        Document(**child_document.model_dump()) for child_document in child_documents
                    ]
                    vector.create(formatted_child_documents)

    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        # node_ids is segment's node_ids
        if dataset.indexing_technique == "high_quality":
            delete_child_chunks = kwargs.get("delete_child_chunks") or False
            vector = Vector(dataset)
            if node_ids:
                child_node_ids = (
                    db.session.query(ChildChunk.index_node_id)
                    .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
                    .filter(
                        DocumentSegment.dataset_id == dataset.id,
                        DocumentSegment.index_node_id.in_(node_ids),
                        ChildChunk.dataset_id == dataset.id,
                    )
                    .all()
                )
                child_node_ids = [child_node_id[0] for child_node_id in child_node_ids]
                vector.delete_by_ids(child_node_ids)
                if delete_child_chunks:
                    db.session.query(ChildChunk).filter(
                        ChildChunk.dataset_id == dataset.id, ChildChunk.index_node_id.in_(child_node_ids)
                    ).delete()
                    db.session.commit()
            else:
                vector.delete()

                if delete_child_chunks:
                    db.session.query(ChildChunk).filter(ChildChunk.dataset_id == dataset.id).delete()
                    db.session.commit()

    def retrieve(
        self,
        retrieval_method: str,
        query: str,
        dataset: Dataset,
        top_k: int,
        score_threshold: float,
        reranking_model: dict,
    ) -> list[Document]:
        # Set search parameters.
        results = RetrievalService.retrieve(
            retrieval_method=retrieval_method,
            dataset_id=dataset.id,
            query=query,
            top_k=top_k,
            score_threshold=score_threshold,
            reranking_model=reranking_model,
        )
        # Organize results.
        docs = []
        for result in results:
            metadata = result.metadata
            metadata["score"] = result.score
            if result.score > score_threshold:
                doc = Document(page_content=result.page_content, metadata=metadata)
                docs.append(doc)
        return docs

    def _split_child_nodes(
        self,
        document_node: Document,
        rules: Rule,
        process_rule_mode: str,
        embedding_model_instance: Optional[ModelInstance],
    ) -> list[ChildDocument]:
        if not rules.subchunk_segmentation:
            raise ValueError("No subchunk segmentation found in rules.")
        child_splitter = self._get_splitter(
            processing_rule_mode=process_rule_mode,
            max_tokens=rules.subchunk_segmentation.max_tokens,
            chunk_overlap=rules.subchunk_segmentation.chunk_overlap,
            separator=rules.subchunk_segmentation.separator,
            embedding_model_instance=embedding_model_instance,
        )
        # parse document to child nodes
        child_nodes = []
        child_documents = child_splitter.split_documents([document_node])
        for child_document_node in child_documents:
            if child_document_node.page_content.strip():
                doc_id = str(uuid.uuid4())
                hash = helper.generate_text_hash(child_document_node.page_content)
                child_document = ChildDocument(
                    page_content=child_document_node.page_content, metadata=document_node.metadata
                )
                child_document.metadata["doc_id"] = doc_id
                child_document.metadata["doc_hash"] = hash
                child_page_content = child_document.page_content
                if child_page_content.startswith(".") or child_page_content.startswith("。"):
                    child_page_content = child_page_content[1:].strip()
                if len(child_page_content) > 0:
                    child_document.page_content = child_page_content
                    child_nodes.append(child_document)
        return child_nodes
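Note (illustrative, not part of the commit): the parent-child processor splits each document into parent chunks and, per parent, into child chunks; load embeds only the child chunks into the vector store, and retrieval later maps child hits back to their parent segments via format_retrieval_documents. A hedged sketch of the transform/load flow; the process_rule field values and the "paragraph" parent_mode string are assumptions for illustration, not taken from the diff:

# Hedged sketch; rule values, parent_mode string, and variable names are illustrative.
processor = ParentChildIndexProcessor()
parent_docs = processor.transform(
    extracted_docs,
    process_rule={
        "mode": "hierarchical",
        "rules": {
            "parent_mode": "paragraph",  # assumed string value for ParentMode.PARAGRAPH
            "segmentation": {"max_tokens": 1024, "chunk_overlap": 0, "separator": "\n\n"},
            "subchunk_segmentation": {"max_tokens": 256, "chunk_overlap": 0, "separator": "\n"},
        },
    },
    embedding_model_instance=embedding_model_instance,
)
processor.load(dataset, parent_docs)  # embeds each parent's children into the vector index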
@@ -21,18 +21,32 @@ from core.rag.models.document import Document
from core.tools.utils.text_processing_utils import remove_leading_symbols
from libs import helper
from models.dataset import Dataset
from services.entities.knowledge_entities.knowledge_entities import Rule


class QAIndexProcessor(BaseIndexProcessor):
    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
        text_docs = ExtractProcessor.extract(
            extract_setting=extract_setting, is_automatic=kwargs.get("process_rule_mode") == "automatic"
            extract_setting=extract_setting,
            is_automatic=(
                kwargs.get("process_rule_mode") == "automatic" or kwargs.get("process_rule_mode") == "hierarchical"
            ),
        )
        return text_docs

    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
        preview = kwargs.get("preview")
        process_rule = kwargs.get("process_rule")
        if not process_rule:
            raise ValueError("No process rule found.")
        if not process_rule.get("rules"):
            raise ValueError("No rules found in process rule.")
        rules = Rule(**process_rule.get("rules"))
        splitter = self._get_splitter(
            processing_rule=kwargs.get("process_rule") or {},
            processing_rule_mode=process_rule.get("mode"),
            max_tokens=rules.segmentation.max_tokens if rules.segmentation else 0,
            chunk_overlap=rules.segmentation.chunk_overlap if rules.segmentation else 0,
            separator=rules.segmentation.separator if rules.segmentation else "",
            embedding_model_instance=kwargs.get("embedding_model_instance"),
        )

@@ -59,24 +73,33 @@ class QAIndexProcessor(BaseIndexProcessor):
                    document_node.page_content = remove_leading_symbols(page_content)
                    split_documents.append(document_node)
            all_documents.extend(split_documents)
        for i in range(0, len(all_documents), 10):
            threads = []
            sub_documents = all_documents[i : i + 10]
            for doc in sub_documents:
                document_format_thread = threading.Thread(
                    target=self._format_qa_document,
                    kwargs={
                        "flask_app": current_app._get_current_object(),  # type: ignore
                        "tenant_id": kwargs.get("tenant_id"),
                        "document_node": doc,
                        "all_qa_documents": all_qa_documents,
                        "document_language": kwargs.get("doc_language", "English"),
                    },
                )
                threads.append(document_format_thread)
                document_format_thread.start()
            for thread in threads:
                thread.join()
        if preview:
            self._format_qa_document(
                current_app._get_current_object(),  # type: ignore
                kwargs.get("tenant_id"),  # type: ignore
                all_documents[0],
                all_qa_documents,
                kwargs.get("doc_language", "English"),
            )
        else:
            for i in range(0, len(all_documents), 10):
                threads = []
                sub_documents = all_documents[i : i + 10]
                for doc in sub_documents:
                    document_format_thread = threading.Thread(
                        target=self._format_qa_document,
                        kwargs={
                            "flask_app": current_app._get_current_object(),  # type: ignore
                            "tenant_id": kwargs.get("tenant_id"),  # type: ignore
                            "document_node": doc,
                            "all_qa_documents": all_qa_documents,
                            "document_language": kwargs.get("doc_language", "English"),
                        },
                    )
                    threads.append(document_format_thread)
                    document_format_thread.start()
                for thread in threads:
                    thread.join()
        return all_qa_documents

    def format_by_template(self, file: FileStorage, **kwargs) -> list[Document]:
@@ -98,12 +121,12 @@ class QAIndexProcessor(BaseIndexProcessor):
            raise ValueError(str(e))
        return text_docs

    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            vector.create(documents)

    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        vector = Vector(dataset)
        if node_ids:
            vector.delete_by_ids(node_ids)
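Note (illustrative, not part of the commit): transform now honors a preview flag; in preview mode only the first split chunk is converted to Q&A synchronously, while a full run keeps the existing 10-document thread batches. A hedged call sketch with illustrative argument values:

# Hedged sketch; tenant_id, doc_language and the surrounding variables are illustrative.
qa_processor = QAIndexProcessor()
preview_docs = qa_processor.transform(
    documents,
    preview=True,               # only the first chunk goes through _format_qa_document
    process_rule=process_rule,
    tenant_id="tenant-123",
    doc_language="English",
)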
@@ -2,7 +2,20 @@ from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel, Field
from pydantic import BaseModel


class ChildDocument(BaseModel):
    """Class for storing a piece of text and associated metadata."""

    page_content: str

    vector: Optional[list[float]] = None

    """Arbitrary metadata about the page content (e.g., source, relationships to other
        documents, etc.).
    """
    metadata: dict = {}


class Document(BaseModel):
@@ -15,10 +28,12 @@ class Document(BaseModel):
    """Arbitrary metadata about the page content (e.g., source, relationships to other
        documents, etc.).
    """
    metadata: Optional[dict] = Field(default_factory=dict)
    metadata: dict = {}

    provider: Optional[str] = "dify"

    children: Optional[list[ChildDocument]] = None


class BaseDocumentTransformer(ABC):
    """Abstract base class for document transformation systems.
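Note (illustrative, not part of the commit): Document now carries optional children, and ChildDocument mirrors its shape (plus an optional vector), which is what lets the index processors attach child chunks directly to their parent. A small construction sketch with illustrative metadata values:

# Hedged sketch; metadata values are illustrative.
child = ChildDocument(page_content="a small child chunk", metadata={"doc_id": "child-1"})
parent = Document(
    page_content="the full parent chunk",
    metadata={"doc_id": "parent-1", "doc_hash": "abc123"},
    children=[child],
)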
@@ -164,43 +164,29 @@ class DatasetRetrieval:
                        "content": item.page_content,
                    }
                    retrieval_resource_list.append(source)
        document_score_list = {}
        # deal with dify documents
        if dify_documents:
            for item in dify_documents:
                if item.metadata.get("score"):
                    document_score_list[item.metadata["doc_id"]] = item.metadata["score"]

            index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
            segments = DocumentSegment.query.filter(
                DocumentSegment.dataset_id.in_(dataset_ids),
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
                DocumentSegment.index_node_id.in_(index_node_ids),
            ).all()

            if segments:
                index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
                sorted_segments = sorted(
                    segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
                )
                for segment in sorted_segments:
            records = RetrievalService.format_retrieval_documents(dify_documents)
            if records:
                for record in records:
                    segment = record.segment
                    if segment.answer:
                        document_context_list.append(
                            DocumentContext(
                                content=f"question:{segment.get_sign_content()} answer:{segment.answer}",
                                score=document_score_list.get(segment.index_node_id, None),
                                score=record.score,
                            )
                        )
                    else:
                        document_context_list.append(
                            DocumentContext(
                                content=segment.get_sign_content(),
                                score=document_score_list.get(segment.index_node_id, None),
                                score=record.score,
                            )
                        )
                if show_retrieve_source:
                    for segment in sorted_segments:
                    for record in records:
                        segment = record.segment
                        dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
                        document = DatasetDocument.query.filter(
                            DatasetDocument.id == segment.document_id,
@@ -216,7 +202,7 @@ class DatasetRetrieval:
                            "data_source_type": document.data_source_type,
                            "segment_id": segment.id,
                            "retriever_from": invoke_from.to_source(),
                            "score": document_score_list.get(segment.index_node_id, 0.0),
                            "score": record.score or 0.0,
                        }

                        if invoke_from.to_source() == "dev":