mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-20 16:26:42 +08:00
### What problem does this PR solve? Currently, RAGFlow's Search and Chat interfaces display only raw vectorized text chunks during retrieval, without contextual information about their source documents. Users cannot see document titles, page numbers, upload dates, or custom metadata fields that would help them understand and trust the retrieved results. This PR introduces an **optional metadata display feature** that enriches retrieved chunks with document-level metadata in both the Search tab and Chatbot interface. **Key improvements:** - **Search results**: Display document metadata as styled badges beneath chunk snippets - **Chat citations**: Show metadata in citation popovers and reference lists for better source context - **LLM context**: Metadata is injected into the LLM prompt to enable more accurate, citation-aware responses - **External API support**: Applications using RAGFlow's SDK retrieval endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in via request parameters - **User control**: Multi-select dropdown UI allows users to choose which metadata fields to display **Implementation approach:** - ✅ Reuses existing `DocMetadataService` infrastructure (no new database tables or indices) - ✅ Settings stored in existing JSON configuration fields (`search_config.reference_metadata`, `prompt_config.reference_metadata`) - ✅ No database migrations required - ✅ Disabled by default (fully opt-in and backward-compatible) - ✅ Dynamic metadata field selection populated from actual document metadata keys - ✅ Fixed critical bug where Python's builtin `set()` was shadowed by a route handler function **Modified endpoints (all backward-compatible):** - `POST /v1/retrieval` (Public SDK) - `POST /v1/searchbots/retrieval_test` (Searchbots) - `POST /v1/chunk/retrieval_test` (UI/Internal) - Chat completions endpoints (via `extra_body.reference_metadata` or `prompt_config`) ### Type of change - [x] New Feature (non-breaking change which adds functionality) 
### Images - <img width="879" height="1275" alt="image" src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb" /> <br><br> <br><br> <img width="1532" height="362" alt="image" src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e" /> <br><br> <br><br> <img width="2586" height="1320" alt="image" src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776" /> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com> Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
183 lines
6.4 KiB
Python
183 lines
6.4 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from typing import Any
|
|
|
|
from .base import Base
|
|
from .document import Document
|
|
|
|
|
|
class DataSet(Base):
    """Client-side handle for one dataset exposed under ``/datasets/{id}``.

    Every method issues an HTTP request through the verb helpers inherited
    from ``Base`` (``get``/``post``/``put``/``rm``) and expects a JSON body of
    the form ``{"code": int, "message": str, "data": ...}``. A non-zero
    ``code`` is surfaced as a plain ``Exception`` carrying the server message.
    """

    class ParserConfig(Base):
        """Thin wrapper around a parser-configuration payload."""

        def __init__(self, rag, res_dict):
            super().__init__(rag, res_dict)

    def __init__(self, rag, res_dict):
        """Initialise the dataset from a server response dictionary.

        Args:
            rag: Client object forwarded to ``Base.__init__``.
            res_dict: Attribute values for this dataset. Keys that do not
                match one of the defaults declared below are removed from
                ``res_dict`` **in place** before it is handed to ``Base``.
        """
        # The defaults double as the whitelist of accepted response keys.
        self.id = ""
        self.name = ""
        self.avatar = ""
        self.tenant_id = None
        self.description = ""
        self.embedding_model = ""
        self.permission = "me"
        self.document_count = 0
        self.chunk_count = 0
        self.chunk_method = "naive"
        self.parser_config = None
        self.pagerank = 0
        # Iterate over a snapshot of the keys because the dict is mutated.
        for k in list(res_dict.keys()):
            if k not in self.__dict__:
                res_dict.pop(k)
        super().__init__(rag, res_dict)

    def update(self, update_message: dict):
        """Send ``update_message`` to the server and refresh this object.

        Args:
            update_message: Fields to change, sent as the PUT payload.

        Returns:
            ``self``, refreshed from the ``data`` field of the response.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.put(f"/datasets/{self.id}", update_message)
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res["message"])

        self._update_from_dict(self.rag, res.get("data", {}))
        return self

    def upload_documents(self, document_list: list[dict]):
        """Upload files into this dataset.

        Args:
            document_list: One dict per file; each must provide
                ``"display_name"`` and ``"blob"``.

        Returns:
            A list of ``Document`` objects built from the response ``data``.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        url = f"/datasets/{self.id}/documents"
        # Every part is sent under the same multipart field name, "file".
        files = [("file", (ele["display_name"], ele["blob"])) for ele in document_list]
        res = self.post(path=url, json=None, files=files)
        res = res.json()
        if res.get("code") == 0:
            return [Document(self.rag, doc) for doc in res["data"]]
        raise Exception(res.get("message"))

    def list_documents(
        self,
        id: str | None = None,
        ids: list[str] | None = None,
        name: str | None = None,
        keywords: str | None = None,
        page: int = 1,
        page_size: int = 30,
        orderby: str = "create_time",
        desc: bool = True,
        create_time_from: int = 0,
        create_time_to: int = 0,
    ):
        """List documents of this dataset, optionally filtered and paginated.

        Args:
            id: Single document id to filter on. Mutually exclusive with ``ids``.
            ids: Several document ids to filter on. Mutually exclusive with ``id``.
            name: Value sent as the ``name`` query parameter.
            keywords: Value sent as the ``keywords`` query parameter.
            page: Value sent as the ``page`` query parameter.
            page_size: Value sent as the ``page_size`` query parameter.
            orderby: Value sent as the ``orderby`` query parameter.
            desc: Value sent as the ``desc`` query parameter.
            create_time_from: Value sent as ``create_time_from``.
            create_time_to: Value sent as ``create_time_to``.

        Returns:
            A list of ``Document`` objects built from ``data["docs"]``.

        Raises:
            ValueError: If both ``id`` and ``ids`` are supplied.
            Exception: If the response ``code`` is not ``0``.
        """
        # Validate that id and ids are not used together
        if id and ids:
            raise ValueError("Cannot use both 'id' and 'ids' parameters at the same time.")

        params = {
            "id": id,
            "name": name,
            "keywords": keywords,
            "page": page,
            "page_size": page_size,
            "orderby": orderby,
            "desc": desc,
            "create_time_from": create_time_from,
            "create_time_to": create_time_to,
        }
        # Handle ids parameter - convert to multiple query params.
        # ``params`` is a dict, so the previous ``params.append(...)`` raised
        # AttributeError whenever ``ids`` was given. A list value is used
        # instead; this assumes the HTTP client behind ``Base.get`` expands
        # list values into repeated ``ids=...`` parameters (requests-style)
        # -- TODO confirm against Base.get.
        if ids:
            params["ids"] = list(ids)
        res = self.get(f"/datasets/{self.id}/documents", params=params)
        res = res.json()
        if res.get("code") == 0:
            return [Document(self.rag, document) for document in res["data"].get("docs")]
        raise Exception(res["message"])

    def delete_documents(self, ids: list[str] | None = None, delete_all: bool = False):
        """Delete documents from this dataset.

        Args:
            ids: Document ids sent as ``"ids"`` in the request payload.
            delete_all: Flag sent as ``"delete_all"`` in the request payload.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.rm(f"/datasets/{self.id}/documents", {"ids": ids, "delete_all": delete_all})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res["message"])

    def _get_documents_status(self, document_ids):
        """Poll the given documents until each one has finished.

        A document counts as finished when its ``run`` attribute is one of
        ``DONE``/``FAIL``/``CANCEL`` (case-insensitive) or when its
        ``progress`` is at least ``1.0``.

        Args:
            document_ids: Iterable of document ids to wait for.

        Returns:
            A list of ``(doc_id, run, chunk_count, token_count)`` tuples in
            completion order. ``run`` is reported as ``"DONE"`` for documents
            that finished via the progress check.
        """
        import time

        terminal_states = {"DONE", "FAIL", "CANCEL"}
        interval_sec = 1
        pending = set(document_ids)
        finished = []

        def fetch_doc(doc_id: str) -> Document | None:
            # Best effort: a failed or empty lookup is reported as None so the
            # caller simply retries on the next polling round.
            # NOTE(review): a document that can never be fetched keeps the
            # polling loop below running indefinitely.
            try:
                docs = self.list_documents(id=doc_id)
                return docs[0] if docs else None
            except Exception:
                return None

        while pending:
            # Snapshot, because ids are discarded from ``pending`` while iterating.
            for doc_id in list(pending):
                doc = fetch_doc(doc_id)
                if doc is None:
                    continue
                if isinstance(doc.run, str) and doc.run.upper() in terminal_states:
                    finished.append((doc_id, doc.run, doc.chunk_count, doc.token_count))
                    pending.discard(doc_id)
                elif float(doc.progress or 0.0) >= 1.0:
                    finished.append((doc_id, "DONE", doc.chunk_count, doc.token_count))
                    pending.discard(doc_id)
            if pending:
                time.sleep(interval_sec)
        return finished

    def async_parse_documents(self, document_ids):
        """Request parsing of the given documents without waiting for it.

        Args:
            document_ids: Ids sent as ``"document_ids"`` in the POST payload.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.post(f"/datasets/{self.id}/chunks", {"document_ids": document_ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))

    def parse_documents(self, document_ids):
        """Start parsing the given documents and block until all have finished.

        If a ``KeyboardInterrupt`` arrives while starting or waiting, a
        cancellation request is sent and the method then waits for the
        documents to reach a finished state.

        Args:
            document_ids: Ids of the documents to parse.

        Returns:
            The list of ``(doc_id, run, chunk_count, token_count)`` tuples
            produced by ``_get_documents_status``.
        """
        try:
            self.async_parse_documents(document_ids)
            # Return this result directly; polling every document a second
            # time after all of them already finished only adds requests.
            return self._get_documents_status(document_ids)
        except KeyboardInterrupt:
            self.async_cancel_parse_documents(document_ids)

        return self._get_documents_status(document_ids)

    def async_cancel_parse_documents(self, document_ids):
        """Request cancellation of parsing for the given documents.

        Args:
            document_ids: Ids sent as ``"document_ids"`` in the request payload.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.rm(f"/datasets/{self.id}/chunks", {"document_ids": document_ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))

    def get_auto_metadata(self) -> dict[str, Any]:
        """
        Retrieve auto-metadata configuration for a dataset via SDK.

        Returns:
            The ``data`` field of the response.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.get(f"/datasets/{self.id}/metadata/config")
        res = res.json()
        if res.get("code") == 0:
            return res["data"]
        raise Exception(res["message"])

    def update_auto_metadata(self, **config: Any) -> dict[str, Any]:
        """
        Update auto-metadata configuration for a dataset via SDK.

        Args:
            **config: Configuration entries, sent unchanged as the PUT payload.

        Returns:
            The ``data`` field of the response.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.put(f"/datasets/{self.id}/metadata/config", config)
        res = res.json()
        if res.get("code") == 0:
            return res["data"]
        raise Exception(res["message"])
|