fix(rag): include is_summary and original_chunk_id in default vector projection (#34950)

Co-authored-by: VFootball Dev <vfootball@example.com>
This commit is contained in:
XHamzaX
2026-04-13 04:11:08 +01:00
committed by GitHub
parent b7b03f8594
commit 596559efc9
2 changed files with 29 additions and 2 deletions

View File

@@ -41,7 +41,23 @@ class AbstractVectorFactory(ABC):
class Vector:
def __init__(self, dataset: Dataset, attributes: list | None = None):
if attributes is None:
attributes = ["doc_id", "dataset_id", "document_id", "doc_hash", "doc_type"]
# `is_summary` and `original_chunk_id` are stored on summary vectors
# by `SummaryIndexService` and read back by `RetrievalService` to
# route summary hits through their original parent chunks. They
# must be listed here so vector backends that use this list as an
# explicit return-properties projection (notably Weaviate) actually
# return those fields; without them, summary hits silently
# collapse into `is_summary = False` branches and the summary
# retrieval path is a no-op. See #34884.
attributes = [
"doc_id",
"dataset_id",
"document_id",
"doc_hash",
"doc_type",
"is_summary",
"original_chunk_id",
]
self._dataset = dataset
self._embeddings = self._get_embeddings()
self._attributes = attributes

View File

@@ -121,7 +121,18 @@ def test_vector_init_uses_default_and_custom_attributes(vector_factory_module):
default_vector = vector_factory_module.Vector(dataset)
custom_vector = vector_factory_module.Vector(dataset, attributes=["doc_id"])
assert default_vector._attributes == ["doc_id", "dataset_id", "document_id", "doc_hash", "doc_type"]
# `is_summary` and `original_chunk_id` must be in the default return-properties
# projection so summary index retrieval works on backends that honor the list
# as an explicit projection (e.g. Weaviate). See #34884.
assert default_vector._attributes == [
"doc_id",
"dataset_id",
"document_id",
"doc_hash",
"doc_type",
"is_summary",
"original_chunk_id",
]
assert custom_vector._attributes == ["doc_id"]
assert default_vector._embeddings == "embeddings"
assert default_vector._vector_processor == "processor"