fix: skip empty documents before vector embedding (#35763)

Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-05-06 10:28:10 +08:00 · 2026-05-04 18:36:58 +05:30
parent 81090effe2
commit 4b7dc17546
2 changed files with 87 additions and 0 deletions
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@@ -144,8 +144,20 @@ class Vector:
    def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]:
        return get_vector_factory_class(vector_type)

+    @staticmethod
+    def _filter_empty_text_documents(documents: list[Document]) -> list[Document]:
+        filtered_documents = [document for document in documents if document.page_content.strip()]
+        skipped_count = len(documents) - len(filtered_documents)
+        if skipped_count:
+            logger.warning("skip %d empty documents before vector embedding", skipped_count)
+        return filtered_documents
+
    def create(self, texts: list | None = None, **kwargs):
        if texts:
+            texts = self._filter_empty_text_documents(texts)
+            if not texts:
+                return
+
            start = time.time()
            logger.info("start embedding %s texts %s", len(texts), start)
            batch_size = 1000
@@ -203,8 +215,14 @@ class Vector:
            logger.info("Embedding %s files took %s s", len(file_documents), time.time() - start)

    def add_texts(self, documents: list[Document], **kwargs):
+        documents = self._filter_empty_text_documents(documents)
+        if not documents:
+            return
+
        if kwargs.get("duplicate_check", False):
            documents = self._filter_duplicate_texts(documents)
+            if not documents:
+                return

        embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
        self._vector_processor.create(texts=documents, embeddings=embeddings, **kwargs)