fix: skip empty documents before vector embedding (#35763)

Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Prince Pal
2026-05-04 18:36:58 +05:30
committed by GitHub
parent 81090effe2
commit 4b7dc17546
2 changed files with 87 additions and 0 deletions

View File

@ -144,8 +144,20 @@ class Vector:
def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]:
return get_vector_factory_class(vector_type)
@staticmethod
def _filter_empty_text_documents(documents: list[Document]) -> list[Document]:
filtered_documents = [document for document in documents if document.page_content.strip()]
skipped_count = len(documents) - len(filtered_documents)
if skipped_count:
logger.warning("skip %d empty documents before vector embedding", skipped_count)
return filtered_documents
def create(self, texts: list | None = None, **kwargs):
if texts:
texts = self._filter_empty_text_documents(texts)
if not texts:
return
start = time.time()
logger.info("start embedding %s texts %s", len(texts), start)
batch_size = 1000
@ -203,8 +215,14 @@ class Vector:
logger.info("Embedding %s files took %s s", len(file_documents), time.time() - start)
def add_texts(self, documents: list[Document], **kwargs):
documents = self._filter_empty_text_documents(documents)
if not documents:
return
if kwargs.get("duplicate_check", False):
documents = self._filter_duplicate_texts(documents)
if not documents:
return
embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
self._vector_processor.create(texts=documents, embeddings=embeddings, **kwargs)