Merge branch 'main' into feat/workflow

# Conflicts:
#	docker/docker-compose.yaml
2026-05-04 01:18:05 +08:00 · 2024-03-29 21:18:16 +08:00
parent de3b7e8815 1387f9b23e
commit a30a6dda63
40 changed files with 876 additions and 141 deletions
--- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py
+++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
@@ -144,6 +144,16 @@ class MilvusVector(BaseVector):
            utility.drop_collection(self._collection_name, None, using=alias)

    def text_exists(self, id: str) -> bool:
+        alias = uuid4().hex
+        if self._client_config.secure:
+            uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+        else:
+            uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+        connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password)
+
+        from pymilvus import utility
+        if not utility.has_collection(self._collection_name, using=alias):
+            return False

        result = self._client.query(collection_name=self._collection_name,
                                    filter=f'metadata["doc_id"] == "{id}"',
--- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
+++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
@@ -275,6 +275,13 @@ class QdrantVector(BaseVector):
            )

    def text_exists(self, id: str) -> bool:
+        all_collection_name = []
+        collections_response = self._client.get_collections()
+        collection_list = collections_response.collections
+        for collection in collection_list:
+            all_collection_name.append(collection.name)
+        if self._collection_name not in all_collection_name:
+            return False
        response = self._client.retrieve(
            collection_name=self._collection_name,
            ids=[id]
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@@ -128,8 +128,8 @@ class Vector:
        if kwargs.get('duplicate_check', False):
            documents = self._filter_duplicate_texts(documents)
        embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
-        self._vector_processor.add_texts(
-            documents=documents,
+        self._vector_processor.create(
+            texts=documents,
            embeddings=embeddings,
            **kwargs
        )
--- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
+++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
@@ -134,6 +134,11 @@ class WeaviateVector(BaseVector):

    def text_exists(self, id: str) -> bool:
        collection_name = self._collection_name
+        schema = self._default_schema(self._collection_name)
+
+        # check whether the index already exists
+        if not self._client.schema.contains(schema):
+            return False
        result = self._client.query.get(collection_name).with_additional(["id"]).with_where({
            "path": ["doc_id"],
            "operator": "Equal",
--- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
@@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
            elements = partition_docx(filename=self._file_path)

        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
            pass

        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):

        elements = partition_md(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):

        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):

        elements = partition_text(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):

        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()