Merge branch 'feat/mcp-06-18' into deploy/dev

This commit is contained in:
Novice
2025-10-14 21:40:29 +08:00
187 changed files with 9526 additions and 3521 deletions

View File

@@ -1,9 +1,24 @@
"""
Weaviate vector database implementation for Dify's RAG system.
This module provides integration with Weaviate vector database for storing and retrieving
document embeddings used in retrieval-augmented generation workflows.
"""
import datetime
import json
import logging
import uuid as _uuid
from typing import Any
from urllib.parse import urlparse
import weaviate # type: ignore
import weaviate
import weaviate.classes.config as wc
from pydantic import BaseModel, model_validator
from weaviate.classes.data import DataObject
from weaviate.classes.init import Auth
from weaviate.classes.query import Filter, MetadataQuery
from weaviate.exceptions import UnexpectedStatusCodeError
from configs import dify_config
from core.rag.datasource.vdb.field import Field
@@ -15,265 +30,394 @@ from core.rag.models.document import Document
from extensions.ext_redis import redis_client
from models.dataset import Dataset
logger = logging.getLogger(__name__)
class WeaviateConfig(BaseModel):
    """
    Configuration model for Weaviate connection settings.

    Attributes:
        endpoint: Weaviate server endpoint URL
        api_key: Optional API key for authentication
        batch_size: Number of objects to batch per insert operation
    """

    endpoint: str
    api_key: str | None = None
    batch_size: int = 100

    @model_validator(mode="before")
    @classmethod
    def validate_config(cls, values: dict) -> dict:
        """Validates that required configuration values are present.

        Raises:
            ValueError: if the endpoint is missing or empty.
        """
        # .get() (rather than values["endpoint"]) guarantees the intended
        # ValueError is raised even when the key is absent entirely.
        if not values.get("endpoint"):
            raise ValueError("config WEAVIATE_ENDPOINT is required")
        return values
class WeaviateVector(BaseVector):
    """
    Weaviate-backed vector store for Dify documents.

    Wraps a single Weaviate collection and supports creating, inserting,
    deleting and querying document embeddings.
    """

    def __init__(self, collection_name: str, config: WeaviateConfig, attributes: list):
        """
        Set up the store: remember the metadata attributes to persist and
        open a client connection using the supplied configuration.

        Args:
            collection_name: Name of the Weaviate collection.
            config: Connection settings for the Weaviate server.
            attributes: Metadata attribute names stored alongside each text.
        """
        super().__init__(collection_name)
        self._attributes = attributes
        self._client = self._init_client(config)
def _init_client(self, config: WeaviateConfig) -> weaviate.WeaviateClient:
    """
    Initializes and returns a connected Weaviate v4 client.

    Derives host/port/TLS settings from the configured endpoint URL and
    configures both the HTTP and gRPC channels, with API-key auth when given.

    Raises:
        ConnectionError: if the server cannot be reached or is not ready.
    """
    p = urlparse(config.endpoint)
    # Fall back to stripping the scheme by hand when the endpoint is not a
    # URL that urlparse can split into components.
    host = p.hostname or config.endpoint.replace("https://", "").replace("http://", "")
    http_secure = p.scheme == "https"
    http_port = p.port or (443 if http_secure else 80)

    # gRPC shares the host and TLS mode with HTTP; 50051 is Weaviate's
    # default plaintext gRPC port.
    grpc_host = host
    grpc_secure = http_secure
    grpc_port = 443 if grpc_secure else 50051

    try:
        client = weaviate.connect_to_custom(
            http_host=host,
            http_port=http_port,
            http_secure=http_secure,
            grpc_host=grpc_host,
            grpc_port=grpc_port,
            grpc_secure=grpc_secure,
            auth_credentials=Auth.api_key(config.api_key) if config.api_key else None,
        )
    except Exception as exc:
        # Normalize any driver-level failure into a single error type callers expect.
        raise ConnectionError("Vector database connection error") from exc

    if not client.is_ready():
        raise ConnectionError("Vector database is not ready")
    return client
def get_type(self) -> str:
    """Returns the vector database type identifier (``VectorType.WEAVIATE``)."""
    return VectorType.WEAVIATE
def get_collection_name(self, dataset: Dataset) -> str:
    """
    Resolve the Weaviate collection name for a dataset.

    Prefers the class prefix stored in the dataset's index structure
    (normalized to end with "_Node"); otherwise derives a fresh name
    from the dataset id.
    """
    if not dataset.index_struct_dict:
        return Dataset.gen_collection_name_by_id(dataset.id)
    prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
    # Legacy index structures stored the prefix without the "_Node" suffix.
    return prefix if prefix.endswith("_Node") else prefix + "_Node"
def to_index_struct(self) -> dict:
    """Return the serializable index-structure descriptor persisted on the dataset."""
    return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}}
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
    """
    Create the backing collection (if needed) and insert the initial
    batch of documents together with their embedding vectors.
    """
    self._create_collection()
    self.add_texts(texts, embeddings)
def _create_collection(self):
    """
    Create the Weaviate collection with the required schema if it does
    not exist yet.

    A Redis lock serializes concurrent creation attempts, and a cache key
    (TTL one hour) short-circuits repeated existence checks.
    """
    lock_name = f"vector_indexing_lock_{self._collection_name}"
    with redis_client.lock(lock_name, timeout=20):
        cache_key = f"vector_indexing_{self._collection_name}"
        if redis_client.get(cache_key):
            return
        try:
            if not self._client.collections.exists(self._collection_name):
                self._client.collections.create(
                    name=self._collection_name,
                    properties=[
                        wc.Property(
                            name=Field.TEXT_KEY.value,
                            data_type=wc.DataType.TEXT,
                            tokenization=wc.Tokenization.WORD,
                        ),
                        wc.Property(name="document_id", data_type=wc.DataType.TEXT),
                        wc.Property(name="doc_id", data_type=wc.DataType.TEXT),
                        wc.Property(name="chunk_index", data_type=wc.DataType.INT),
                    ],
                    # Vectors are supplied by Dify's embedding models, not computed by Weaviate.
                    vector_config=wc.Configure.Vectors.self_provided(),
                )
            # Older collections may predate some properties; add any that are missing.
            self._ensure_properties()
            redis_client.set(cache_key, 1, ex=3600)
        except Exception:
            logger.exception("Error creating collection %s", self._collection_name)
            raise
def _ensure_properties(self) -> None:
    """
    Make sure every property this store writes exists on the collection.

    No-op when the collection is absent; otherwise each property missing
    from the live schema is added individually (failures are logged as
    warnings, never raised).
    """
    if not self._client.collections.exists(self._collection_name):
        return
    collection = self._client.collections.use(self._collection_name)
    schema = collection.config.get()
    present = {prop.name for prop in (schema.properties or [])}
    required = (
        ("document_id", wc.DataType.TEXT),
        ("doc_id", wc.DataType.TEXT),
        ("chunk_index", wc.DataType.INT),
    )
    for name, data_type in required:
        if name in present:
            continue
        try:
            collection.config.add_property(wc.Property(name=name, data_type=data_type))
        except Exception as e:
            logger.warning("Could not add property %s: %s", name, e)
def _get_uuids(self, documents: list[Document]) -> list[str]:
"""
Generates deterministic UUIDs for documents based on their content.
Uses UUID5 with URL namespace to ensure consistent IDs for identical content.
"""
URL_NAMESPACE = _uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8")
uuids = []
for doc in documents:
uuid_val = _uuid.uuid5(URL_NAMESPACE, doc.page_content)
uuids.append(str(uuid_val))
return uuids
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
    """
    Insert documents and their embedding vectors into the collection.

    Each object stores the page content under the text field plus any
    JSON-serializable metadata. Object ids come from the deterministic
    content UUIDs when they are valid UUIDs, otherwise a random uuid4.

    Returns:
        The list of object ids actually written, in input order.
    """
    uuids = self._get_uuids(documents)
    texts = [d.page_content for d in documents]
    metadatas = [d.metadata for d in documents]

    col = self._client.collections.use(self._collection_name)
    objs: list[DataObject] = []
    ids_out: list[str] = []
    for i, text in enumerate(texts):
        props: dict[str, Any] = {Field.TEXT_KEY.value: text}
        # Metadata may be None for a document; serialize values (e.g. datetimes).
        for k, v in (metadatas[i] or {}).items():
            props[k] = self._json_serializable(v)

        candidate = uuids[i] if uuids else None
        uid = candidate if (candidate and self._is_uuid(candidate)) else str(_uuid.uuid4())
        ids_out.append(uid)

        # Named-vector payload; "default" matches the collection's vector config.
        vec_payload = None
        if embeddings and i < len(embeddings) and embeddings[i]:
            vec_payload = {"default": embeddings[i]}

        objs.append(
            DataObject(
                uuid=uid,
                properties=props,  # type: ignore[arg-type] # mypy incorrectly infers DataObject signature
                vector=vec_payload,
            )
        )

    # Dynamic batching lets the client size requests from server feedback.
    with col.batch.dynamic() as batch:
        for obj in objs:
            batch.add_object(properties=obj.properties, uuid=obj.uuid, vector=obj.vector)
    return ids_out
def _is_uuid(self, val: str) -> bool:
"""Validates whether a string is a valid UUID format."""
try:
_uuid.UUID(str(val))
return True
except Exception:
return False
def delete_by_metadata_field(self, key: str, value: str) -> None:
    """Remove every object whose metadata property ``key`` equals ``value`` (no-op when the collection is absent)."""
    if not self._client.collections.exists(self._collection_name):
        return
    collection = self._client.collections.use(self._collection_name)
    collection.data.delete_many(where=Filter.by_property(key).equal(value))
def delete(self):
    """Drop the entire collection from Weaviate (no-op when it does not exist)."""
    if self._client.collections.exists(self._collection_name):
        self._client.collections.delete(self._collection_name)
def text_exists(self, id: str) -> bool:
    """Return True when an object with the given doc_id is stored in the collection."""
    if not self._client.collections.exists(self._collection_name):
        return False
    col = self._client.collections.use(self._collection_name)
    # limit=1: only existence matters, so fetch the smallest possible result.
    res = col.query.fetch_objects(
        filters=Filter.by_property("doc_id").equal(id),
        limit=1,
        return_properties=["doc_id"],
    )
    return len(res.objects) > 0
def delete_by_ids(self, ids: list[str]) -> None:
    """
    Delete objects by their UUID identifiers.

    404 errors for ids that no longer exist are tolerated; any other
    unexpected status error is re-raised.
    """
    if not self._client.collections.exists(self._collection_name):
        return
    col = self._client.collections.use(self._collection_name)
    for uid in ids:
        try:
            col.data.delete_by_id(uid)
        except UnexpectedStatusCodeError as e:
            # Only already-deleted objects are acceptable failures.
            if getattr(e, "status_code", None) != 404:
                raise
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
    """
    Vector-similarity search over the collection.

    Optional kwargs:
        document_ids_filter: restrict results to these document_id values.
        top_k: maximum number of hits (default 4).
        score_threshold: minimum similarity score to keep (default 0.0).

    Returns:
        Documents carrying a "score" metadata entry, sorted descending by score.
    """
    if not self._client.collections.exists(self._collection_name):
        return []
    col = self._client.collections.use(self._collection_name)
    props = list({*self._attributes, "document_id", Field.TEXT_KEY.value})

    # OR together the document-id filters, if any were requested.
    where = None
    doc_ids = kwargs.get("document_ids_filter") or []
    if doc_ids:
        ors = [Filter.by_property("document_id").equal(x) for x in doc_ids]
        where = ors[0]
        for f in ors[1:]:
            where = where | f

    top_k = int(kwargs.get("top_k", 4))
    score_threshold = float(kwargs.get("score_threshold") or 0.0)
    res = col.query.near_vector(
        near_vector=query_vector,
        limit=top_k,
        return_properties=props,
        return_metadata=MetadataQuery(distance=True),
        include_vector=False,
        filters=where,
        target_vector="default",
    )

    docs: list[Document] = []
    for obj in res.objects:
        properties = dict(obj.properties or {})
        text = properties.pop(Field.TEXT_KEY.value, "")
        # BUGFIX: the previous `(... ) or 1.0` clobbered a legitimate 0.0
        # distance (an exact match) into 1.0, zeroing its score. Only fall
        # back when the metadata distance is genuinely missing.
        distance = obj.metadata.distance if obj.metadata and obj.metadata.distance is not None else 1.0
        score = 1.0 - distance
        if score > score_threshold:
            properties["score"] = score
            docs.append(Document(page_content=text, metadata=properties))
    docs.sort(key=lambda d: d.metadata.get("score", 0.0), reverse=True)
    return docs
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
    """
    BM25 full-text search over the stored page content.

    Optional kwargs:
        document_ids_filter: restrict results to these document_id values.
        top_k: maximum number of hits (default 4).

    Returns:
        Matching documents, each carrying its stored vector.
    """
    if not self._client.collections.exists(self._collection_name):
        return []
    col = self._client.collections.use(self._collection_name)
    props = list({*self._attributes, Field.TEXT_KEY.value})

    # OR together the document-id filters, if any were requested.
    where = None
    doc_ids = kwargs.get("document_ids_filter") or []
    if doc_ids:
        ors = [Filter.by_property("document_id").equal(x) for x in doc_ids]
        where = ors[0]
        for f in ors[1:]:
            where = where | f

    top_k = int(kwargs.get("top_k", 4))
    res = col.query.bm25(
        query=query,
        query_properties=[Field.TEXT_KEY.value],
        limit=top_k,
        return_properties=props,
        include_vector=True,
        filters=where,
    )

    docs: list[Document] = []
    for obj in res.objects:
        properties = dict(obj.properties or {})
        text = properties.pop(Field.TEXT_KEY.value, "")
        vec = obj.vector
        # Named-vector collections return a dict; unwrap the "default"
        # vector, falling back to the first named vector present.
        if isinstance(vec, dict):
            vec = vec.get("default") or next(iter(vec.values()), None)
        docs.append(Document(page_content=text, vector=vec, metadata=properties))
    return docs
def _default_schema(self, index_name: str):
return {
"class": index_name,
"properties": [
{
"name": "text",
"dataType": ["text"],
}
],
}
def _json_serializable(self, value: Any):
def _json_serializable(self, value: Any) -> Any:
"""Converts values to JSON-serializable format, handling datetime objects."""
if isinstance(value, datetime.datetime):
return value.isoformat()
return value
class WeaviateVectorFactory(AbstractVectorFactory):
"""Factory class for creating WeaviateVector instances."""
def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> WeaviateVector:
"""
Initializes a WeaviateVector instance for the given dataset.
Uses existing collection name from dataset index structure or generates a new one.
Updates dataset index structure if not already set.
"""
if dataset.index_struct_dict:
class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
collection_name = class_prefix
@@ -281,7 +425,6 @@ class WeaviateVectorFactory(AbstractVectorFactory):
dataset_id = dataset.id
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.WEAVIATE, collection_name))
return WeaviateVector(
collection_name=collection_name,
config=WeaviateConfig(