feat: enterprise OTEL telemetry exporter (squash merge from feat/otel-telemetry-ee)

2026-05-05 09:58:04 +08:00 · 2026-03-15 21:21:45 -07:00
parent bdbec77c54
commit 45c28905f2
394 changed files with 14287 additions and 3929 deletions
--- a/api/tests/integration_tests/vdb/__mock/hologres.py
+++ b/api/tests/integration_tests/vdb/__mock/hologres.py
@ -0,0 +1,209 @@
+import json
+import os
+from typing import Any
+
+import holo_search_sdk as holo
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+from psycopg import sql as psql
+
+# Shared in-memory storage: {table_name: {doc_id: {"id", "text", "meta", "embedding"}}}
+# Module-level so that every MockTable / MockHologresClient instance sees the same data.
+# Rows are written by MockTable.upsert_multi (one dict per row, keyed by the column names
+# passed in) and the whole store is emptied by the setup_hologres_mock fixture on teardown
+# when mock mode is enabled.
+_mock_tables: dict[str, dict[str, dict[str, Any]]] = {}
+
+
+class MockSearchQuery:
+    """Mock of the chainable query object returned by search_vector / search_text.
+
+    Rows are read from the module-level ``_mock_tables`` store. No similarity or
+    relevance is computed: every returned row carries a fixed distance (0.1) for
+    vector searches or a fixed score (0.9) for text searches.
+    """
+
+    def __init__(self, table_name: str, search_type: str):
+        self._table_name = table_name
+        # "vector" selects the vector row layout; any other value selects the full-text layout.
+        self._search_type = search_type
+        self._limit_val = 10
+        self._filter_sql = None
+
+    def select(self, columns):
+        """Accept and ignore a column selection; the row layout is fixed by the search type."""
+        return self
+
+    def limit(self, n):
+        """Set the maximum number of rows ``fetchall`` returns and return ``self`` for chaining."""
+        self._limit_val = n
+        return self
+
+    def where(self, filter_sql):
+        """Store the filter expression (a psycopg composable) and return ``self`` for chaining."""
+        self._filter_sql = filter_sql
+        return self
+
+    def _apply_filter(self, row: dict[str, Any]) -> bool:
+        """Return True when ``row`` passes the stored filter.
+
+        A row passes when no filter is set, when the filter carries no literals,
+        or when the row's ``meta["document_id"]`` equals one of the filter's literals.
+        """
+        if self._filter_sql is None:
+            return True
+
+        # Extract literals (the document IDs) from the filter SQL
+        # Filter format: meta->>'document_id' IN ('doc1', 'doc2')
+        literals = [v for t, v in _extract_identifiers_and_literals(self._filter_sql) if t == "literal"]
+        if not literals:
+            return True
+
+        # Get the document_id from the row's meta field (stored either as a JSON string or a dict)
+        meta = row.get("meta", "{}")
+        if isinstance(meta, str):
+            meta = json.loads(meta)
+        doc_id = meta.get("document_id")
+
+        return doc_id in literals
+
+    def fetchall(self):
+        """Return up to ``limit`` matching rows as tuples in the layout the caller expects.
+
+        The filter is applied BEFORE the limit, mirroring SQL ``WHERE ... LIMIT n``:
+        truncating first would drop matching rows that happen to be stored after
+        the first ``n`` rows of the table.
+        """
+        data = _mock_tables.get(self._table_name, {})
+        results = []
+        for row in data.values():
+            # Stop as soon as enough matching rows have been collected.
+            if len(results) >= self._limit_val:
+                break
+            if not self._apply_filter(row):
+                continue
+
+            if self._search_type == "vector":
+                # row format expected by _process_vector_results: (distance, id, text, meta)
+                results.append((0.1, row["id"], row["text"], row["meta"]))
+            else:
+                # row format expected by _process_full_text_results: (id, text, meta, embedding, score)
+                results.append((row["id"], row["text"], row["meta"], row.get("embedding", []), 0.9))
+        return results
+
+
+class MockTable:
+    """In-memory stand-in for the table handle that client.open_table() returns."""
+
+    def __init__(self, table_name: str):
+        self._table_name = table_name
+
+    def upsert_multi(self, index_column, values, column_names, update=True, update_columns=None):
+        """Insert or overwrite rows in the shared store, keyed by each row's "id" column.
+
+        Only ``values`` and ``column_names`` are used; the remaining arguments are
+        accepted for signature compatibility and ignored.
+        """
+        table_rows = _mock_tables.setdefault(self._table_name, {})
+        id_position = column_names.index("id")
+        for record in values:
+            table_rows[record[id_position]] = dict(zip(column_names, record))
+
+    def search_vector(self, vector, column, distance_method, output_name):
+        """Start a vector-search query; the arguments do not influence the mock results."""
+        return MockSearchQuery(self._table_name, "vector")
+
+    def search_text(self, column, expression, return_score=False, return_score_name="score", return_all_columns=False):
+        """Start a full-text query; the arguments do not influence the mock results."""
+        return MockSearchQuery(self._table_name, "text")
+
+    def set_vector_index(
+        self, column, distance_method, base_quantization_type, max_degree, ef_construction, use_reorder
+    ):
+        """No-op: the mock keeps no vector index."""
+        pass
+
+    def create_text_index(self, index_name, column, tokenizer):
+        """No-op: the mock keeps no text index."""
+        pass
+
+
+def _extract_sql_template(query) -> str:
+    """Return the SQL text of a psycopg ``SQL`` or ``Composed`` object.
+
+    ``SQL.format()`` splits its template at every placeholder, so a ``Composed``
+    holds several ``SQL`` fragments interleaved with identifiers and literals.
+    Returning only the first fragment would hide everything after the first
+    placeholder (e.g. ``id IN`` or ``meta->>``) from callers that dispatch on the
+    statement text, so all fragments are concatenated in order, including those
+    inside nested ``Composed`` parts. Identifier and literal values contribute
+    nothing. Any other input yields an empty string.
+    """
+    if isinstance(query, psql.SQL):
+        return query._obj
+    if isinstance(query, psql.Composed):
+        # Recurse so fragments inside nested Composed parts (e.g. join separators) are kept too.
+        return "".join(_extract_sql_template(part) for part in query)
+    return ""
+
+
+def _extract_identifiers_and_literals(query) -> list[Any]:
+    """Collect ``("ident", name)`` and ``("literal", value)`` pairs from a psycopg Composed.
+
+    Pairs appear in the order the parts occur in ``query``. For an identifier only
+    its first name component is reported ("" when it has none). Inside a nested
+    Composed only literals are collected. Anything that is not a Composed
+    yields an empty list.
+    """
+    if not isinstance(query, psql.Composed):
+        return []
+
+    found: list[Any] = []
+    for piece in query:
+        if isinstance(piece, psql.Composed):
+            # Nested Composed: produced by SQL(...).join(...), e.g. the value list of an IN clause.
+            found.extend(("literal", inner._obj) for inner in piece if isinstance(inner, psql.Literal))
+        elif isinstance(piece, psql.Literal):
+            found.append(("literal", piece._obj))
+        elif isinstance(piece, psql.Identifier):
+            names = piece._obj
+            found.append(("ident", names[0] if names else ""))
+    return found
+
+
+class MockHologresClient:
+    """Mock holo_search_sdk client that stores data in memory.
+
+    All state lives in the module-level ``_mock_tables`` store, so separate client
+    instances observe the same tables.
+    """
+
+    def connect(self):
+        """No-op: there is nothing to connect to."""
+        pass
+
+    def check_table_exist(self, table_name):
+        """Return True when ``table_name`` is present in the in-memory store."""
+        return table_name in _mock_tables
+
+    def open_table(self, table_name):
+        """Return a MockTable handle; the table does not need to exist yet."""
+        return MockTable(table_name)
+
+    @staticmethod
+    def _meta_value(row, key):
+        """Return ``row``'s metadata value for ``key``; meta may be a JSON string or a dict."""
+        meta = row.get("meta", "{}")
+        if isinstance(meta, str):
+            meta = json.loads(meta)
+        return meta.get(key)
+
+    def execute(self, query, fetch_result=False):
+        """Emulate the handful of SQL statements recognised by this mock.
+
+        The statement kind is detected from the SQL text, the target table is the
+        first identifier in ``query``, and arguments are its literals in order.
+
+        Returns:
+            ``None`` for CREATE TABLE and DELETE; a list of 1-tuples for the two
+            SELECT forms; otherwise ``[]`` when ``fetch_result`` is true, else ``None``.
+        """
+        template = _extract_sql_template(query)
+        params = _extract_identifiers_and_literals(query)
+        literals = [v for t, v in params if t == "literal"]
+
+        if "CREATE TABLE" in template.upper():
+            # Extract table name from first identifier
+            table_name = next((v for t, v in params if t == "ident"), "unknown")
+            _mock_tables.setdefault(table_name, {})
+            return None
+
+        table_name = next((v for t, v in params if t == "ident"), "")
+        # For the metadata statements the literals are (key, value); missing ones default to "".
+        key = literals[0] if len(literals) > 0 else ""
+        value = literals[1] if len(literals) > 1 else ""
+
+        if "SELECT 1" in template:
+            # text_exists: SELECT 1 FROM {table} WHERE id = {id} LIMIT 1
+            doc_id = literals[0] if literals else ""
+            data = _mock_tables.get(table_name, {})
+            return [(1,)] if doc_id in data else []
+
+        if "SELECT id" in template:
+            # get_ids_by_metadata_field: SELECT id FROM {table} WHERE meta->>{key} = {value}
+            data = _mock_tables.get(table_name, {})
+            return [(doc_id,) for doc_id, row in data.items() if self._meta_value(row, key) == value]
+
+        if "DELETE" in template.upper():
+            data = _mock_tables.get(table_name, {})
+            if "id IN" in template:
+                # delete_by_ids: every literal is an id to remove
+                for did in literals:
+                    data.pop(did, None)
+            elif "meta->>" in template:
+                # delete_by_metadata_field: collect first, then remove, to avoid mutating while iterating
+                to_remove = [doc_id for doc_id, row in data.items() if self._meta_value(row, key) == value]
+                for did in to_remove:
+                    data.pop(did, None)
+            return None
+
+        return [] if fetch_result else None
+
+    def drop_table(self, table_name):
+        """Remove ``table_name`` and all of its rows; a missing table is ignored."""
+        _mock_tables.pop(table_name, None)
+
+
+def mock_connect(**kwargs):
+    """Replacement for holo_search_sdk.connect() that returns a mock client.
+
+    All keyword arguments are accepted and ignored; a new MockHologresClient is
+    returned on every call.
+    """
+    return MockHologresClient()
+
+
+# Mock mode is on only when the MOCK_SWITCH environment variable equals "true"
+# (case-insensitive); when unset it defaults to off.
+MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
+
+
+@pytest.fixture
+def setup_hologres_mock(monkeypatch: MonkeyPatch):
+    """Swap holo_search_sdk.connect for the in-memory mock while a test runs.
+
+    When mock mode is off the fixture does nothing before or after the test.
+    When it is on, the patch is installed before the test, and afterwards the
+    in-memory tables are emptied and the patch is reverted.
+    """
+    if not MOCK:
+        # Real mode: leave the SDK untouched.
+        yield
+        return
+
+    monkeypatch.setattr(holo, "connect", mock_connect)
+    yield
+    _mock_tables.clear()
+    monkeypatch.undo()
--- a/api/tests/integration_tests/vdb/hologres/init.py
+++ b/api/tests/integration_tests/vdb/hologres/init.py
--- a/api/tests/integration_tests/vdb/hologres/test_hologres.py
+++ b/api/tests/integration_tests/vdb/hologres/test_hologres.py
@ -0,0 +1,149 @@
+import os
+import uuid
+from typing import cast
+
+from holo_search_sdk.types import BaseQuantizationType, DistanceType, TokenizerType
+
+from core.rag.datasource.vdb.hologres.hologres_vector import HologresVector, HologresVectorConfig
+from core.rag.models.document import Document
+from tests.integration_tests.vdb.__mock.hologres import setup_hologres_mock
+from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
+
+MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
+
+
+class HologresVectorTest(AbstractVectorTest):
+    """Hologres-specific run of the shared vector-store test sequence.
+
+    Connection and index settings are read from HOLOGRES_* environment variables,
+    each with a default. In mock mode (MOCK_SWITCH=true) extra document-id
+    filter tests run after the base sequence.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Hologres requires collection names to be lowercase
+        self.collection_name = self.collection_name.lower()
+        self.vector = HologresVector(
+            collection_name=self.collection_name,
+            config=HologresVectorConfig(
+                host=os.environ.get("HOLOGRES_HOST", "localhost"),
+                port=int(os.environ.get("HOLOGRES_PORT", "80")),
+                database=os.environ.get("HOLOGRES_DATABASE", "test_db"),
+                access_key_id=os.environ.get("HOLOGRES_ACCESS_KEY_ID", "test_key"),
+                access_key_secret=os.environ.get("HOLOGRES_ACCESS_KEY_SECRET", "test_secret"),
+                schema_name=os.environ.get("HOLOGRES_SCHEMA", "public"),
+                tokenizer=cast(TokenizerType, os.environ.get("HOLOGRES_TOKENIZER", "jieba")),
+                distance_method=cast(DistanceType, os.environ.get("HOLOGRES_DISTANCE_METHOD", "Cosine")),
+                base_quantization_type=cast(
+                    BaseQuantizationType, os.environ.get("HOLOGRES_BASE_QUANTIZATION_TYPE", "rabitq")
+                ),
+                max_degree=int(os.environ.get("HOLOGRES_MAX_DEGREE", "64")),
+                ef_construction=int(os.environ.get("HOLOGRES_EF_CONSTRUCTION", "400")),
+            ),
+        )
+
+    def search_by_full_text(self):
+        """Override: full-text index may not be immediately ready in real mode."""
+        hits_by_full_text = self.vector.search_by_full_text(query=get_example_text())
+        if MOCK:
+            # In mock mode, full-text search should return the document we inserted
+            assert len(hits_by_full_text) == 1
+            assert hits_by_full_text[0].metadata["doc_id"] == self.example_doc_id
+        else:
+            # In real mode, full-text index may need time to become active, so the hit
+            # count is not checked. `len(...) >= 0` could never fail; instead verify
+            # that the call completed and produced a list.
+            assert isinstance(hits_by_full_text, list)
+
+    def search_by_vector_with_filter(self):
+        """Test vector search with document_ids_filter."""
+        # Create another document with different document_id
+        other_doc_id = str(uuid.uuid4())
+        other_doc = Document(
+            page_content="other_text",
+            metadata={
+                "doc_id": other_doc_id,
+                "doc_hash": other_doc_id,
+                "document_id": other_doc_id,
+                "dataset_id": self.dataset_id,
+            },
+        )
+        self.vector.add_texts(documents=[other_doc], embeddings=[self.example_embedding])
+
+        # Search with filter - should only return the original document
+        hits = self.vector.search_by_vector(
+            query_vector=self.example_embedding,
+            document_ids_filter=[self.example_doc_id],
+        )
+        assert len(hits) == 1
+        assert hits[0].metadata["doc_id"] == self.example_doc_id
+
+        # Search without filter - should return both
+        all_hits = self.vector.search_by_vector(query_vector=self.example_embedding, top_k=10)
+        assert len(all_hits) >= 2
+
+    def search_by_full_text_with_filter(self):
+        """Test full-text search with document_ids_filter (results are asserted in mock mode only)."""
+        # Create another document with different document_id
+        other_doc_id = str(uuid.uuid4())
+        other_doc = Document(
+            page_content="unique_other_text",
+            metadata={
+                "doc_id": other_doc_id,
+                "doc_hash": other_doc_id,
+                "document_id": other_doc_id,
+                "dataset_id": self.dataset_id,
+            },
+        )
+        self.vector.add_texts(documents=[other_doc], embeddings=[self.example_embedding])
+
+        # Search with filter - should only return the original document
+        hits = self.vector.search_by_full_text(
+            query=get_example_text(),
+            document_ids_filter=[self.example_doc_id],
+        )
+        if MOCK:
+            assert len(hits) == 1
+            assert hits[0].metadata["doc_id"] == self.example_doc_id
+
+    def get_ids_by_metadata_field(self):
+        """Override: Hologres implements this method via JSONB query."""
+        ids = self.vector.get_ids_by_metadata_field(key="document_id", value=self.example_doc_id)
+        assert ids is not None
+        assert len(ids) == 1
+
+    def run_all_tests(self):
+        """Run the base test sequence, then (mock mode only) the document-id filter tests."""
+        # Clean up before running tests
+        self.vector.delete()
+        # Run base tests (create, search, text_exists, get_ids, add_texts, delete_by_ids, delete)
+        super().run_all_tests()
+
+        # Additional filter tests require fresh data (table was deleted by base tests)
+        if MOCK:
+            # Recreate collection for filter tests
+            self.vector.create(
+                texts=[
+                    Document(
+                        page_content=get_example_text(),
+                        metadata={
+                            "doc_id": self.example_doc_id,
+                            "doc_hash": self.example_doc_id,
+                            "document_id": self.example_doc_id,
+                            "dataset_id": self.dataset_id,
+                        },
+                    )
+                ],
+                embeddings=[self.example_embedding],
+            )
+            try:
+                self.search_by_vector_with_filter()
+                self.search_by_full_text_with_filter()
+            finally:
+                # Clean up even when an assertion above fails, so no collection is left behind
+                self.vector.delete()
+
+
+def test_hologres_vector(setup_mock_redis, setup_hologres_mock):
+    """
+    Test Hologres vector database implementation.
+
+    The two fixtures are requested only for their setup/teardown side effects.
+
+    This test covers:
+    - Creating collection with vector index
+    - Adding texts with embeddings
+    - Vector similarity search
+    - Full-text search
+    - Text existence check
+    - Batch deletion by IDs
+    - Collection deletion
+    - Vector and full-text search with document_ids_filter (mock mode only)
+    """
+    HologresVectorTest().run_all_tests()