mirror of
https://github.com/langgenius/dify.git
synced 2026-05-05 09:58:04 +08:00
feat: enterprise OTEL telemetry exporter (squash merge from feat/otel-telemetry-ee)
This commit is contained in:
209
api/tests/integration_tests/vdb/__mock/hologres.py
Normal file
209
api/tests/integration_tests/vdb/__mock/hologres.py
Normal file
@ -0,0 +1,209 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import holo_search_sdk as holo
|
||||
import pytest
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
from psycopg import sql as psql
|
||||
|
||||
# Shared in-memory storage: {table_name: {doc_id: {"id", "text", "meta", "embedding"}}}
|
||||
_mock_tables: dict[str, dict[str, dict[str, Any]]] = {}
|
||||
|
||||
|
||||
class MockSearchQuery:
    """Mock query builder for search_vector and search_text results.

    Accumulates ``select``/``limit``/``where`` calls fluently and, on
    ``fetchall``, returns rows from the shared in-memory ``_mock_tables``
    store in the tuple layout the real SDK produces for the given search
    type ("vector" or "text").
    """

    def __init__(self, table_name: str, search_type: str):
        self._table_name = table_name
        self._search_type = search_type  # "vector" or "text"
        self._limit_val = 10
        self._filter_sql = None

    def select(self, columns):
        # Column projection is ignored; fetchall always emits the full row tuple.
        return self

    def limit(self, n):
        self._limit_val = n
        return self

    def where(self, filter_sql):
        self._filter_sql = filter_sql
        return self

    def _apply_filter(self, row: dict[str, Any]) -> bool:
        """Apply the filter SQL to check if a row matches.

        Only the shape the vector store issues is supported:
        ``meta->>'document_id' IN ('doc1', 'doc2')`` — the literals embedded
        in the filter are treated as the allowed document ids.
        """
        if self._filter_sql is None:
            return True

        # Extract literals (the document IDs) from the filter SQL
        # Filter format: meta->>'document_id' IN ('doc1', 'doc2')
        literals = [v for t, v in _extract_identifiers_and_literals(self._filter_sql) if t == "literal"]
        if not literals:
            return True

        # Get the document_id from the row's meta field (JSON text or dict).
        meta = row.get("meta", "{}")
        if isinstance(meta, str):
            meta = json.loads(meta)
        doc_id = meta.get("document_id")

        return doc_id in literals

    def fetchall(self):
        """Return matching rows, applying the filter BEFORE the limit.

        Fix: the limit used to be applied before filtering, so rows excluded
        by the filter still consumed limit slots and a filtered search could
        return fewer than ``limit`` matches even when more existed. Filtering
        first matches real SQL LIMIT semantics.
        """
        data = _mock_tables.get(self._table_name, {})
        matching = [row for row in data.values() if self._apply_filter(row)]

        results = []
        for row in matching[: self._limit_val]:
            if self._search_type == "vector":
                # row format expected by _process_vector_results: (distance, id, text, meta)
                results.append((0.1, row["id"], row["text"], row["meta"]))
            else:
                # row format expected by _process_full_text_results: (id, text, meta, embedding, score)
                results.append((row["id"], row["text"], row["meta"], row.get("embedding", []), 0.9))
        return results
|
||||
|
||||
|
||||
class MockTable:
    """Mock table object returned by client.open_table().

    Persists rows into the shared ``_mock_tables`` dict keyed by the
    value of each row's ``id`` column.
    """

    def __init__(self, table_name: str):
        self._table_name = table_name

    def upsert_multi(self, index_column, values, column_names, update=True, update_columns=None):
        """Insert or overwrite rows; each row becomes a column_name -> value dict."""
        table = _mock_tables.setdefault(self._table_name, {})
        id_position = column_names.index("id")
        for record in values:
            table[record[id_position]] = dict(zip(column_names, record))

    def search_vector(self, vector, column, distance_method, output_name):
        """Start a vector-similarity query builder."""
        return MockSearchQuery(self._table_name, "vector")

    def search_text(self, column, expression, return_score=False, return_score_name="score", return_all_columns=False):
        """Start a full-text query builder."""
        return MockSearchQuery(self._table_name, "text")

    def set_vector_index(
        self, column, distance_method, base_quantization_type, max_degree, ef_construction, use_reorder
    ):
        # Index creation is a no-op for the in-memory store.
        pass

    def create_text_index(self, index_name, column, tokenizer):
        # Full-text indexing is a no-op for the in-memory store.
        pass
|
||||
|
||||
|
||||
def _extract_sql_template(query) -> str:
    """Extract the SQL template string from a psycopg query object.

    Returns the template of a bare ``SQL`` object, or the first ``SQL``
    fragment of a ``Composed`` query; anything else yields "".
    """
    if isinstance(query, psql.SQL):
        return query._obj
    if isinstance(query, psql.Composed):
        # First SQL fragment carries the statement template.
        return next((piece._obj for piece in query if isinstance(piece, psql.SQL)), "")
    return ""
|
||||
|
||||
|
||||
def _extract_identifiers_and_literals(query) -> list[Any]:
    """Extract Identifier and Literal values from a psycopg query object.

    Walks the query recursively, so arbitrarily nested ``Composed`` parts
    (e.g. ``SQL(", ").join(...)`` inside an IN clause) are handled instead
    of only one nesting level. Returns ``("ident", name)`` /
    ``("literal", value)`` tuples in document order; for the flat and
    one-level-nested queries the mock client issues, the output is
    identical to the previous non-recursive implementation.
    """
    values: list[Any] = []
    if isinstance(query, psql.Identifier):
        # Identifier._obj is a tuple of name parts; the mock only uses the first.
        values.append(("ident", query._obj[0] if query._obj else ""))
    elif isinstance(query, psql.Literal):
        values.append(("literal", query._obj))
    elif isinstance(query, psql.Composed):
        for part in query:
            values.extend(_extract_identifiers_and_literals(part))
    return values
|
||||
|
||||
|
||||
class MockHologresClient:
    """Mock holo_search_sdk client that stores data in memory."""

    @staticmethod
    def _row_meta(row: dict[str, Any]) -> dict[str, Any]:
        """Return a row's metadata as a dict.

        ``upsert_multi`` stores whatever the caller passed, so ``meta`` may
        be a JSON string or an already-parsed dict. The previous code called
        ``json.loads`` unconditionally and raised TypeError on dict meta;
        this normalizes both, matching MockSearchQuery._apply_filter.
        """
        meta = row.get("meta", "{}")
        if isinstance(meta, str):
            meta = json.loads(meta)
        return meta

    def connect(self):
        # No real connection is needed for the in-memory store.
        pass

    def check_table_exist(self, table_name):
        return table_name in _mock_tables

    def open_table(self, table_name):
        return MockTable(table_name)

    def execute(self, query, fetch_result=False):
        """Dispatch a (mock) SQL statement against the in-memory tables.

        Recognizes the statement shapes HologresVector issues: CREATE TABLE,
        existence probe (SELECT 1), id lookup by metadata field (SELECT id),
        and DELETE by ids or by metadata field. Unrecognized statements
        return [] when fetch_result is set, else None.
        """
        template = _extract_sql_template(query)
        params = _extract_identifiers_and_literals(query)

        if "CREATE TABLE" in template.upper():
            # Extract table name from first identifier
            table_name = next((v for t, v in params if t == "ident"), "unknown")
            if table_name not in _mock_tables:
                _mock_tables[table_name] = {}
            return None

        if "SELECT 1" in template:
            # text_exists: SELECT 1 FROM {table} WHERE id = {id} LIMIT 1
            table_name = next((v for t, v in params if t == "ident"), "")
            doc_id = next((v for t, v in params if t == "literal"), "")
            data = _mock_tables.get(table_name, {})
            return [(1,)] if doc_id in data else []

        if "SELECT id" in template:
            # get_ids_by_metadata_field: SELECT id FROM {table} WHERE meta->>{key} = {value}
            table_name = next((v for t, v in params if t == "ident"), "")
            literals = [v for t, v in params if t == "literal"]
            key = literals[0] if len(literals) > 0 else ""
            value = literals[1] if len(literals) > 1 else ""
            data = _mock_tables.get(table_name, {})
            return [(doc_id,) for doc_id, row in data.items() if self._row_meta(row).get(key) == value]

        if "DELETE" in template.upper():
            table_name = next((v for t, v in params if t == "ident"), "")
            if "id IN" in template:
                # delete_by_ids
                ids_to_delete = [v for t, v in params if t == "literal"]
                for did in ids_to_delete:
                    _mock_tables.get(table_name, {}).pop(did, None)
            elif "meta->>" in template:
                # delete_by_metadata_field
                literals = [v for t, v in params if t == "literal"]
                key = literals[0] if len(literals) > 0 else ""
                value = literals[1] if len(literals) > 1 else ""
                data = _mock_tables.get(table_name, {})
                to_remove = [
                    doc_id for doc_id, row in data.items() if self._row_meta(row).get(key) == value
                ]
                for did in to_remove:
                    data.pop(did, None)
            return None

        return [] if fetch_result else None

    def drop_table(self, table_name):
        _mock_tables.pop(table_name, None)
|
||||
|
||||
|
||||
def mock_connect(**kwargs):
    """Stand-in for holo_search_sdk.connect(); ignores all connection kwargs."""
    client = MockHologresClient()
    return client
|
||||
|
||||
|
||||
# Toggle: when MOCK_SWITCH=true the in-memory mock replaces the real Hologres SDK.
MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
|
||||
|
||||
|
||||
@pytest.fixture
def setup_hologres_mock(monkeypatch: MonkeyPatch):
    """Patch holo_search_sdk.connect with the in-memory mock when MOCK_SWITCH=true.

    Yields to the test, then clears the shared table store and undoes the
    patch. Teardown runs inside ``finally`` — in a generator fixture, code
    after a bare ``yield`` is skipped when the test raises, which previously
    leaked ``_mock_tables`` state into subsequent tests.
    """
    if MOCK:
        monkeypatch.setattr(holo, "connect", mock_connect)

    try:
        yield
    finally:
        if MOCK:
            _mock_tables.clear()
            monkeypatch.undo()
|
||||
149
api/tests/integration_tests/vdb/hologres/test_hologres.py
Normal file
149
api/tests/integration_tests/vdb/hologres/test_hologres.py
Normal file
@ -0,0 +1,149 @@
|
||||
import os
|
||||
import uuid
|
||||
from typing import cast
|
||||
|
||||
from holo_search_sdk.types import BaseQuantizationType, DistanceType, TokenizerType
|
||||
|
||||
from core.rag.datasource.vdb.hologres.hologres_vector import HologresVector, HologresVectorConfig
|
||||
from core.rag.models.document import Document
|
||||
from tests.integration_tests.vdb.__mock.hologres import setup_hologres_mock
|
||||
from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
|
||||
|
||||
# When MOCK_SWITCH=true, assertions assume the deterministic in-memory mock backend.
MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
|
||||
|
||||
|
||||
class HologresVectorTest(AbstractVectorTest):
    """Exercises HologresVector against either the in-memory mock or a live instance."""

    def __init__(self):
        super().__init__()
        # Hologres requires collection names to be lowercase
        self.collection_name = self.collection_name.lower()
        config = HologresVectorConfig(
            host=os.environ.get("HOLOGRES_HOST", "localhost"),
            port=int(os.environ.get("HOLOGRES_PORT", "80")),
            database=os.environ.get("HOLOGRES_DATABASE", "test_db"),
            access_key_id=os.environ.get("HOLOGRES_ACCESS_KEY_ID", "test_key"),
            access_key_secret=os.environ.get("HOLOGRES_ACCESS_KEY_SECRET", "test_secret"),
            schema_name=os.environ.get("HOLOGRES_SCHEMA", "public"),
            tokenizer=cast(TokenizerType, os.environ.get("HOLOGRES_TOKENIZER", "jieba")),
            distance_method=cast(DistanceType, os.environ.get("HOLOGRES_DISTANCE_METHOD", "Cosine")),
            base_quantization_type=cast(
                BaseQuantizationType, os.environ.get("HOLOGRES_BASE_QUANTIZATION_TYPE", "rabitq")
            ),
            max_degree=int(os.environ.get("HOLOGRES_MAX_DEGREE", "64")),
            ef_construction=int(os.environ.get("HOLOGRES_EF_CONSTRUCTION", "400")),
        )
        self.vector = HologresVector(collection_name=self.collection_name, config=config)

    def _add_second_document(self, page_content: str) -> None:
        """Insert one extra document carrying its own random document_id."""
        extra_id = str(uuid.uuid4())
        extra_doc = Document(
            page_content=page_content,
            metadata={
                "doc_id": extra_id,
                "doc_hash": extra_id,
                "document_id": extra_id,
                "dataset_id": self.dataset_id,
            },
        )
        self.vector.add_texts(documents=[extra_doc], embeddings=[self.example_embedding])

    def search_by_full_text(self):
        """Override: full-text index may not be immediately ready in real mode."""
        hits = self.vector.search_by_full_text(query=get_example_text())
        if not MOCK:
            # In real mode, full-text index may need time to become active
            assert len(hits) >= 0
        else:
            # In mock mode, full-text search should return the document we inserted
            assert len(hits) == 1
            assert hits[0].metadata["doc_id"] == self.example_doc_id

    def search_by_vector_with_filter(self):
        """Test vector search with document_ids_filter."""
        # A second document with a different document_id should be filtered out.
        self._add_second_document("other_text")

        # Filtered search surfaces only the original document.
        filtered = self.vector.search_by_vector(
            query_vector=self.example_embedding,
            document_ids_filter=[self.example_doc_id],
        )
        assert len(filtered) == 1
        assert filtered[0].metadata["doc_id"] == self.example_doc_id

        # An unfiltered search sees both documents.
        everything = self.vector.search_by_vector(query_vector=self.example_embedding, top_k=10)
        assert len(everything) >= 2

    def search_by_full_text_with_filter(self):
        """Test full-text search with document_ids_filter."""
        # A second document with a different document_id should be filtered out.
        self._add_second_document("unique_other_text")

        filtered = self.vector.search_by_full_text(
            query=get_example_text(),
            document_ids_filter=[self.example_doc_id],
        )
        if MOCK:
            assert len(filtered) == 1
            assert filtered[0].metadata["doc_id"] == self.example_doc_id

    def get_ids_by_metadata_field(self):
        """Override: Hologres implements this method via JSONB query."""
        found = self.vector.get_ids_by_metadata_field(key="document_id", value=self.example_doc_id)
        assert found is not None
        assert len(found) == 1

    def run_all_tests(self):
        # Start from a clean slate
        self.vector.delete()
        # Base suite: create, search, text_exists, get_ids, add_texts, delete_by_ids, delete
        super().run_all_tests()

        # Filter tests need fresh data (the base suite dropped the table);
        # they only run in mock mode.
        if not MOCK:
            return
        seed_doc = Document(
            page_content=get_example_text(),
            metadata={
                "doc_id": self.example_doc_id,
                "doc_hash": self.example_doc_id,
                "document_id": self.example_doc_id,
                "dataset_id": self.dataset_id,
            },
        )
        self.vector.create(texts=[seed_doc], embeddings=[self.example_embedding])
        self.search_by_vector_with_filter()
        self.search_by_full_text_with_filter()
        # Clean up
        self.vector.delete()
|
||||
|
||||
|
||||
def test_hologres_vector(setup_mock_redis, setup_hologres_mock):
    """Run the Hologres vector-store suite end to end.

    Covers collection creation with a vector index, inserting embedded
    texts, vector-similarity and full-text search, text-existence checks,
    batch deletion by id, and collection deletion.
    """
    HologresVectorTest().run_all_tests()
|
||||
Reference in New Issue
Block a user