merge main
@@ -83,11 +83,15 @@ class DataPostProcessor:
         if reranking_model:
             try:
                 model_manager = ModelManager()
+                reranking_provider_name = reranking_model.get("reranking_provider_name")
+                reranking_model_name = reranking_model.get("reranking_model_name")
+                if not reranking_provider_name or not reranking_model_name:
+                    return None
                 rerank_model_instance = model_manager.get_model_instance(
                     tenant_id=tenant_id,
-                    provider=reranking_model["reranking_provider_name"],
+                    provider=reranking_provider_name,
                     model_type=ModelType.RERANK,
-                    model=reranking_model["reranking_model_name"],
+                    model=reranking_model_name,
                 )
                 return rerank_model_instance
             except InvokeAuthorizationError:
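The move from reranking_model[...] subscripts to .get() with an early return None turns a half-configured rerank model into "no reranking" instead of a KeyError. A minimal sketch of the pattern (resolve_rerank_settings and the sample values are hypothetical, not dify API):

from typing import Optional


def resolve_rerank_settings(config: Optional[dict]) -> Optional[tuple[str, str]]:
    if not config:
        return None
    provider = config.get("reranking_provider_name")
    model = config.get("reranking_model_name")
    if not provider or not model:
        # Missing or empty fields mean "no reranking", not a crash.
        return None
    return provider, model


# Incomplete config degrades gracefully; complete config resolves.
assert resolve_rerank_settings({"reranking_provider_name": "cohere"}) is None
assert resolve_rerank_settings(
    {"reranking_provider_name": "cohere", "reranking_model_name": "rerank-v3"}
) == ("cohere", "rerank-v3")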
@@ -1,18 +1,19 @@
 import re
 from typing import Optional
 
-import jieba
-from jieba.analyse import default_tfidf
-
-from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS
-
 
 class JiebaKeywordTableHandler:
     def __init__(self):
-        default_tfidf.stop_words = STOPWORDS
+        import jieba.analyse
+
+        from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS
+
+        jieba.analyse.default_tfidf.stop_words = STOPWORDS
 
     def extract_keywords(self, text: str, max_keywords_per_chunk: Optional[int] = 10) -> set[str]:
         """Extract keywords with JIEBA tfidf."""
+        import jieba
+
         keywords = jieba.analyse.extract_tags(
             sentence=text,
             topK=max_keywords_per_chunk,
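Pushing the jieba imports into method bodies defers jieba's dictionary load, which is slow and memory-heavy, from process startup to first use. Note that extract_keywords only imports jieba, yet jieba.analyse still resolves: __init__ runs first, imports jieba.analyse, and Python caches it in sys.modules. A generic sketch of the lazy-import pattern:

class KeywordHandler:
    def __init__(self):
        # No heavy import at module load or construction time.
        pass

    def extract(self, text: str) -> list[str]:
        # Deferred import: the cost is paid on the first call only, since
        # Python caches the module in sys.modules afterwards. For jieba,
        # this is where the dictionary load would happen.
        import re

        return re.findall(r"\w+", text)


handler = KeywordHandler()  # cheap, even if extract() is never called
print(handler.extract("lazy imports keep worker startup fast"))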
@@ -22,6 +23,8 @@ class JiebaKeywordTableHandler:
 
     def _expand_tokens_with_subtokens(self, tokens: set[str]) -> set[str]:
         """Get subtokens from a list of tokens., filtering for stopwords."""
+        from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS
+
         results = set()
         for token in tokens:
             results.add(token)
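For context, _expand_tokens_with_subtokens splits each multi-word token into sub-tokens and keeps the ones that are not stopwords. A self-contained approximation of that behavior, with a tiny stand-in stopword set in place of the real import:

import re

STOPWORDS = {"of", "the"}  # stand-in for the real, much larger set


def expand_tokens_with_subtokens(tokens: set[str]) -> set[str]:
    results = set()
    for token in tokens:
        results.add(token)
        sub_tokens = re.findall(r"\w+", token)
        if len(sub_tokens) > 1:
            results.update(w for w in sub_tokens if w not in STOPWORDS)
    return results


print(expand_tokens_with_subtokens({"bank of china"}))
# {'bank of china', 'bank', 'china'} -- 'of' filtered; set order may vary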
@@ -103,7 +103,7 @@ class RetrievalService:
 
         if exceptions:
             exception_message = ";\n".join(exceptions)
-            raise Exception(exception_message)
+            raise ValueError(exception_message)
 
         if retrieval_method == RetrievalMethod.HYBRID_SEARCH.value:
             data_post_processor = DataPostProcessor(
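Raising ValueError instead of a bare Exception keeps the joined error message but gives callers of the threaded retrieval path a concrete type to catch. A minimal sketch of the collect-and-reraise pattern (run_all and its tasks are hypothetical stand-ins for the retrieval threads):

def run_all(tasks: list) -> None:
    exceptions: list[str] = []
    for task in tasks:
        try:
            task()
        except Exception as e:  # collect, don't abort the remaining tasks
            exceptions.append(str(e))
    if exceptions:
        raise ValueError(";\n".join(exceptions))


try:
    run_all([lambda: None, lambda: 1 / 0])
except ValueError as e:  # catchable without a blanket `except Exception`
    print(f"retrieval failed: {e}")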
@@ -50,10 +50,10 @@ class LindormVectorStoreConfig(BaseModel):
 
 
 class LindormVectorStore(BaseVector):
-    def __init__(self, collection_name: str, config: LindormVectorStoreConfig, **kwargs):
+    def __init__(self, collection_name: str, config: LindormVectorStoreConfig, using_ugc: bool, **kwargs):
         self._routing = None
         self._routing_field = None
-        if config.using_ugc:
+        if using_ugc:
             routing_value: str = kwargs.get("routing_value")
             if routing_value is None:
                 raise ValueError("UGC index should init vector with valid 'routing_value' parameter value")
@@ -65,7 +65,7 @@ class LindormVectorStore(BaseVector):
         super().__init__(collection_name.lower())
         self._client_config = config
         self._client = OpenSearch(**config.to_opensearch_params())
-        self._using_ugc = config.using_ugc
+        self._using_ugc = using_ugc
         self.kwargs = kwargs
 
     def get_type(self) -> str:
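Passing using_ugc explicitly decouples the store from the config flag, so the factory can construct a legacy (non-UGC) store even when UGC indexing is globally enabled, and the routing_value precondition is enforced at construction time. A hypothetical mini-version of the constructor contract (no OpenSearch client involved):

class MiniLindormStore:
    def __init__(self, collection_name: str, using_ugc: bool, **kwargs):
        self._routing = None
        if using_ugc:
            routing_value = kwargs.get("routing_value")
            if routing_value is None:
                raise ValueError("UGC index should init vector with valid 'routing_value' parameter value")
            self._routing = routing_value
        self._using_ugc = using_ugc


MiniLindormStore("idx", using_ugc=False)  # normal index: no routing needed
MiniLindormStore("ugc_idx", using_ugc=True, routing_value="tenant_a")  # ok
try:
    MiniLindormStore("ugc_idx", using_ugc=True)  # missing routing_value
except ValueError as e:
    print(e)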
@@ -484,12 +484,16 @@ class LindormVectorStoreFactory(AbstractVectorFactory):
         using_ugc = dify_config.USING_UGC_INDEX
         routing_value = None
         if dataset.index_struct:
-            if using_ugc:
+            # if an existing record's index_struct_dict doesn't contain the using_ugc field,
+            # it is actually stored in the normal index format
+            stored_in_ugc = dataset.index_struct_dict.get("using_ugc", False)
+            using_ugc = stored_in_ugc
+            if stored_in_ugc:
                 dimension = dataset.index_struct_dict["dimension"]
                 index_type = dataset.index_struct_dict["index_type"]
                 distance_type = dataset.index_struct_dict["distance_type"]
-                index_name = f"{UGC_INDEX_PREFIX}_{dimension}_{index_type}_{distance_type}"
                 routing_value = dataset.index_struct_dict["vector_store"]["class_prefix"]
+                index_name = f"{UGC_INDEX_PREFIX}_{dimension}_{index_type}_{distance_type}"
             else:
                 index_name = dataset.index_struct_dict["vector_store"]["class_prefix"]
         else:
@@ -504,6 +508,7 @@ class LindormVectorStoreFactory(AbstractVectorFactory):
                 "index_type": index_type,
                 "dimension": dimension,
                 "distance_type": distance_type,
+                "using_ugc": using_ugc,
             }
             dataset.index_struct = json.dumps(index_struct_dict)
             if using_ugc:
@@ -511,4 +516,4 @@ class LindormVectorStoreFactory(AbstractVectorFactory):
                 routing_value = class_prefix
             else:
                 index_name = class_prefix
-        return LindormVectorStore(index_name, lindorm_config, routing_value=routing_value)
+        return LindormVectorStore(index_name, lindorm_config, routing_value=routing_value, using_ugc=using_ugc)
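Taken together, the factory hunks make index resolution depend on what the dataset record actually says (new "using_ugc" field in index_struct, defaulting to False for pre-existing records) rather than on the current global flag alone. A hedged sketch of the resulting decision rule (resolve_index and the constant value are stand-ins; the dict shapes mirror index_struct_dict):

UGC_INDEX_PREFIX = "ugc_index"  # stand-in for the real constant


def resolve_index(index_struct_dict: dict | None, config_using_ugc: bool):
    using_ugc = config_using_ugc
    routing_value = None
    if index_struct_dict:
        # Records written before UGC support lack the flag -> normal index.
        using_ugc = index_struct_dict.get("using_ugc", False)
        if using_ugc:
            d = index_struct_dict
            index_name = f"{UGC_INDEX_PREFIX}_{d['dimension']}_{d['index_type']}_{d['distance_type']}"
            routing_value = d["vector_store"]["class_prefix"]
        else:
            index_name = index_struct_dict["vector_store"]["class_prefix"]
    else:
        index_name = "..."  # new dataset: name is derived elsewhere
    return index_name, routing_value, using_ugc


# A legacy record without the flag stays on its original index, even with
# USING_UGC_INDEX enabled globally:
legacy = {"vector_store": {"class_prefix": "vector_index_abc"}}
print(resolve_index(legacy, config_using_ugc=True))
# ('vector_index_abc', None, False)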
@@ -6,10 +6,8 @@ from contextlib import contextmanager
 from typing import Any
 
 import jieba.posseg as pseg
-import nltk
 import numpy
 import oracledb
-from nltk.corpus import stopwords
 from pydantic import BaseModel, model_validator
 
 from configs import dify_config
@@ -202,6 +200,10 @@ class OracleVector(BaseVector):
         return docs
 
     def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
+        # lazy import
+        import nltk
+        from nltk.corpus import stopwords
+
         top_k = kwargs.get("top_k", 5)
         # just not implement fetch by score_threshold now, may be later
         score_threshold = float(kwargs.get("score_threshold") or 0.0)
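Same motive as the jieba change: nltk and its stopwords corpus are only used by full-text search, so loading them at import time taxes every process that merely imports this module. If the corpus might be missing at runtime, a guarded one-time download is a common companion pattern; a sketch under that assumption (the actual dify code may provision the corpus differently):

def get_english_stopwords() -> set[str]:
    # Deferred import: nltk is only loaded when full-text search runs.
    import nltk
    from nltk.corpus import stopwords

    try:
        return set(stopwords.words("english"))
    except LookupError:
        # Corpus not installed yet: fetch once, then retry.
        nltk.download("stopwords", quiet=True)
        return set(stopwords.words("english"))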
@@ -65,6 +65,11 @@ class CacheEmbedding(Embeddings):
                 for vector in embedding_result.embeddings:
                     try:
                         normalized_embedding = (vector / np.linalg.norm(vector)).tolist()
+                        # stackoverflow best way: https://stackoverflow.com/questions/20319813/how-to-check-list-containing-nan
+                        if np.isnan(normalized_embedding).any():
+                            # for issue #11827 float values are not json compliant
+                            logger.warning(f"Normalized embedding is nan: {normalized_embedding}")
+                            continue
                         embedding_queue_embeddings.append(normalized_embedding)
                     except IntegrityError:
                         db.session.rollback()
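The guard exists because normalizing a zero vector divides by a zero norm and yields NaN, and NaN is not valid JSON per RFC 8259 (the failure behind issue #11827). A quick demonstration:

import numpy as np

vector = np.zeros(4)                   # degenerate, all-zero embedding
norm = np.linalg.norm(vector)          # 0.0
normalized = (vector / norm).tolist()  # 0/0 -> nan (numpy emits a RuntimeWarning)
print(np.isnan(normalized).any())      # True -> this vector gets skipped

# json.dumps(float("nan")) emits the bare token NaN, which strict JSON
# parsers and many clients reject; hence the check before caching.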
@@ -111,6 +116,8 @@ class CacheEmbedding(Embeddings):
 
                 embedding_results = embedding_result.embeddings[0]
                 embedding_results = (embedding_results / np.linalg.norm(embedding_results)).tolist()
+                if np.isnan(embedding_results).any():
+                    raise ValueError("Normalized embedding is nan, please try again")
             except Exception as ex:
                 if dify_config.DEBUG:
                     logging.exception(f"Failed to embed query text '{text[:10]}...({len(text)} chars)'")
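Note the asymmetric failure policy across the two hunks: a NaN document embedding is skipped with a warning, while a NaN query embedding raises, since a query has no other vectors to fall back on. A small sketch contrasting the two paths (normalize and strict are illustrative names, not dify API):

import numpy as np


def normalize(vector: np.ndarray, strict: bool) -> list[float] | None:
    normalized = (vector / np.linalg.norm(vector)).tolist()
    if np.isnan(normalized).any():
        if strict:
            # query path: no usable fallback, fail loudly
            raise ValueError("Normalized embedding is nan, please try again")
        return None  # document path: caller skips this vector
    return normalized


print(normalize(np.array([3.0, 4.0]), strict=True))  # [0.6, 0.8]
print(normalize(np.zeros(2), strict=False))          # None (skipped, with a warning)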