feat: fix i18n missing keys and merge upstream/main (#24615)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Signed-off-by: kenwoodjw <blackxin55+@gmail.com>
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: GuanMu <ballmanjq@gmail.com>
Co-authored-by: Davide Delbianco <davide.delbianco@outlook.com>
Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com>
Co-authored-by: kenwoodjw <blackxin55+@gmail.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: Yongtao Huang <99629139+hyongtao-db@users.noreply.github.com>
Co-authored-by: Qiang Lee <18018968632@163.com>
Co-authored-by: 李强04 <liqiang04@gaotu.cn>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: Matri Qi <matrixdom@126.com>
Co-authored-by: huayaoyue6 <huayaoyue@163.com>
Co-authored-by: Bowen Liang <liangbowen@gf.com.cn>
Co-authored-by: znn <jubinkumarsoni@gmail.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Muke Wang <shaodwaaron@gmail.com>
Co-authored-by: wangmuke <wangmuke@kingsware.cn>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Eric Guo <eric.guocz@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: jiangbo721 <jiangbo721@163.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: hjlarry <25834719+hjlarry@users.noreply.github.com>
Co-authored-by: lxsummer <35754229+lxjustdoit@users.noreply.github.com>
Co-authored-by: 湛露先生 <zhanluxianshen@163.com>
Co-authored-by: Guangdong Liu <liugddx@gmail.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Yessenia-d <yessenia.contact@gmail.com>
Co-authored-by: huangzhuo1949 <167434202+huangzhuo1949@users.noreply.github.com>
Co-authored-by: huangzhuo <huangzhuo1@xiaomi.com>
Co-authored-by: 17hz <0x149527@gmail.com>
Co-authored-by: Amy <1530140574@qq.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Nite Knite <nkCoding@gmail.com>
Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
Co-authored-by: Petrus Han <petrus.hanks@gmail.com>
Co-authored-by: iamjoel <2120155+iamjoel@users.noreply.github.com>
Co-authored-by: Kalo Chin <frog.beepers.0n@icloud.com>
Co-authored-by: Ujjwal Maurya <ujjwalsbx@gmail.com>
Co-authored-by: Maries <xh001x@hotmail.com>
lyzno1 authored 2025-08-27 15:07:28 +08:00, committed by GitHub
parent a63d1e87b1, commit 5bbf685035
625 changed files with 23778 additions and 10693 deletions

View File

@@ -101,7 +101,7 @@ class MilvusVector(BaseVector):
             if "Zilliz Cloud" in milvus_version:
                 return True
             # For standard Milvus installations, check version number
-            return version.parse(milvus_version).base_version >= version.parse("2.5.0").base_version
+            return version.parse(milvus_version) >= version.parse("2.5.0")
         except Exception as e:
             logger.warning("Failed to check Milvus version: %s. Disabling hybrid search.", str(e))
             return False
@@ -259,8 +259,16 @@ class MilvusVector(BaseVector):
         """
         Search for documents by full-text search (if hybrid search is enabled).
         """
-        if not self._hybrid_search_enabled or not self.field_exists(Field.SPARSE_VECTOR.value):
-            logger.warning("Full-text search is not supported in current Milvus version (requires >= 2.5.0)")
+        if not self._hybrid_search_enabled:
+            logger.warning(
+                "Full-text search is disabled: set MILVUS_ENABLE_HYBRID_SEARCH=true (requires Milvus >= 2.5.0)."
+            )
+            return []
+        if not self.field_exists(Field.SPARSE_VECTOR.value):
+            logger.warning(
+                "Full-text search unavailable: collection missing 'sparse_vector' field; "
+                "recreate the collection after enabling MILVUS_ENABLE_HYBRID_SEARCH to add BM25 sparse index."
+            )
             return []
         document_ids_filter = kwargs.get("document_ids_filter")
         filter = ""

View File

@@ -15,6 +15,8 @@ from core.rag.embedding.embedding_base import Embeddings
 from core.rag.models.document import Document
 from models.dataset import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class MyScaleConfig(BaseModel):
     host: str
@@ -53,7 +55,7 @@ class MyScaleVector(BaseVector):
         return self.add_texts(documents=texts, embeddings=embeddings, **kwargs)
 
     def _create_collection(self, dimension: int):
-        logging.info("create MyScale collection %s with dimension %s", self._collection_name, dimension)
+        logger.info("create MyScale collection %s with dimension %s", self._collection_name, dimension)
         self._client.command(f"CREATE DATABASE IF NOT EXISTS {self._config.database}")
         fts_params = f"('{self._config.fts_params}')" if self._config.fts_params else ""
         sql = f"""
@@ -151,7 +153,7 @@ class MyScaleVector(BaseVector):
                 for r in self._client.query(sql).named_results()
             ]
         except Exception as e:
-            logging.exception("\033[91m\033[1m%s\033[0m \033[95m%s\033[0m", type(e), str(e))  # noqa:TRY401
+            logger.exception("\033[91m\033[1m%s\033[0m \033[95m%s\033[0m", type(e), str(e))  # noqa:TRY401
         return []
 
     def delete(self) -> None:
View File

@@ -152,7 +152,7 @@ class OceanBaseVector(BaseVector):
                 ob_full_version = result.fetchone()[0]
                 ob_version = ob_full_version.split()[1]
                 logger.debug("Current OceanBase version is %s", ob_version)
-                return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version
+                return version.parse(ob_version) >= version.parse("4.3.5.1")
         except Exception as e:
             logger.warning("Failed to check OceanBase version: %s. Disabling hybrid search.", str(e))
             return False

View File

@@ -188,14 +188,17 @@ class OracleVector(BaseVector):
     def text_exists(self, id: str) -> bool:
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"SELECT id FROM {self.table_name} WHERE id = '%s'" % (id,))
+                cur.execute(f"SELECT id FROM {self.table_name} WHERE id = :1", (id,))
                 return cur.fetchone() is not None
             conn.close()
 
     def get_by_ids(self, ids: list[str]) -> list[Document]:
         if not ids:
             return []
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"SELECT meta, text FROM {self.table_name} WHERE id IN %s", (tuple(ids),))
+                placeholders = ", ".join(f":{i + 1}" for i in range(len(ids)))
+                cur.execute(f"SELECT meta, text FROM {self.table_name} WHERE id IN ({placeholders})", ids)
                 docs = []
                 for record in cur:
                     docs.append(Document(page_content=record[1], metadata=record[0]))
@@ -208,14 +211,15 @@ class OracleVector(BaseVector):
             return
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"DELETE FROM {self.table_name} WHERE id IN %s" % (tuple(ids),))
+                placeholders = ", ".join(f":{i + 1}" for i in range(len(ids)))
+                cur.execute(f"DELETE FROM {self.table_name} WHERE id IN ({placeholders})", ids)
             conn.commit()
         conn.close()
 
     def delete_by_metadata_field(self, key: str, value: str) -> None:
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"DELETE FROM {self.table_name} WHERE meta->>%s = %s", (key, value))
+                cur.execute(f"DELETE FROM {self.table_name} WHERE JSON_VALUE(meta, '$." + key + "') = :1", (value,))
                 conn.commit()
         conn.close()
@@ -227,12 +231,20 @@ class OracleVector(BaseVector):
         :param top_k: The number of nearest neighbors to return, default is 5.
         :return: List of Documents that are nearest to the query vector.
         """
+        # Validate and sanitize top_k to prevent SQL injection
         top_k = kwargs.get("top_k", 4)
+        if not isinstance(top_k, int) or top_k <= 0 or top_k > 10000:
+            top_k = 4  # Use default if invalid
+
         document_ids_filter = kwargs.get("document_ids_filter")
         where_clause = ""
+        params = [numpy.array(query_vector)]
         if document_ids_filter:
-            document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
-            where_clause = f"WHERE metadata->>'document_id' in ({document_ids})"
+            placeholders = ", ".join(f":{i + 2}" for i in range(len(document_ids_filter)))
+            where_clause = f"WHERE JSON_VALUE(meta, '$.document_id') IN ({placeholders})"
+            params.extend(document_ids_filter)
+
         with self._get_connection() as conn:
             conn.inputtypehandler = self.input_type_handler
             conn.outputtypehandler = self.output_type_handler
@@ -241,7 +253,7 @@ class OracleVector(BaseVector):
                     f"""SELECT meta, text, vector_distance(embedding,(select to_vector(:1) from dual),cosine)
                     AS distance FROM {self.table_name}
                    {where_clause} ORDER BY distance fetch first {top_k} rows only""",
-                    [numpy.array(query_vector)],
+                    params,
                 )
                 docs = []
                 score_threshold = float(kwargs.get("score_threshold") or 0.0)
@@ -259,9 +271,11 @@ class OracleVector(BaseVector):
         import nltk  # type: ignore
         from nltk.corpus import stopwords  # type: ignore
 
+        # Validate and sanitize top_k to prevent SQL injection
         top_k = kwargs.get("top_k", 5)
+        if not isinstance(top_k, int) or top_k <= 0 or top_k > 10000:
+            top_k = 5  # Use default if invalid
         # just not implement fetch by score_threshold now, may be later
         score_threshold = float(kwargs.get("score_threshold") or 0.0)
         if len(query) > 0:
             # Check which language the query is in
             zh_pattern = re.compile("[\u4e00-\u9fa5]+")
@@ -297,14 +311,21 @@ class OracleVector(BaseVector):
             with conn.cursor() as cur:
                 document_ids_filter = kwargs.get("document_ids_filter")
                 where_clause = ""
+                params: dict[str, Any] = {"kk": " ACCUM ".join(entities)}
                 if document_ids_filter:
-                    document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
-                    where_clause = f" AND metadata->>'document_id' in ({document_ids}) "
+                    placeholders = []
+                    for i, doc_id in enumerate(document_ids_filter):
+                        param_name = f"doc_id_{i}"
+                        placeholders.append(f":{param_name}")
+                        params[param_name] = doc_id
+                    where_clause = f" AND JSON_VALUE(meta, '$.document_id') IN ({', '.join(placeholders)}) "
                 cur.execute(
                     f"""select meta, text, embedding FROM {self.table_name}
                     WHERE CONTAINS(text, :kk, 1) > 0 {where_clause}
                     order by score(1) desc fetch first {top_k} rows only""",
-                    kk=" ACCUM ".join(entities),
+                    params,
                 )
                 docs = []
                 for record in cur:
View File

@@ -19,6 +19,8 @@ from core.rag.models.document import Document
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class PGVectorConfig(BaseModel):
     host: str
@@ -155,7 +157,7 @@ class PGVector(BaseVector):
                     cur.execute(f"DELETE FROM {self.table_name} WHERE id IN %s", (tuple(ids),))
                 except psycopg2.errors.UndefinedTable:
                     # table not exists
-                    logging.warning("Table %s not found, skipping delete operation.", self.table_name)
+                    logger.warning("Table %s not found, skipping delete operation.", self.table_name)
                     return
                 except Exception as e:
                     raise e
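For contrast with the Oracle fix above, the unchanged psycopg2 call in this hunk is already parameterized: psycopg2 adapts a Python tuple passed to a single `%s` into a parenthesized, properly quoted SQL value list, which is why `IN %s` with `(tuple(ids),)` is safe here. A minimal sketch, assuming an open psycopg2 connection and a placeholder table name:

    def delete_by_ids(conn, table_name: str, ids: list[str]) -> None:
        if not ids:
            return  # an empty tuple would render as invalid SQL: "()"
        with conn.cursor() as cur:
            # psycopg2 expands the tuple to ('a', 'b', ...) with quoting applied
            cur.execute(f"DELETE FROM {table_name} WHERE id IN %s", (tuple(ids),))
        conn.commit()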

View File

@@ -17,6 +17,8 @@ from core.rag.models.document import Document
 from extensions.ext_redis import redis_client
 from models import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class TableStoreConfig(BaseModel):
     access_key_id: Optional[str] = None
@@ -145,7 +147,7 @@ class TableStoreVector(BaseVector):
         with redis_client.lock(lock_name, timeout=20):
             collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
             if redis_client.get(collection_exist_cache_key):
-                logging.info("Collection %s already exists.", self._collection_name)
+                logger.info("Collection %s already exists.", self._collection_name)
                 return
             self._create_table_if_not_exist()
@@ -155,7 +157,7 @@ class TableStoreVector(BaseVector):
     def _create_table_if_not_exist(self) -> None:
         table_list = self._tablestore_client.list_table()
         if self._table_name in table_list:
-            logging.info("Tablestore system table[%s] already exists", self._table_name)
+            logger.info("Tablestore system table[%s] already exists", self._table_name)
             return None
 
         schema_of_primary_key = [("id", "STRING")]
@@ -163,12 +165,12 @@ class TableStoreVector(BaseVector):
         table_options = tablestore.TableOptions()
         reserved_throughput = tablestore.ReservedThroughput(tablestore.CapacityUnit(0, 0))
         self._tablestore_client.create_table(table_meta, table_options, reserved_throughput)
-        logging.info("Tablestore create table[%s] successfully.", self._table_name)
+        logger.info("Tablestore create table[%s] successfully.", self._table_name)
 
     def _create_search_index_if_not_exist(self, dimension: int) -> None:
         search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name)
         if self._index_name in [t[1] for t in search_index_list]:
-            logging.info("Tablestore system index[%s] already exists", self._index_name)
+            logger.info("Tablestore system index[%s] already exists", self._index_name)
             return None
 
         field_schemas = [
@@ -206,20 +208,20 @@ class TableStoreVector(BaseVector):
         index_meta = tablestore.SearchIndexMeta(field_schemas)
         self._tablestore_client.create_search_index(self._table_name, self._index_name, index_meta)
-        logging.info("Tablestore create system index[%s] successfully.", self._index_name)
+        logger.info("Tablestore create system index[%s] successfully.", self._index_name)
 
     def _delete_table_if_exist(self):
         search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name)
         for resp_tuple in search_index_list:
             self._tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1])
-            logging.info("Tablestore delete index[%s] successfully.", self._index_name)
+            logger.info("Tablestore delete index[%s] successfully.", self._index_name)
         self._tablestore_client.delete_table(self._table_name)
-        logging.info("Tablestore delete system table[%s] successfully.", self._index_name)
+        logger.info("Tablestore delete system table[%s] successfully.", self._index_name)
 
     def _delete_search_index(self) -> None:
         self._tablestore_client.delete_search_index(self._table_name, self._index_name)
-        logging.info("Tablestore delete index[%s] successfully.", self._index_name)
+        logger.info("Tablestore delete index[%s] successfully.", self._index_name)
 
     def _write_row(self, primary_key: str, attributes: dict[str, Any]) -> None:
         pk = [("id", primary_key)]
View File

@@ -83,14 +83,14 @@ class TiDBVector(BaseVector):
         self._dimension = 1536
 
     def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
-        logger.info("create collection and add texts, collection_name: " + self._collection_name)
+        logger.info("create collection and add texts, collection_name: %s", self._collection_name)
         self._create_collection(len(embeddings[0]))
         self.add_texts(texts, embeddings)
         self._dimension = len(embeddings[0])
         pass
 
     def _create_collection(self, dimension: int):
-        logger.info("_create_collection, collection_name " + self._collection_name)
+        logger.info("_create_collection, collection_name %s", self._collection_name)
         lock_name = f"vector_indexing_lock_{self._collection_name}"
         with redis_client.lock(lock_name, timeout=20):
             collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
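The TiDB change above swaps string concatenation for `%s`-style arguments. Concatenation builds the message eagerly, even when the record is filtered out, and raises `TypeError` if the value is not a string; passing the value as an argument defers formatting until a handler actually emits the record. A minimal sketch:

    import logging

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.WARNING)

    name = "my_collection"
    # Concatenates immediately, although INFO is filtered out here:
    logger.info("create collection and add texts, collection_name: " + name)
    # Defers %-formatting; nothing is built unless the record is emitted:
    logger.info("create collection and add texts, collection_name: %s", name)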