feat: fix i18n missing keys and merge upstream/main (#24615)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Signed-off-by: kenwoodjw <blackxin55+@gmail.com>
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: GuanMu <ballmanjq@gmail.com>
Co-authored-by: Davide Delbianco <davide.delbianco@outlook.com>
Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com>
Co-authored-by: kenwoodjw <blackxin55+@gmail.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: Yongtao Huang <99629139+hyongtao-db@users.noreply.github.com>
Co-authored-by: Qiang Lee <18018968632@163.com>
Co-authored-by: 李强04 <liqiang04@gaotu.cn>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: Matri Qi <matrixdom@126.com>
Co-authored-by: huayaoyue6 <huayaoyue@163.com>
Co-authored-by: Bowen Liang <liangbowen@gf.com.cn>
Co-authored-by: znn <jubinkumarsoni@gmail.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Muke Wang <shaodwaaron@gmail.com>
Co-authored-by: wangmuke <wangmuke@kingsware.cn>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Eric Guo <eric.guocz@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: jiangbo721 <jiangbo721@163.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: hjlarry <25834719+hjlarry@users.noreply.github.com>
Co-authored-by: lxsummer <35754229+lxjustdoit@users.noreply.github.com>
Co-authored-by: 湛露先生 <zhanluxianshen@163.com>
Co-authored-by: Guangdong Liu <liugddx@gmail.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Yessenia-d <yessenia.contact@gmail.com>
Co-authored-by: huangzhuo1949 <167434202+huangzhuo1949@users.noreply.github.com>
Co-authored-by: huangzhuo <huangzhuo1@xiaomi.com>
Co-authored-by: 17hz <0x149527@gmail.com>
Co-authored-by: Amy <1530140574@qq.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Nite Knite <nkCoding@gmail.com>
Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
Co-authored-by: Petrus Han <petrus.hanks@gmail.com>
Co-authored-by: iamjoel <2120155+iamjoel@users.noreply.github.com>
Co-authored-by: Kalo Chin <frog.beepers.0n@icloud.com>
Co-authored-by: Ujjwal Maurya <ujjwalsbx@gmail.com>
Co-authored-by: Maries <xh001x@hotmail.com>
lyzno1 authored 2025-08-27 15:07:28 +08:00, committed by GitHub
parent a63d1e87b1, commit 5bbf685035
625 changed files with 23778 additions and 10693 deletions

View File

@@ -101,7 +101,7 @@ class MilvusVector(BaseVector):
             if "Zilliz Cloud" in milvus_version:
                 return True
             # For standard Milvus installations, check version number
-            return version.parse(milvus_version).base_version >= version.parse("2.5.0").base_version
+            return version.parse(milvus_version) >= version.parse("2.5.0")
         except Exception as e:
             logger.warning("Failed to check Milvus version: %s. Disabling hybrid search.", str(e))
             return False
@@ -259,8 +259,16 @@ class MilvusVector(BaseVector):
         """
         Search for documents by full-text search (if hybrid search is enabled).
         """
-        if not self._hybrid_search_enabled or not self.field_exists(Field.SPARSE_VECTOR.value):
-            logger.warning("Full-text search is not supported in current Milvus version (requires >= 2.5.0)")
+        if not self._hybrid_search_enabled:
+            logger.warning(
+                "Full-text search is disabled: set MILVUS_ENABLE_HYBRID_SEARCH=true (requires Milvus >= 2.5.0)."
+            )
+            return []
+        if not self.field_exists(Field.SPARSE_VECTOR.value):
+            logger.warning(
+                "Full-text search unavailable: collection missing 'sparse_vector' field; "
+                "recreate the collection after enabling MILVUS_ENABLE_HYBRID_SEARCH to add BM25 sparse index."
+            )
             return []
         document_ids_filter = kwargs.get("document_ids_filter")
         filter = ""

View File

@@ -15,6 +15,8 @@ from core.rag.embedding.embedding_base import Embeddings
 from core.rag.models.document import Document
 from models.dataset import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class MyScaleConfig(BaseModel):
     host: str
@@ -53,7 +55,7 @@ class MyScaleVector(BaseVector):
         return self.add_texts(documents=texts, embeddings=embeddings, **kwargs)
 
     def _create_collection(self, dimension: int):
-        logging.info("create MyScale collection %s with dimension %s", self._collection_name, dimension)
+        logger.info("create MyScale collection %s with dimension %s", self._collection_name, dimension)
         self._client.command(f"CREATE DATABASE IF NOT EXISTS {self._config.database}")
         fts_params = f"('{self._config.fts_params}')" if self._config.fts_params else ""
         sql = f"""
@@ -151,7 +153,7 @@ class MyScaleVector(BaseVector):
                 for r in self._client.query(sql).named_results()
             ]
         except Exception as e:
-            logging.exception("\033[91m\033[1m%s\033[0m \033[95m%s\033[0m", type(e), str(e))  # noqa:TRY401
+            logger.exception("\033[91m\033[1m%s\033[0m \033[95m%s\033[0m", type(e), str(e))  # noqa:TRY401
         return []
 
     def delete(self) -> None:
View File

@@ -152,7 +152,7 @@ class OceanBaseVector(BaseVector):
                 ob_full_version = result.fetchone()[0]
                 ob_version = ob_full_version.split()[1]
                 logger.debug("Current OceanBase version is %s", ob_version)
-                return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version
+                return version.parse(ob_version) >= version.parse("4.3.5.1")
         except Exception as e:
             logger.warning("Failed to check OceanBase version: %s. Disabling hybrid search.", str(e))
             return False

View File

@@ -188,14 +188,17 @@ class OracleVector(BaseVector):
     def text_exists(self, id: str) -> bool:
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"SELECT id FROM {self.table_name} WHERE id = '%s'" % (id,))
+                cur.execute(f"SELECT id FROM {self.table_name} WHERE id = :1", (id,))
                 return cur.fetchone() is not None
             conn.close()
 
     def get_by_ids(self, ids: list[str]) -> list[Document]:
         if not ids:
             return []
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"SELECT meta, text FROM {self.table_name} WHERE id IN %s", (tuple(ids),))
+                placeholders = ", ".join(f":{i + 1}" for i in range(len(ids)))
+                cur.execute(f"SELECT meta, text FROM {self.table_name} WHERE id IN ({placeholders})", ids)
                 docs = []
                 for record in cur:
                     docs.append(Document(page_content=record[1], metadata=record[0]))
@@ -208,14 +211,15 @@ class OracleVector(BaseVector):
             return
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"DELETE FROM {self.table_name} WHERE id IN %s" % (tuple(ids),))
+                placeholders = ", ".join(f":{i + 1}" for i in range(len(ids)))
+                cur.execute(f"DELETE FROM {self.table_name} WHERE id IN ({placeholders})", ids)
             conn.commit()
         conn.close()
 
     def delete_by_metadata_field(self, key: str, value: str) -> None:
         with self._get_connection() as conn:
             with conn.cursor() as cur:
-                cur.execute(f"DELETE FROM {self.table_name} WHERE meta->>%s = %s", (key, value))
+                cur.execute(f"DELETE FROM {self.table_name} WHERE JSON_VALUE(meta, '$." + key + "') = :1", (value,))
                 conn.commit()
         conn.close()
@@ -227,12 +231,20 @@ class OracleVector(BaseVector):
         :param top_k: The number of nearest neighbors to return, default is 5.
         :return: List of Documents that are nearest to the query vector.
         """
+        # Validate and sanitize top_k to prevent SQL injection
         top_k = kwargs.get("top_k", 4)
+        if not isinstance(top_k, int) or top_k <= 0 or top_k > 10000:
+            top_k = 4  # Use default if invalid
+
         document_ids_filter = kwargs.get("document_ids_filter")
         where_clause = ""
+        params = [numpy.array(query_vector)]
         if document_ids_filter:
-            document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
-            where_clause = f"WHERE metadata->>'document_id' in ({document_ids})"
+            placeholders = ", ".join(f":{i + 2}" for i in range(len(document_ids_filter)))
+            where_clause = f"WHERE JSON_VALUE(meta, '$.document_id') IN ({placeholders})"
+            params.extend(document_ids_filter)
+
         with self._get_connection() as conn:
             conn.inputtypehandler = self.input_type_handler
             conn.outputtypehandler = self.output_type_handler
@@ -241,7 +253,7 @@ class OracleVector(BaseVector):
                     f"""SELECT meta, text, vector_distance(embedding,(select to_vector(:1) from dual),cosine)
                     AS distance FROM {self.table_name}
                    {where_clause} ORDER BY distance fetch first {top_k} rows only""",
-                    [numpy.array(query_vector)],
+                    params,
                 )
                 docs = []
                 score_threshold = float(kwargs.get("score_threshold") or 0.0)
@@ -259,9 +271,11 @@ class OracleVector(BaseVector):
         import nltk  # type: ignore
         from nltk.corpus import stopwords  # type: ignore
 
+        # Validate and sanitize top_k to prevent SQL injection
         top_k = kwargs.get("top_k", 5)
+        if not isinstance(top_k, int) or top_k <= 0 or top_k > 10000:
+            top_k = 5  # Use default if invalid
         # just not implement fetch by score_threshold now, may be later
         score_threshold = float(kwargs.get("score_threshold") or 0.0)
         if len(query) > 0:
             # Check which language the query is in
             zh_pattern = re.compile("[\u4e00-\u9fa5]+")
@@ -297,14 +311,21 @@ class OracleVector(BaseVector):
             with conn.cursor() as cur:
                 document_ids_filter = kwargs.get("document_ids_filter")
                 where_clause = ""
+                params: dict[str, Any] = {"kk": " ACCUM ".join(entities)}
                 if document_ids_filter:
-                    document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
-                    where_clause = f" AND metadata->>'document_id' in ({document_ids}) "
+                    placeholders = []
+                    for i, doc_id in enumerate(document_ids_filter):
+                        param_name = f"doc_id_{i}"
+                        placeholders.append(f":{param_name}")
+                        params[param_name] = doc_id
+                    where_clause = f" AND JSON_VALUE(meta, '$.document_id') IN ({', '.join(placeholders)}) "
                 cur.execute(
                     f"""select meta, text, embedding FROM {self.table_name}
                     WHERE CONTAINS(text, :kk, 1) > 0 {where_clause}
                     order by score(1) desc fetch first {top_k} rows only""",
-                    kk=" ACCUM ".join(entities),
+                    params,
                 )
                 docs = []
                 for record in cur:
View File

@@ -19,6 +19,8 @@ from core.rag.models.document import Document
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class PGVectorConfig(BaseModel):
     host: str
@@ -155,7 +157,7 @@ class PGVector(BaseVector):
                     cur.execute(f"DELETE FROM {self.table_name} WHERE id IN %s", (tuple(ids),))
                 except psycopg2.errors.UndefinedTable:
                     # table not exists
-                    logging.warning("Table %s not found, skipping delete operation.", self.table_name)
+                    logger.warning("Table %s not found, skipping delete operation.", self.table_name)
                     return
                 except Exception as e:
                     raise e
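For contrast with the Oracle fix above, the unchanged psycopg2 call in this hunk is already parameterized: psycopg2 adapts a Python tuple passed to a single `%s` into a parenthesized, properly quoted SQL value list, which is why `IN %s` with `(tuple(ids),)` is safe here. A minimal sketch, assuming an open psycopg2 connection and a placeholder table name:

    def delete_by_ids(conn, table_name: str, ids: list[str]) -> None:
        if not ids:
            return  # an empty tuple would render as invalid SQL: "()"
        with conn.cursor() as cur:
            # psycopg2 expands the tuple to ('a', 'b', ...) with quoting applied
            cur.execute(f"DELETE FROM {table_name} WHERE id IN %s", (tuple(ids),))
        conn.commit()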

View File

@@ -17,6 +17,8 @@ from core.rag.models.document import Document
 from extensions.ext_redis import redis_client
 from models import Dataset
 
+logger = logging.getLogger(__name__)
+
 
 class TableStoreConfig(BaseModel):
     access_key_id: Optional[str] = None
@@ -145,7 +147,7 @@ class TableStoreVector(BaseVector):
         with redis_client.lock(lock_name, timeout=20):
             collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
             if redis_client.get(collection_exist_cache_key):
-                logging.info("Collection %s already exists.", self._collection_name)
+                logger.info("Collection %s already exists.", self._collection_name)
                 return
             self._create_table_if_not_exist()
@@ -155,7 +157,7 @@ class TableStoreVector(BaseVector):
     def _create_table_if_not_exist(self) -> None:
         table_list = self._tablestore_client.list_table()
         if self._table_name in table_list:
-            logging.info("Tablestore system table[%s] already exists", self._table_name)
+            logger.info("Tablestore system table[%s] already exists", self._table_name)
             return None
 
         schema_of_primary_key = [("id", "STRING")]
@@ -163,12 +165,12 @@ class TableStoreVector(BaseVector):
         table_options = tablestore.TableOptions()
         reserved_throughput = tablestore.ReservedThroughput(tablestore.CapacityUnit(0, 0))
         self._tablestore_client.create_table(table_meta, table_options, reserved_throughput)
-        logging.info("Tablestore create table[%s] successfully.", self._table_name)
+        logger.info("Tablestore create table[%s] successfully.", self._table_name)
 
     def _create_search_index_if_not_exist(self, dimension: int) -> None:
         search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name)
         if self._index_name in [t[1] for t in search_index_list]:
-            logging.info("Tablestore system index[%s] already exists", self._index_name)
+            logger.info("Tablestore system index[%s] already exists", self._index_name)
             return None
 
         field_schemas = [
@@ -206,20 +208,20 @@ class TableStoreVector(BaseVector):
         index_meta = tablestore.SearchIndexMeta(field_schemas)
         self._tablestore_client.create_search_index(self._table_name, self._index_name, index_meta)
-        logging.info("Tablestore create system index[%s] successfully.", self._index_name)
+        logger.info("Tablestore create system index[%s] successfully.", self._index_name)
 
     def _delete_table_if_exist(self):
         search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name)
         for resp_tuple in search_index_list:
             self._tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1])
-            logging.info("Tablestore delete index[%s] successfully.", self._index_name)
+            logger.info("Tablestore delete index[%s] successfully.", self._index_name)
         self._tablestore_client.delete_table(self._table_name)
-        logging.info("Tablestore delete system table[%s] successfully.", self._index_name)
+        logger.info("Tablestore delete system table[%s] successfully.", self._index_name)
 
     def _delete_search_index(self) -> None:
         self._tablestore_client.delete_search_index(self._table_name, self._index_name)
-        logging.info("Tablestore delete index[%s] successfully.", self._index_name)
+        logger.info("Tablestore delete index[%s] successfully.", self._index_name)
 
     def _write_row(self, primary_key: str, attributes: dict[str, Any]) -> None:
         pk = [("id", primary_key)]
View File

@@ -83,14 +83,14 @@ class TiDBVector(BaseVector):
         self._dimension = 1536
 
     def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
-        logger.info("create collection and add texts, collection_name: " + self._collection_name)
+        logger.info("create collection and add texts, collection_name: %s", self._collection_name)
         self._create_collection(len(embeddings[0]))
         self.add_texts(texts, embeddings)
         self._dimension = len(embeddings[0])
         pass
 
     def _create_collection(self, dimension: int):
-        logger.info("_create_collection, collection_name " + self._collection_name)
+        logger.info("_create_collection, collection_name %s", self._collection_name)
         lock_name = f"vector_indexing_lock_{self._collection_name}"
         with redis_client.lock(lock_name, timeout=20):
             collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
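The TiDB change above swaps string concatenation for `%s`-style arguments. Concatenation builds the message eagerly, even when the record is filtered out, and raises `TypeError` if the value is not a string; passing the value as an argument defers formatting until a handler actually emits the record. A minimal sketch:

    import logging

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.WARNING)

    name = "my_collection"
    # Concatenates immediately, although INFO is filtered out here:
    logger.info("create collection and add texts, collection_name: " + name)
    # Defers %-formatting; nothing is built unless the record is emitted:
    logger.info("create collection and add texts, collection_name: %s", name)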