mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-06 10:17:49 +08:00
fix: OceanBase metadata not returned in document list API (#13209)
### What problem does this PR solve?

Fix #13144.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -102,13 +102,13 @@ class DocMetadataService:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def _iter_search_results(cls, results):
|
def _iter_search_results(cls, results):
|
||||||
"""
|
"""
|
||||||
Iterate over search results in various formats (DataFrame, ES, list).
|
Iterate over search results in various formats (DataFrame, ES, OceanBase, list).
|
||||||
|
|
||||||
Yields:
|
Yields:
|
||||||
Tuple of (doc_id, doc_dict) for each document
|
Tuple of (doc_id, doc_dict) for each document
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
results: Search results from ES/Infinity in any format
|
results: Search results from ES/Infinity/OceanBase in any format
|
||||||
"""
|
"""
|
||||||
# Handle tuple return from Infinity: (DataFrame, int)
|
# Handle tuple return from Infinity: (DataFrame, int)
|
||||||
# Check this FIRST because pandas DataFrames also have __getitem__
|
# Check this FIRST because pandas DataFrames also have __getitem__
|
||||||
@@ -148,6 +148,14 @@ class DocMetadataService:
|
|||||||
if doc_id:
|
if doc_id:
|
||||||
yield doc_id, doc
|
yield doc_id, doc
|
||||||
|
|
||||||
|
# Check if OceanBase SearchResult format
|
||||||
|
elif hasattr(results, 'chunks') and hasattr(results, 'total'):
|
||||||
|
# OceanBase format: SearchResult(total=int, chunks=[{...}, {...}])
|
||||||
|
for doc in results.chunks:
|
||||||
|
doc_id = cls._extract_doc_id(doc)
|
||||||
|
if doc_id:
|
||||||
|
yield doc_id, doc
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _search_metadata(cls, kb_id: str, condition: Dict = None, limit: int = 10000):
|
def _search_metadata(cls, kb_id: str, condition: Dict = None, limit: int = 10000):
|
||||||
"""
|
"""
|
||||||
@@ -367,7 +375,7 @@ class DocMetadataService:
|
|||||||
logging.debug(f"[update_document_metadata] Updating doc_id: {doc_id}, kb_id: {kb_id}, meta_fields: {processed_meta}")
|
logging.debug(f"[update_document_metadata] Updating doc_id: {doc_id}, kb_id: {kb_id}, meta_fields: {processed_meta}")
|
||||||
|
|
||||||
# For Elasticsearch, use efficient partial update
|
# For Elasticsearch, use efficient partial update
|
||||||
if not settings.DOC_ENGINE_INFINITY:
|
if not settings.DOC_ENGINE_INFINITY and not settings.DOC_ENGINE_OCEANBASE:
|
||||||
try:
|
try:
|
||||||
# Use ES partial update API - much more efficient than delete+insert
|
# Use ES partial update API - much more efficient than delete+insert
|
||||||
settings.docStoreConn.es.update(
|
settings.docStoreConn.es.update(
|
||||||
|
|||||||
@@ -24,7 +24,8 @@ from typing import Any
|
|||||||
|
|
||||||
from pymysql.converters import escape_string
|
from pymysql.converters import escape_string
|
||||||
from pyobvector import ObVecClient, FtsIndexParam, FtsParser, VECTOR
|
from pyobvector import ObVecClient, FtsIndexParam, FtsParser, VECTOR
|
||||||
from sqlalchemy import Column, Table
|
from sqlalchemy import Column, JSON, Table
|
||||||
|
from sqlalchemy.dialects.mysql import VARCHAR
|
||||||
|
|
||||||
from common.doc_store.doc_store_base import DocStoreConnection, MatchExpr, OrderByExpr
|
from common.doc_store.doc_store_base import DocStoreConnection, MatchExpr, OrderByExpr
|
||||||
|
|
||||||
@@ -37,6 +38,15 @@ fulltext_search_template = "MATCH (%s) AGAINST ('%s' IN NATURAL LANGUAGE MODE)"
|
|||||||
vector_search_template = "cosine_distance(%s, '%s')"
|
vector_search_template = "cosine_distance(%s, '%s')"
|
||||||
vector_column_pattern = re.compile(r"q_(?P<vector_size>\d+)_vec")
|
vector_column_pattern = re.compile(r"q_(?P<vector_size>\d+)_vec")
|
||||||
|
|
||||||
|
# Document metadata table columns
|
||||||
|
doc_meta_columns = [
|
||||||
|
Column("id", VARCHAR(256), primary_key=True, comment="document id"),
|
||||||
|
Column("kb_id", VARCHAR(256), nullable=False, comment="knowledge base id"),
|
||||||
|
Column("meta_fields", JSON, nullable=True, comment="document metadata fields"),
|
||||||
|
]
|
||||||
|
doc_meta_column_names = [col.name for col in doc_meta_columns]
|
||||||
|
doc_meta_column_types = {col.name: col.type for col in doc_meta_columns}
|
||||||
|
|
||||||
|
|
||||||
def get_value_str(value: Any) -> str:
|
def get_value_str(value: Any) -> str:
|
||||||
"""Convert value to SQL string representation."""
|
"""Convert value to SQL string representation."""
|
||||||
@@ -266,19 +276,9 @@ class OBConnectionBase(DocStoreConnection):
|
|||||||
Table name pattern: ragflow_doc_meta_{tenant_id}
|
Table name pattern: ragflow_doc_meta_{tenant_id}
|
||||||
- Per-tenant metadata table for storing document metadata fields
|
- Per-tenant metadata table for storing document metadata fields
|
||||||
"""
|
"""
|
||||||
from sqlalchemy import JSON
|
|
||||||
from sqlalchemy.dialects.mysql import VARCHAR
|
|
||||||
|
|
||||||
table_name = index_name
|
table_name = index_name
|
||||||
lock_prefix = self.get_lock_prefix()
|
lock_prefix = self.get_lock_prefix()
|
||||||
|
|
||||||
# Define columns for document metadata table
|
|
||||||
doc_meta_columns = [
|
|
||||||
Column("id", VARCHAR(256), primary_key=True, comment="document id"),
|
|
||||||
Column("kb_id", VARCHAR(256), nullable=False, comment="knowledge base id"),
|
|
||||||
Column("meta_fields", JSON, nullable=True, comment="document metadata fields"),
|
|
||||||
]
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Create table with distributed lock
|
# Create table with distributed lock
|
||||||
_try_with_lock(
|
_try_with_lock(
|
||||||
@@ -319,11 +319,17 @@ class OBConnectionBase(DocStoreConnection):
|
|||||||
|
|
||||||
def index_exist(self, index_name: str, dataset_id: str = None) -> bool:
|
def index_exist(self, index_name: str, dataset_id: str = None) -> bool:
|
||||||
"""Check if index/table exists."""
|
"""Check if index/table exists."""
|
||||||
# For doc_meta tables, use index_name directly as table name
|
# For doc_meta tables, use index_name directly and only check table existence
|
||||||
|
# (metadata tables don't have fulltext/vector indexes that chunk tables have)
|
||||||
if index_name.startswith("ragflow_doc_meta_"):
|
if index_name.startswith("ragflow_doc_meta_"):
|
||||||
table_name = index_name
|
if index_name in self._table_exists_cache:
|
||||||
else:
|
return True
|
||||||
table_name = self.get_table_name(index_name, dataset_id) if dataset_id else index_name
|
if not self.client.check_table_exists(index_name):
|
||||||
|
return False
|
||||||
|
with self._table_exists_cache_lock:
|
||||||
|
self._table_exists_cache.add(index_name)
|
||||||
|
return True
|
||||||
|
table_name = self.get_table_name(index_name, dataset_id) if dataset_id else index_name
|
||||||
return self._check_table_exists_cached(table_name)
|
return self._check_table_exists_cached(table_name)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ from common.doc_store.doc_store_base import MatchExpr, OrderByExpr, FusionExpr,
|
|||||||
from common.doc_store.ob_conn_base import (
|
from common.doc_store.ob_conn_base import (
|
||||||
OBConnectionBase, get_value_str,
|
OBConnectionBase, get_value_str,
|
||||||
vector_search_template, vector_column_pattern,
|
vector_search_template, vector_column_pattern,
|
||||||
fulltext_index_name_template,
|
fulltext_index_name_template, doc_meta_column_names,
|
||||||
|
doc_meta_column_types,
|
||||||
)
|
)
|
||||||
from common.float_utils import get_float
|
from common.float_utils import get_float
|
||||||
from rag.nlp import rag_tokenizer
|
from rag.nlp import rag_tokenizer
|
||||||
@@ -135,8 +136,9 @@ class SearchResult(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
def get_column_value(column_name: str, value: Any) -> Any:
|
def get_column_value(column_name: str, value: Any) -> Any:
|
||||||
if column_name in column_types:
|
# Check chunk table columns first, then doc_meta table columns
|
||||||
column_type = column_types[column_name]
|
column_type = column_types.get(column_name) or doc_meta_column_types.get(column_name)
|
||||||
|
if column_type:
|
||||||
if isinstance(column_type, String):
|
if isinstance(column_type, String):
|
||||||
return str(value)
|
return str(value)
|
||||||
elif isinstance(column_type, Integer):
|
elif isinstance(column_type, Integer):
|
||||||
@@ -658,6 +660,12 @@ class OBConnection(OBConnectionBase):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
output_fields = select_fields.copy()
|
output_fields = select_fields.copy()
|
||||||
|
if "*" in output_fields:
|
||||||
|
if index_names[0].startswith("ragflow_doc_meta_"):
|
||||||
|
output_fields = doc_meta_column_names.copy()
|
||||||
|
else:
|
||||||
|
output_fields = column_names.copy()
|
||||||
|
|
||||||
if "id" not in output_fields:
|
if "id" not in output_fields:
|
||||||
output_fields = ["id"] + output_fields
|
output_fields = ["id"] + output_fields
|
||||||
if "_score" in output_fields:
|
if "_score" in output_fields:
|
||||||
|
|||||||
Reference in New Issue
Block a user