mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-28 03:33:05 +08:00
### What problem does this PR solve? Fixes #14570. On OpenSearch backends (`DOC_ENGINE=opensearch`) every document-metadata write failed with `'OSConnection' object has no attribute 'create_doc_meta_idx'`, so both `PATCH /api/v1/datasets/{ds}/documents/{doc}` with `meta_fields` and `POST /api/v1/datasets/{ds}/metadata/update` were unusable while every other document operation (retrieval, parsing, name update, chunk management) worked correctly on the same OpenSearch cluster. The bug runs deeper than the missing method name in the error message suggests. `DocMetadataService` also reached into `settings.docStoreConn.es.*` directly for the index refresh, the scripted partial update, and the count call, which means that even after adding `create_doc_meta_idx` to `OSConnection` the very next call in the same metadata flow would still raise `AttributeError` because `OSConnection` exposes `self.os` rather than `self.es`. Fixing only the reported symptom would have moved the failure one line down without restoring the feature. This PR adds a uniform document-metadata dispatch surface to both connection classes so they present the same abstract API, and routes the service layer through that surface via `getattr` guards instead of poking at backend-specific attributes. The four new methods on `OSConnection` and `ESConnectionBase` are `create_doc_meta_idx`, `refresh_idx`, `count_idx`, and `replace_meta_fields`. `OSConnection.create_doc_meta_idx` reuses the existing `conf/doc_meta_es_mapping.json` schema in the OpenSearch `body=` form because OpenSearch and Elasticsearch share the same index-creation payload, and `replace_meta_fields` emits a full scripted assignment (`ctx._source.meta_fields = params.meta_fields`) on both backends so removed keys actually disappear instead of being preserved by deep-merge semantics. The `getattr`-guarded dispatch in `DocMetadataService` keeps the existing fall-through paths intact for Infinity and OceanBase, which continue to rely on their search-based count fallback and on the delete-then-insert metadata replacement they used before, so this change is strictly additive for those two backends. Verification: `pytest test/unit_test/rag/utils/test_opensearch_doc_meta.py` runs 16 new unit tests that pass locally and pin the `OSConnection` dispatch surface, the `create_doc_meta_idx` short-circuit when the index already exists, the mapping-file payload routing, the `IndicesClient.create` failure path, the `refresh_idx` and `count_idx` success and error sentinels, and the full-assignment script emitted by `replace_meta_fields`. The test module stubs `common.settings` and `rag.nlp` at import time so the suite runs without the heavy backend SDKs that the rest of the repository pulls in transitively. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: tmimmanuel <tmimmanuel@users.noreply.github.com>
1240 lines
50 KiB
Python
1240 lines
50 KiB
Python
#
|
||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
#
|
||
"""
|
||
Document Metadata Service
|
||
|
||
Manages document-level metadata storage in ES/Infinity.
|
||
This is the SOLE source of truth for document metadata - MySQL meta_fields column has been removed.
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import re
|
||
from copy import deepcopy
|
||
from typing import Dict, List, Optional
|
||
|
||
from api.db.db_models import DB, Document
|
||
from common import settings
|
||
from common.metadata_utils import dedupe_list
|
||
from api.db.db_models import Knowledgebase
|
||
from common.doc_store.doc_store_base import OrderByExpr
|
||
|
||
|
||
class DocMetadataService:
|
||
"""Service for managing document metadata in ES/Infinity"""
|
||
|
||
@staticmethod
|
||
def _get_doc_meta_index_name(tenant_id: str) -> str:
|
||
"""
|
||
Get the index name for document metadata.
|
||
|
||
Args:
|
||
tenant_id: Tenant ID
|
||
|
||
Returns:
|
||
Index name for document metadata
|
||
"""
|
||
return f"ragflow_doc_meta_{tenant_id}"
|
||
|
||
@staticmethod
|
||
def _extract_metadata(flat_meta: Dict) -> Dict:
|
||
"""
|
||
Extract metadata from ES/Infinity document format.
|
||
|
||
Args:
|
||
flat_meta: Raw document from ES/Infinity with meta_fields field
|
||
|
||
Returns:
|
||
Simple metadata dictionary
|
||
"""
|
||
if not flat_meta or not isinstance(flat_meta, dict):
|
||
return {}
|
||
|
||
meta_fields = flat_meta.get('meta_fields')
|
||
if not meta_fields:
|
||
return {}
|
||
|
||
# Parse JSON string if needed
|
||
if isinstance(meta_fields, str):
|
||
import json
|
||
try:
|
||
return json.loads(meta_fields)
|
||
except json.JSONDecodeError:
|
||
return {}
|
||
|
||
# Already a dict, return as-is
|
||
if isinstance(meta_fields, dict):
|
||
return meta_fields
|
||
|
||
return {}
|
||
|
||
@staticmethod
|
||
def _extract_doc_id(doc: Dict, hit: Dict = None) -> str:
|
||
"""
|
||
Extract document ID from various formats.
|
||
|
||
Args:
|
||
doc: Document dictionary (from DataFrame or list format)
|
||
hit: Hit dictionary (from ES format with _id field)
|
||
|
||
Returns:
|
||
Document ID or empty string
|
||
"""
|
||
if hit:
|
||
# ES format: doc is in _source, id is in _id
|
||
return hit.get('_id', '')
|
||
# DataFrame or list format: check multiple possible fields
|
||
return doc.get("doc_id") or doc.get("_id") or doc.get("id", "")
|
||
|
||
@classmethod
|
||
def _iter_search_results(cls, results):
|
||
"""
|
||
Iterate over search results in various formats (DataFrame, ES, OceanBase, list).
|
||
|
||
Yields:
|
||
Tuple of (doc_id, doc_dict) for each document
|
||
|
||
Args:
|
||
results: Search results from ES/Infinity/OceanBase in any format
|
||
"""
|
||
# Handle tuple return from Infinity: (DataFrame, int)
|
||
# Check this FIRST because pandas DataFrames also have __getitem__
|
||
if isinstance(results, tuple) and len(results) == 2:
|
||
results = results[0] # Extract DataFrame from tuple
|
||
|
||
# Check if results is a pandas DataFrame (from Infinity)
|
||
if hasattr(results, 'iterrows'):
|
||
# Handle pandas DataFrame - use iterrows() to iterate over rows
|
||
for _, row in results.iterrows():
|
||
doc = dict(row) # Convert Series to dict
|
||
doc_id = cls._extract_doc_id(doc)
|
||
if doc_id:
|
||
yield doc_id, doc
|
||
|
||
# Check if ES format (has 'hits' key)
|
||
# Note: ES returns ObjectApiResponse which is dict-like but not isinstance(dict)
|
||
elif hasattr(results, 'get') and 'hits' in results:
|
||
# ES format: {"hits": {"hits": [{"_source": {...}, "_id": "..."}]}}
|
||
hits = results.get('hits', {}).get('hits', [])
|
||
for hit in hits:
|
||
doc = hit.get('_source', {})
|
||
doc_id = cls._extract_doc_id(doc, hit)
|
||
if doc_id:
|
||
yield doc_id, doc
|
||
|
||
# Handle list of dicts or other formats
|
||
elif isinstance(results, list):
|
||
for res in results:
|
||
if isinstance(res, dict):
|
||
docs = [res]
|
||
else:
|
||
docs = res
|
||
|
||
for doc in docs:
|
||
doc_id = cls._extract_doc_id(doc)
|
||
if doc_id:
|
||
yield doc_id, doc
|
||
|
||
# Check if OceanBase SearchResult format
|
||
elif hasattr(results, 'chunks') and hasattr(results, 'total'):
|
||
# OceanBase format: SearchResult(total=int, chunks=[{...}, {...}])
|
||
for doc in results.chunks:
|
||
doc_id = cls._extract_doc_id(doc)
|
||
if doc_id:
|
||
yield doc_id, doc
|
||
|
||
@classmethod
|
||
def _search_metadata(cls, kb_id: str, condition: Dict = None):
|
||
"""
|
||
Common search logic for metadata queries.
|
||
Uses pagination internally to retrieve data from the index.
|
||
|
||
Args:
|
||
kb_id: Knowledge base ID
|
||
condition: Optional search condition (defaults to {"kb_id": kb_id})
|
||
|
||
Returns:
|
||
Search results from ES/Infinity, or empty list if index doesn't exist
|
||
"""
|
||
kb = Knowledgebase.get_by_id(kb_id)
|
||
if not kb:
|
||
return []
|
||
|
||
tenant_id = kb.tenant_id
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
# Check if metadata index exists, create if it doesn't
|
||
if not settings.docStoreConn.index_exist(index_name, ""):
|
||
logging.debug(f"Metadata index {index_name} does not exist, creating it")
|
||
result = settings.docStoreConn.create_doc_meta_idx(index_name)
|
||
if result is False:
|
||
logging.error(f"Failed to create metadata index {index_name}")
|
||
return []
|
||
logging.debug(f"Successfully created metadata index {index_name}")
|
||
|
||
if condition is None:
|
||
condition = {"kb_id": kb_id}
|
||
|
||
# Add sort by id for ES to enable search_after on large data
|
||
order_by = OrderByExpr()
|
||
if not settings.DOC_ENGINE_INFINITY:
|
||
order_by.asc("id")
|
||
|
||
page_size = 1000
|
||
all_results = []
|
||
page = 0
|
||
|
||
while True:
|
||
results = settings.docStoreConn.search(
|
||
select_fields=["*"],
|
||
highlight_fields=[],
|
||
condition=condition,
|
||
match_expressions=[],
|
||
order_by=order_by,
|
||
offset=page * page_size,
|
||
limit=page_size,
|
||
index_names=index_name,
|
||
knowledgebase_ids=[kb_id]
|
||
)
|
||
|
||
# Handle different result formats
|
||
if results is None:
|
||
break
|
||
|
||
# Extract docs from results
|
||
page_docs = []
|
||
total_count = None # Used for Infinity to determine if more results exist
|
||
|
||
# Check for Infinity format first (DataFrame, total) tuple
|
||
if isinstance(results, tuple) and len(results) == 2:
|
||
df, total_count = results
|
||
if hasattr(df, 'iterrows'):
|
||
# Pandas DataFrame from Infinity
|
||
page_docs = df.to_dict('records')
|
||
else:
|
||
page_docs = list(df) if df else []
|
||
# Check for ES format (dict with 'hits' key)
|
||
elif hasattr(results, 'get') and 'hits' in results:
|
||
hits_obj = results.get('hits', {})
|
||
hits = hits_obj.get('hits', [])
|
||
page_docs = []
|
||
for hit in hits:
|
||
doc = hit.get('_source', {})
|
||
doc['id'] = hit.get('_id', '') # Add _id as 'id' for _extract_doc_id to work
|
||
page_docs.append(doc)
|
||
# Extract total count from ES response
|
||
total_hits = hits_obj.get('total', {})
|
||
if isinstance(total_hits, dict):
|
||
total_count = total_hits.get('value', len(page_docs))
|
||
else:
|
||
total_count = total_hits if total_hits else len(page_docs)
|
||
# Handle list/iterable results
|
||
elif hasattr(results, '__iter__') and not isinstance(results, dict):
|
||
page_docs = list(results)
|
||
else:
|
||
page_docs = []
|
||
|
||
if not page_docs:
|
||
break
|
||
|
||
all_results.extend(page_docs)
|
||
page += 1
|
||
|
||
# Determine if there are more results to fetch
|
||
# For Infinity: use total_count if available
|
||
if total_count is not None:
|
||
if len(all_results) >= total_count:
|
||
break
|
||
else:
|
||
# For ES or other: check if we got fewer than page_size
|
||
if len(page_docs) < page_size:
|
||
break
|
||
|
||
logging.debug(f"[_search_metadata] Retrieved {len(all_results)} total results for kb_id: {kb_id}")
|
||
return all_results
|
||
|
||
@classmethod
|
||
def _split_combined_values(cls, meta_fields: Dict) -> Dict:
|
||
"""
|
||
Post-process metadata to split combined values by common delimiters.
|
||
|
||
For example: "关羽、孙权、张辽" -> ["关羽", "孙权", "张辽"]
|
||
This fixes LLM extraction where multiple values are extracted as one combined value.
|
||
Also removes duplicates after splitting.
|
||
|
||
Args:
|
||
meta_fields: Metadata dictionary
|
||
|
||
Returns:
|
||
Processed metadata with split values
|
||
"""
|
||
if not meta_fields or not isinstance(meta_fields, dict):
|
||
return meta_fields
|
||
|
||
processed = {}
|
||
for key, value in meta_fields.items():
|
||
if isinstance(value, list):
|
||
# Process each item in the list
|
||
new_values = []
|
||
for item in value:
|
||
if isinstance(item, str):
|
||
# Split by common delimiters: Chinese comma (、), regular comma (,), pipe (|), semicolon (;), Chinese semicolon (;)
|
||
# Also handle mixed delimiters and spaces
|
||
split_items = re.split(r'[、,,;;|]+', item.strip())
|
||
# Trim whitespace and filter empty strings
|
||
split_items = [s.strip() for s in split_items if s.strip()]
|
||
if split_items:
|
||
new_values.extend(split_items)
|
||
else:
|
||
# Keep original if no split happened
|
||
new_values.append(item)
|
||
else:
|
||
new_values.append(item)
|
||
# Remove duplicates while preserving order.
|
||
# Use string-based dedupe to support unhashable values (e.g. dict entries).
|
||
processed[key] = dedupe_list(new_values)
|
||
else:
|
||
processed[key] = value
|
||
|
||
if processed != meta_fields:
|
||
logging.debug(f"[METADATA SPLIT] Split combined values: {meta_fields} -> {processed}")
|
||
return processed
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def insert_document_metadata(cls, doc_id: str, meta_fields: Dict) -> bool:
|
||
"""
|
||
Insert document metadata into ES/Infinity.
|
||
|
||
Args:
|
||
doc_id: Document ID
|
||
meta_fields: Metadata dictionary
|
||
|
||
Returns:
|
||
True if successful, False otherwise
|
||
"""
|
||
try:
|
||
# Get document with tenant_id (need to join with Knowledgebase)
|
||
doc_query = Document.select(Document, Knowledgebase.tenant_id).join(
|
||
Knowledgebase, on=(Knowledgebase.id == Document.kb_id)
|
||
).where(Document.id == doc_id)
|
||
|
||
doc = doc_query.first()
|
||
if not doc:
|
||
logging.warning(f"Document {doc_id} not found for metadata insertion")
|
||
return False
|
||
|
||
# Extract document fields
|
||
doc_obj = doc # This is the Document object
|
||
tenant_id = doc.knowledgebase.tenant_id # Get tenant_id from joined Knowledgebase
|
||
kb_id = doc_obj.kb_id
|
||
|
||
# Prepare metadata document
|
||
doc_meta = {
|
||
"id": doc_obj.id,
|
||
"kb_id": kb_id,
|
||
}
|
||
|
||
# Store metadata as JSON object in meta_fields column (same as MySQL structure)
|
||
if meta_fields:
|
||
# Post-process to split combined values by common delimiters
|
||
meta_fields = cls._split_combined_values(meta_fields)
|
||
doc_meta["meta_fields"] = meta_fields
|
||
else:
|
||
doc_meta["meta_fields"] = {}
|
||
|
||
# Ensure index/table exists (per-tenant for both ES and Infinity)
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
# Check if table exists
|
||
table_exists = settings.docStoreConn.index_exist(index_name, kb_id)
|
||
logging.debug(f"Metadata table exists check: {index_name} -> {table_exists}")
|
||
|
||
# Create index if it doesn't exist
|
||
if not table_exists:
|
||
logging.debug(f"Creating metadata table: {index_name}")
|
||
# Both ES and Infinity now use per-tenant metadata tables
|
||
result = settings.docStoreConn.create_doc_meta_idx(index_name)
|
||
logging.debug(f"Table creation result: {result}")
|
||
if result is False:
|
||
logging.error(f"Failed to create metadata table {index_name}")
|
||
return False
|
||
else:
|
||
logging.debug(f"Metadata table already exists: {index_name}")
|
||
|
||
# Insert into ES/Infinity
|
||
result = settings.docStoreConn.insert(
|
||
[doc_meta],
|
||
index_name,
|
||
kb_id
|
||
)
|
||
|
||
if result:
|
||
logging.error(f"Failed to insert metadata for document {doc_id}: {result}")
|
||
return False
|
||
# Force refresh so metadata is immediately searchable.
|
||
# Both Elasticsearch and OpenSearch backends expose refresh_idx;
|
||
# Infinity does not need a manual refresh.
|
||
if not settings.DOC_ENGINE_INFINITY:
|
||
refresh_idx = getattr(settings.docStoreConn, "refresh_idx", None)
|
||
if callable(refresh_idx):
|
||
if refresh_idx(index_name):
|
||
logging.debug(f"Refreshed metadata index: {index_name}")
|
||
else:
|
||
# A failed refresh can leave just-inserted metadata
|
||
# invisible to subsequent reads; surface it so operators
|
||
# can correlate stale-read complaints with the cause.
|
||
logging.warning(
|
||
f"Failed to refresh metadata index {index_name} on backend "
|
||
f"{type(settings.docStoreConn).__name__}; "
|
||
f"metadata may not be immediately searchable"
|
||
)
|
||
else:
|
||
logging.debug(f"Backend {type(settings.docStoreConn).__name__} has no refresh_idx; skipping")
|
||
|
||
logging.debug(f"Successfully inserted metadata for document {doc_id}")
|
||
return True
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error inserting metadata for document {doc_id}: {e}")
|
||
return False
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def update_document_metadata(cls, doc_id: str, meta_fields: Dict) -> bool:
|
||
"""
|
||
Update document metadata in ES/Infinity.
|
||
|
||
For Elasticsearch: Uses partial update to directly update the meta_fields field.
|
||
For Infinity: Falls back to delete+insert (Infinity doesn't support partial updates well).
|
||
|
||
Args:
|
||
doc_id: Document ID
|
||
meta_fields: Metadata dictionary
|
||
|
||
Returns:
|
||
True if successful, False otherwise
|
||
"""
|
||
try:
|
||
# Get document with tenant_id
|
||
doc_query = Document.select(Document, Knowledgebase.tenant_id).join(
|
||
Knowledgebase, on=(Knowledgebase.id == Document.kb_id)
|
||
).where(Document.id == doc_id)
|
||
|
||
doc = doc_query.first()
|
||
if not doc:
|
||
logging.warning(f"Document {doc_id} not found for metadata update")
|
||
return False
|
||
|
||
# Extract fields
|
||
doc_obj = doc
|
||
tenant_id = doc.knowledgebase.tenant_id
|
||
kb_id = doc_obj.kb_id
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
# Post-process to split combined values
|
||
processed_meta = cls._split_combined_values(meta_fields)
|
||
|
||
logging.debug(f"[update_document_metadata] Updating doc_id: {doc_id}, kb_id: {kb_id}, meta_fields: {processed_meta}")
|
||
|
||
# For Elasticsearch, use efficient partial update
|
||
if not settings.DOC_ENGINE_INFINITY and not settings.DOC_ENGINE_OCEANBASE:
|
||
# Check if index exists first
|
||
index_exists = settings.docStoreConn.index_exist(index_name, "")
|
||
if not index_exists:
|
||
# Index doesn't exist - create it and insert directly
|
||
logging.debug(f"[update_document_metadata] Index {index_name} does not exist, creating and inserting")
|
||
result = settings.docStoreConn.create_doc_meta_idx(index_name)
|
||
if result is False:
|
||
logging.error(f"Failed to create metadata index {index_name}")
|
||
return False
|
||
return cls.insert_document_metadata(doc_id, processed_meta)
|
||
|
||
# Index exists - check if document exists
|
||
try:
|
||
doc_exists = settings.docStoreConn.get(
|
||
doc_id,
|
||
index_name,
|
||
[kb_id]
|
||
)
|
||
if doc_exists:
|
||
# Document exists - replace meta_fields entirely.
|
||
# Using update with a `doc` body would deep-merge the meta_fields
|
||
# object and retain old keys that should be removed, so we delegate
|
||
# to a backend-provided scripted assignment that fully overwrites it.
|
||
replace_meta_fields = getattr(settings.docStoreConn, "replace_meta_fields", None)
|
||
if callable(replace_meta_fields) and replace_meta_fields(index_name, doc_id, processed_meta):
|
||
logging.debug(f"Successfully updated metadata for document {doc_id} via {type(settings.docStoreConn).__name__}.replace_meta_fields")
|
||
return True
|
||
logging.warning(
|
||
f"replace_meta_fields unavailable or failed on backend "
|
||
f"{type(settings.docStoreConn).__name__}; falling back to delete+insert"
|
||
)
|
||
# Mirror the Infinity fallback below so a failed scripted
|
||
# replace still guarantees full overwrite semantics rather
|
||
# than leaking through the "document not found" branch.
|
||
cls.delete_document_metadata(doc_id, kb_id, tenant_id)
|
||
return cls.insert_document_metadata(doc_id, processed_meta)
|
||
except Exception as e:
|
||
logging.debug(f"Document {doc_id} not found in index, will insert: {e}")
|
||
|
||
# Document doesn't exist - insert new
|
||
logging.debug(f"[update_document_metadata] Document {doc_id} not found, inserting new")
|
||
return cls.insert_document_metadata(doc_id, processed_meta)
|
||
|
||
# For Infinity or as fallback: use delete+insert
|
||
logging.debug(f"[update_document_metadata] Using delete+insert method for doc_id: {doc_id}")
|
||
cls.delete_document_metadata(doc_id, kb_id, tenant_id)
|
||
return cls.insert_document_metadata(doc_id, processed_meta)
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error updating metadata for document {doc_id}: {e}")
|
||
return False
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def delete_document_metadata(cls, doc_id: str, kb_id: str, tenant_id: str = None) -> bool:
|
||
"""
|
||
Delete document metadata from ES/Infinity.
|
||
Also drops the metadata table if it becomes empty (efficiently).
|
||
If document has no metadata in the table, this is a no-op.
|
||
|
||
Args:
|
||
doc_id: Document ID
|
||
kb_id: Knowledge base ID
|
||
tenant_id: Tenant ID, if not provided, get it from kb_id
|
||
|
||
Returns:
|
||
True if successful (or no metadata to delete), False otherwise
|
||
"""
|
||
try:
|
||
logging.debug(f"[METADATA DELETE] Starting metadata deletion for document: {doc_id}")
|
||
|
||
# Get tenant_id from kb_id if not provided
|
||
if tenant_id is None:
|
||
kb = Knowledgebase.get_or_none(Knowledgebase.id == kb_id)
|
||
if not kb:
|
||
logging.warning(f"Knowledgebase {kb_id} not found for metadata deletion")
|
||
return False
|
||
tenant_id = kb.tenant_id
|
||
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
logging.debug(f"[delete_document_metadata] Deleting doc_id: {doc_id}, kb_id: {kb_id}, index: {index_name}")
|
||
|
||
# Check if metadata table exists before attempting deletion
|
||
# This is the key optimization - no table = no metadata = nothing to delete
|
||
if not settings.docStoreConn.index_exist(index_name, ""):
|
||
logging.debug(f"Metadata table {index_name} does not exist, skipping metadata deletion for document {doc_id}")
|
||
return True # No metadata to delete is considered success
|
||
|
||
# Try to get the metadata to confirm it exists before deleting
|
||
# This is more efficient than attempting delete on non-existent records
|
||
try:
|
||
existing_metadata = settings.docStoreConn.get(
|
||
doc_id,
|
||
index_name,
|
||
[""] # Empty list for metadata tables
|
||
)
|
||
logging.debug(f"[METADATA DELETE] Get result: {existing_metadata is not None}")
|
||
if not existing_metadata:
|
||
logging.debug(f"[METADATA DELETE] Document {doc_id} has no metadata in table, skipping deletion")
|
||
return True # No metadata to delete is success
|
||
except Exception as e:
|
||
# If get fails, document might not exist in metadata table, which is fine
|
||
logging.error(f"[METADATA DELETE] Get failed: {e}")
|
||
# Continue to check/drop table if needed
|
||
|
||
# Delete from ES/Infinity (only if metadata exists)
|
||
# For metadata tables, pass kb_id for the delete operation
|
||
# The delete() method will detect it's a metadata table and skip the kb_id filter
|
||
logging.debug(f"[METADATA DELETE] Deleting metadata with condition: {{'id': '{doc_id}'}}")
|
||
deleted_count = settings.docStoreConn.delete(
|
||
{"id": doc_id},
|
||
index_name,
|
||
kb_id # Pass actual kb_id (delete() will handle metadata tables correctly)
|
||
)
|
||
logging.debug(f"[METADATA DELETE] Deleted count: {deleted_count}")
|
||
return True
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error deleting metadata for document {doc_id}: {e}")
|
||
return False
|
||
|
||
@classmethod
|
||
def _drop_empty_metadata_table(cls, index_name: str, tenant_id: str) -> None:
|
||
"""
|
||
Check if metadata table is empty and drop it if so.
|
||
Uses optimized count query instead of full search.
|
||
This prevents accumulation of empty metadata tables.
|
||
|
||
Args:
|
||
index_name: Metadata table/index name
|
||
tenant_id: Tenant ID
|
||
"""
|
||
try:
|
||
logging.debug(f"[DROP EMPTY TABLE] Starting empty table check for: {index_name}")
|
||
|
||
# Check if table exists first (cheap operation)
|
||
if not settings.docStoreConn.index_exist(index_name, ""):
|
||
logging.debug(f"[DROP EMPTY TABLE] Metadata table {index_name} does not exist, skipping")
|
||
return
|
||
|
||
logging.debug(f"[DROP EMPTY TABLE] Table {index_name} exists, checking if empty...")
|
||
|
||
# Use the backend-native count primitive when available (ES + OS).
|
||
# No need to refresh since delete operation already uses refresh=True.
|
||
# The invocation lives inside the try/except so a future backend
|
||
# whose count_idx raises (instead of returning the -1 sentinel)
|
||
# still falls through to the search-based empty-table check.
|
||
count_idx = getattr(settings.docStoreConn, "count_idx", None)
|
||
try:
|
||
count_value = count_idx(index_name) if callable(count_idx) else -1
|
||
if count_value < 0:
|
||
raise RuntimeError("native count_idx unavailable or failed")
|
||
logging.debug(f"[DROP EMPTY TABLE] count_idx API result: {count_value} documents")
|
||
is_empty = (count_value == 0)
|
||
except Exception as e:
|
||
logging.warning(f"[DROP EMPTY TABLE] Count API failed, falling back to search: {e}")
|
||
# Fallback to search if count fails
|
||
results = settings.docStoreConn.search(
|
||
select_fields=["id"],
|
||
highlight_fields=[],
|
||
condition={},
|
||
match_expressions=[],
|
||
order_by=OrderByExpr(),
|
||
offset=0,
|
||
limit=1, # Only need 1 result to know if table is non-empty
|
||
index_names=index_name,
|
||
knowledgebase_ids=[""] # Metadata tables don't filter by KB
|
||
)
|
||
|
||
logging.debug(f"[DROP EMPTY TABLE] Search results type: {type(results)}, results: {results}")
|
||
|
||
# Check if empty based on return type (fallback search only)
|
||
if isinstance(results, tuple) and len(results) == 2:
|
||
# Infinity returns (DataFrame, int)
|
||
df, total = results
|
||
logging.debug(f"[DROP EMPTY TABLE] Infinity format - total: {total}, df length: {len(df) if hasattr(df, '__len__') else 'N/A'}")
|
||
is_empty = (total == 0 or (hasattr(df, '__len__') and len(df) == 0))
|
||
elif hasattr(results, 'get') and 'hits' in results:
|
||
# ES format - MUST check this before hasattr(results, '__len__')
|
||
# because ES response objects also have __len__
|
||
total = results.get('hits', {}).get('total', {})
|
||
hits = results.get('hits', {}).get('hits', [])
|
||
|
||
# ES 7.x+: total is a dict like {'value': 0, 'relation': 'eq'}
|
||
# ES 6.x: total is an int
|
||
if isinstance(total, dict):
|
||
total_count = total.get('value', 0)
|
||
else:
|
||
total_count = total
|
||
|
||
logging.debug(f"[DROP EMPTY TABLE] ES format - total: {total_count}, hits count: {len(hits)}")
|
||
is_empty = (total_count == 0 or len(hits) == 0)
|
||
elif hasattr(results, '__len__'):
|
||
# DataFrame or list (check this AFTER ES format)
|
||
result_len = len(results)
|
||
logging.debug(f"[DROP EMPTY TABLE] List/DataFrame format - length: {result_len}")
|
||
is_empty = result_len == 0
|
||
else:
|
||
logging.warning(f"[DROP EMPTY TABLE] Unknown result format: {type(results)}")
|
||
is_empty = False
|
||
|
||
if is_empty:
|
||
logging.debug(f"[DROP EMPTY TABLE] Metadata table {index_name} is empty, dropping it")
|
||
drop_result = settings.docStoreConn.delete_idx(index_name, "")
|
||
logging.debug(f"[DROP EMPTY TABLE] Drop result: {drop_result}")
|
||
else:
|
||
logging.debug(f"[DROP EMPTY TABLE] Metadata table {index_name} still has documents, keeping it")
|
||
|
||
except Exception as e:
|
||
# Log but don't fail - metadata deletion was successful
|
||
logging.error(f"[DROP EMPTY TABLE] Failed to check/drop empty metadata table {index_name}: {e}")
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def get_document_metadata(cls, doc_id: str) -> Dict:
|
||
"""
|
||
Get document metadata from ES/Infinity.
|
||
|
||
Args:
|
||
doc_id: Document ID
|
||
|
||
Returns:
|
||
Metadata dictionary, empty dict if not found
|
||
"""
|
||
try:
|
||
# Get document with tenant_id
|
||
doc_query = Document.select(Document, Knowledgebase.tenant_id).join(
|
||
Knowledgebase, on=(Knowledgebase.id == Document.kb_id)
|
||
).where(Document.id == doc_id)
|
||
|
||
doc = doc_query.first()
|
||
if not doc:
|
||
logging.warning(f"Document {doc_id} not found")
|
||
return {}
|
||
|
||
# Extract fields
|
||
doc_obj = doc
|
||
tenant_id = doc.knowledgebase.tenant_id
|
||
kb_id = doc_obj.kb_id
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
# Try to get metadata from ES/Infinity
|
||
metadata_doc = settings.docStoreConn.get(
|
||
doc_id,
|
||
index_name,
|
||
[kb_id]
|
||
)
|
||
|
||
if metadata_doc:
|
||
# Extract and unflatten metadata
|
||
return cls._extract_metadata(metadata_doc)
|
||
|
||
return {}
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error getting metadata for document {doc_id}: {e}")
|
||
return {}
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def get_flatted_meta_by_kbs(cls, kb_ids: List[str]) -> Dict:
|
||
"""
|
||
Get flattened metadata for documents in knowledge bases.
|
||
|
||
- Parses stringified JSON meta_fields when possible and skips non-dict or unparsable values.
|
||
- Expands list values into individual entries.
|
||
Example: {"tags": ["foo","bar"], "author": "alice"} ->
|
||
meta["tags"]["foo"] = [doc_id], meta["tags"]["bar"] = [doc_id], meta["author"]["alice"] = [doc_id]
|
||
Prefer for metadata_condition filtering and scenarios that must respect list semantics.
|
||
|
||
Args:
|
||
kb_ids: List of knowledge base IDs
|
||
|
||
Returns:
|
||
Metadata dictionary in format: {field_name: {value: [doc_ids]}}
|
||
"""
|
||
try:
|
||
# Get tenant_id from first KB
|
||
kb = Knowledgebase.get_by_id(kb_ids[0])
|
||
if not kb:
|
||
return {}
|
||
|
||
tenant_id = kb.tenant_id
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
condition = {"kb_id": kb_ids}
|
||
order_by = OrderByExpr()
|
||
|
||
# Query with large limit
|
||
results = settings.docStoreConn.search(
|
||
select_fields=["*"], # Get all fields
|
||
highlight_fields=[],
|
||
condition=condition,
|
||
match_expressions=[],
|
||
order_by=order_by,
|
||
offset=0,
|
||
limit=10000,
|
||
index_names=index_name,
|
||
knowledgebase_ids=kb_ids
|
||
)
|
||
|
||
logging.debug(f"[get_flatted_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}")
|
||
logging.debug(f"[get_flatted_meta_by_kbs] results type: {type(results)}")
|
||
|
||
# Aggregate metadata
|
||
meta = {}
|
||
doc_count = 0
|
||
|
||
# Use helper to iterate over results in any format
|
||
for doc_id, doc in cls._iter_search_results(results):
|
||
doc_count += 1
|
||
# Extract metadata fields (exclude system fields)
|
||
doc_meta = cls._extract_metadata(doc)
|
||
|
||
for k, v in doc_meta.items():
|
||
if k not in meta:
|
||
meta[k] = {}
|
||
|
||
values = v if isinstance(v, list) else [v]
|
||
for vv in values:
|
||
if vv is None:
|
||
continue
|
||
sv = str(vv)
|
||
if sv not in meta[k]:
|
||
meta[k][sv] = []
|
||
meta[k][sv].append(doc_id)
|
||
|
||
if doc_count >= 10000:
|
||
logging.warning(f"[get_flatted_meta_by_kbs] Results hit the 10000 limit for KBs {kb_ids}.")
|
||
|
||
logging.debug(f"[get_flatted_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}")
|
||
return meta
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error getting flattened metadata for KBs {kb_ids}: {e}")
|
||
return {}
|
||
|
||
@classmethod
|
||
def filter_doc_ids_by_meta_pushdown(
|
||
cls,
|
||
kb_ids: List[str],
|
||
filters: List[Dict],
|
||
logic: str = "and",
|
||
limit: int = 10000,
|
||
) -> Optional[List[str]]:
|
||
"""Run a metadata filter directly against ES, returning matching doc IDs.
|
||
|
||
Returns ``None`` to signal "push-down not viable, use the in-memory
|
||
``meta_filter`` fallback". Reasons for ``None``:
|
||
|
||
- Active doc store is not Elasticsearch (Infinity / OceanBase have
|
||
different filter semantics for the JSON ``meta_fields`` column).
|
||
- One of the user filters cannot be expressed in ES DSL.
|
||
- The ES request itself failed (network, mapping, missing index).
|
||
|
||
On success returns the deduplicated, ordered list of document IDs the
|
||
ES query matched. Callers can union or intersect this with their own
|
||
base ``doc_ids`` rather than fetching the entire metadata table.
|
||
"""
|
||
from common.metadata_es_filter import (
|
||
UnsupportedMetaFilter,
|
||
build_meta_filter_query,
|
||
extract_doc_ids,
|
||
is_pushdown_supported,
|
||
)
|
||
|
||
if not kb_ids:
|
||
return []
|
||
|
||
if settings.DOC_ENGINE_INFINITY:
|
||
# Infinity stores ``meta_fields`` as a JSON column without dotted
|
||
# field access; the in-memory path is still the reliable answer.
|
||
return None
|
||
|
||
es_client = getattr(settings.docStoreConn, "es", None)
|
||
if es_client is None:
|
||
return None
|
||
|
||
if not is_pushdown_supported(filters):
|
||
return None
|
||
|
||
try:
|
||
kb = Knowledgebase.get_by_id(kb_ids[0])
|
||
except Exception as e:
|
||
logging.warning(f"[meta_pushdown] cannot resolve tenant for kb {kb_ids[0]}: {e}")
|
||
return None
|
||
if not kb:
|
||
return None
|
||
|
||
tenant_id = kb.tenant_id
|
||
index_name = cls._get_doc_meta_index_name(tenant_id)
|
||
|
||
try:
|
||
if not settings.docStoreConn.index_exist(index_name, ""):
|
||
# No metadata index → no metadata-filtered docs. Returning an
|
||
# empty list (rather than ``None``) so callers don't bounce
|
||
# back to the in-memory path and re-query MySQL for nothing.
|
||
return []
|
||
except Exception as e:
|
||
logging.warning(f"[meta_pushdown] index_exist check failed for {index_name}: {e}")
|
||
return None
|
||
|
||
try:
|
||
query_body = build_meta_filter_query(filters, logic, kb_ids)
|
||
except UnsupportedMetaFilter as e:
|
||
logging.debug(f"[meta_pushdown] falling back to in-memory: {e.reason}")
|
||
return None
|
||
|
||
# Only the doc id is needed downstream; trimming ``_source`` keeps the
|
||
# response small when the metadata blob is large.
|
||
request_body = {
|
||
**query_body,
|
||
"size": limit,
|
||
"_source": ["id"],
|
||
}
|
||
|
||
try:
|
||
response = es_client.search(index=index_name, body=request_body)
|
||
except Exception as e:
|
||
logging.warning(f"[meta_pushdown] ES query failed for {index_name}: {e}")
|
||
return None
|
||
|
||
doc_ids = extract_doc_ids(response if isinstance(response, dict) else dict(response))
|
||
# Preserve order while removing duplicates so caller-side de-dupe stays
|
||
# cheap.
|
||
seen: set[str] = set()
|
||
unique: List[str] = []
|
||
for did in doc_ids:
|
||
if did in seen:
|
||
continue
|
||
seen.add(did)
|
||
unique.append(did)
|
||
|
||
if len(unique) >= limit:
|
||
logging.warning(
|
||
f"[meta_pushdown] hit limit {limit} for KBs {kb_ids}; some matches may be missing"
|
||
)
|
||
|
||
logging.debug(f"[meta_pushdown] {len(unique)} matches for KBs {kb_ids}")
|
||
return unique
|
||
|
||
@classmethod
|
||
def get_metadata_keys_by_kbs(cls, kb_ids: List[str]) -> List[str]:
|
||
"""
|
||
Get unique metadata field names across multiple knowledge bases.
|
||
|
||
Args:
|
||
kb_ids: List of knowledge base IDs
|
||
|
||
Returns:
|
||
Sorted list of unique metadata field names
|
||
"""
|
||
if not kb_ids:
|
||
return []
|
||
|
||
logging.debug(f"get_metadata_keys_by_kbs start: n_kbs={len(kb_ids)}")
|
||
keys: set[str] = set()
|
||
try:
|
||
for kb_id in kb_ids:
|
||
results = cls._search_metadata(kb_id, condition={"kb_id": kb_id})
|
||
for _doc_id, doc in cls._iter_search_results(results):
|
||
doc_meta = cls._extract_metadata(doc)
|
||
if not isinstance(doc_meta, dict):
|
||
continue
|
||
keys.update(str(k) for k in doc_meta.keys())
|
||
logging.debug(f"get_metadata_keys_by_kbs end: n_keys={len(keys)}, kb_ids={kb_ids}")
|
||
return sorted(keys)
|
||
except Exception as e:
|
||
logging.error(f"Error getting metadata keys for KBs {kb_ids}: {e}")
|
||
return []
|
||
|
||
@classmethod
|
||
def get_metadata_for_documents(cls, doc_ids: Optional[List[str]], kb_id: str) -> Dict[str, Dict]:
|
||
"""
|
||
Get metadata fields for specific documents.
|
||
Returns a mapping of doc_id -> meta_fields
|
||
|
||
Args:
|
||
doc_ids: List of document IDs (if None, gets all documents with metadata for the KB)
|
||
kb_id: Knowledge base ID
|
||
|
||
Returns:
|
||
Dictionary mapping doc_id to meta_fields dict
|
||
"""
|
||
try:
|
||
condition = {"kb_id": kb_id}
|
||
if doc_ids:
|
||
condition["id"] = doc_ids
|
||
results = cls._search_metadata(kb_id, condition=condition)
|
||
if not results:
|
||
return {}
|
||
|
||
# Build mapping: doc_id -> meta_fields
|
||
meta_mapping = {}
|
||
|
||
# Use helper to iterate over results
|
||
for doc_id, doc in cls._iter_search_results(results):
|
||
|
||
# Extract metadata (handles both JSON strings and dicts)
|
||
doc_meta = cls._extract_metadata(doc)
|
||
if doc_meta:
|
||
meta_mapping[doc_id] = doc_meta
|
||
|
||
logging.debug(f"[get_metadata_for_documents] Found metadata for {len(meta_mapping)}/{len(doc_ids) if doc_ids else 'all'} documents")
|
||
return meta_mapping
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error getting metadata for documents: {e}")
|
||
return {}
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def get_metadata_summary(cls, kb_id: str, doc_ids=None) -> Dict:
|
||
"""
|
||
Get metadata summary for documents in a knowledge base.
|
||
|
||
Args:
|
||
kb_id: Knowledge base ID
|
||
doc_ids: Optional list of document IDs to filter by
|
||
|
||
Returns:
|
||
Dictionary with metadata field statistics in format:
|
||
{
|
||
"field_name": {
|
||
"type": "string" | "number" | "list" | "time",
|
||
"values": [("value1", count1), ("value2", count2), ...] # sorted by count desc
|
||
}
|
||
}
|
||
"""
|
||
def _is_time_string(value: str) -> bool:
|
||
"""Check if a string value is an ISO 8601 datetime (e.g., '2026-02-03T00:00:00')."""
|
||
if not isinstance(value, str):
|
||
return False
|
||
return bool(re.match(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$', value))
|
||
|
||
def _meta_value_type(value):
|
||
"""Determine the type of a metadata value."""
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, list):
|
||
return "list"
|
||
if isinstance(value, bool):
|
||
return "string"
|
||
if isinstance(value, (int, float)):
|
||
return "number"
|
||
if isinstance(value, str) and _is_time_string(value):
|
||
return "time"
|
||
return "string"
|
||
|
||
try:
|
||
condition = {"kb_id": kb_id}
|
||
if doc_ids:
|
||
condition["id"] = doc_ids
|
||
results = cls._search_metadata(kb_id, condition=condition)
|
||
if not results:
|
||
return {}
|
||
|
||
# Aggregate metadata
|
||
summary = {}
|
||
type_counter = {}
|
||
|
||
logging.debug(f"[METADATA SUMMARY] KB: {kb_id}, doc_ids: {doc_ids}")
|
||
|
||
# Use helper to iterate over results in any format
|
||
for doc_id, doc in cls._iter_search_results(results):
|
||
|
||
doc_meta = cls._extract_metadata(doc)
|
||
|
||
for k, v in doc_meta.items():
|
||
# Track type counts for this field
|
||
value_type = _meta_value_type(v)
|
||
if value_type:
|
||
if k not in type_counter:
|
||
type_counter[k] = {}
|
||
type_counter[k][value_type] = type_counter[k].get(value_type, 0) + 1
|
||
|
||
# Aggregate value counts
|
||
values = v if isinstance(v, list) else [v]
|
||
for vv in values:
|
||
if vv is None:
|
||
continue
|
||
sv = str(vv)
|
||
if k not in summary:
|
||
summary[k] = {}
|
||
summary[k][sv] = summary[k].get(sv, 0) + 1
|
||
|
||
# Build result with type information and sorted values
|
||
result = {}
|
||
for k, v in summary.items():
|
||
values = sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True)
|
||
type_counts = type_counter.get(k, {})
|
||
value_type = "string"
|
||
if type_counts:
|
||
value_type = max(type_counts.items(), key=lambda item: item[1])[0]
|
||
result[k] = {"type": value_type, "values": values}
|
||
|
||
logging.debug(f"[METADATA SUMMARY] Final result: {result}")
|
||
return result
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error getting metadata summary for KB {kb_id}: {e}")
|
||
return {}
|
||
|
||
@classmethod
|
||
@DB.connection_context()
|
||
def batch_update_metadata(cls, kb_id: str, doc_ids: List[str], updates=None, deletes=None) -> int:
|
||
"""
|
||
Batch update metadata for documents in a knowledge base.
|
||
|
||
Args:
|
||
kb_id: Knowledge base ID
|
||
doc_ids: List of document IDs to update
|
||
updates: List of update operations, each with:
|
||
- key: field name to update
|
||
- value: new value
|
||
- match (optional): only update if current value matches this
|
||
deletes: List of delete operations, each with:
|
||
- key: field name to delete from
|
||
- value (optional): specific value to delete (if not provided, deletes the entire field)
|
||
|
||
Returns:
|
||
Number of documents updated
|
||
|
||
Examples:
|
||
updates = [{"key": "author", "value": "John"}]
|
||
updates = [{"key": "tags", "value": "new", "match": "old"}] # Replace "old" with "new" in tags list
|
||
deletes = [{"key": "author"}] # Delete entire author field
|
||
deletes = [{"key": "tags", "value": "obsolete"}] # Remove "obsolete" from tags list
|
||
"""
|
||
updates = updates or []
|
||
deletes = deletes or []
|
||
if not doc_ids:
|
||
return 0
|
||
|
||
def _normalize_meta(meta):
|
||
"""Normalize metadata to a dict."""
|
||
if isinstance(meta, str):
|
||
try:
|
||
meta = json.loads(meta)
|
||
except Exception:
|
||
return {}
|
||
if not isinstance(meta, dict):
|
||
return {}
|
||
return deepcopy(meta)
|
||
|
||
def _str_equal(a, b):
|
||
"""Compare two values as strings."""
|
||
return str(a) == str(b)
|
||
|
||
def _apply_updates(meta):
|
||
"""Apply update operations to metadata."""
|
||
changed = False
|
||
for upd in updates:
|
||
key = upd.get("key")
|
||
if not key:
|
||
continue
|
||
|
||
new_value = upd.get("value")
|
||
match_value = upd.get("match", None)
|
||
match_provided = match_value is not None and match_value != ""
|
||
|
||
if key not in meta:
|
||
if match_provided:
|
||
continue
|
||
meta[key] = dedupe_list(new_value) if isinstance(new_value, list) else new_value
|
||
changed = True
|
||
continue
|
||
|
||
if isinstance(meta[key], list):
|
||
if not match_provided:
|
||
# No match provided, append new_value to the list
|
||
if isinstance(new_value, list):
|
||
meta[key] = dedupe_list(meta[key] + new_value)
|
||
else:
|
||
meta[key] = dedupe_list(meta[key] + [new_value])
|
||
changed = True
|
||
else:
|
||
# Replace items matching match_value with new_value
|
||
replaced = False
|
||
new_list = []
|
||
for item in meta[key]:
|
||
if _str_equal(item, match_value):
|
||
new_list.append(new_value)
|
||
replaced = True
|
||
else:
|
||
new_list.append(item)
|
||
if replaced:
|
||
meta[key] = dedupe_list(new_list)
|
||
changed = True
|
||
else:
|
||
if not match_provided:
|
||
meta[key] = new_value
|
||
changed = True
|
||
else:
|
||
if _str_equal(meta[key], match_value):
|
||
meta[key] = new_value
|
||
changed = True
|
||
return changed
|
||
|
||
def _apply_deletes(meta):
|
||
"""Apply delete operations to metadata."""
|
||
changed = False
|
||
for d in deletes:
|
||
key = d.get("key")
|
||
if not key or key not in meta:
|
||
continue
|
||
value = d.get("value", None)
|
||
if isinstance(meta[key], list):
|
||
if value is None:
|
||
del meta[key]
|
||
changed = True
|
||
continue
|
||
new_list = [item for item in meta[key] if not _str_equal(item, value)]
|
||
if len(new_list) != len(meta[key]):
|
||
if new_list:
|
||
meta[key] = new_list
|
||
else:
|
||
del meta[key]
|
||
changed = True
|
||
else:
|
||
if value is None or _str_equal(meta[key], value):
|
||
del meta[key]
|
||
changed = True
|
||
return changed
|
||
|
||
try:
|
||
results = cls._search_metadata(kb_id, condition={"kb_id": kb_id, "id": doc_ids})
|
||
if not results:
|
||
results = [] # Treat as empty list if None
|
||
|
||
updated_docs = 0
|
||
found_doc_ids = set()
|
||
|
||
logging.debug(f"[batch_update_metadata] Searching for doc_ids: {doc_ids}")
|
||
|
||
# Use helper to iterate over results
|
||
for doc_id, doc in cls._iter_search_results(results):
|
||
found_doc_ids.add(doc_id)
|
||
|
||
# Get current metadata
|
||
current_meta = cls._extract_metadata(doc)
|
||
meta = _normalize_meta(current_meta)
|
||
original_meta = deepcopy(meta)
|
||
|
||
logging.debug(f"[batch_update_metadata] Doc {doc_id}: current_meta={current_meta}, meta={meta}")
|
||
logging.debug(f"[batch_update_metadata] Updates to apply: {updates}, Deletes: {deletes}")
|
||
|
||
# Apply updates and deletes
|
||
changed = _apply_updates(meta)
|
||
logging.debug(f"[batch_update_metadata] After _apply_updates: changed={changed}, meta={meta}")
|
||
changed = _apply_deletes(meta) or changed
|
||
logging.debug(f"[batch_update_metadata] After _apply_deletes: changed={changed}, meta={meta}")
|
||
|
||
# Update if changed
|
||
if changed and meta != original_meta:
|
||
logging.debug(f"[batch_update_metadata] Updating doc_id: {doc_id}, meta: {meta}")
|
||
# If metadata is empty, delete the row entirely instead of keeping empty metadata
|
||
if not meta:
|
||
cls.delete_document_metadata(doc_id, kb_id, tenant_id=None)
|
||
else:
|
||
cls.update_document_metadata(doc_id, meta)
|
||
updated_docs += 1
|
||
|
||
# Handle documents that don't have metadata rows yet
|
||
# These documents weren't in the search results, so we need to insert new metadata for them
|
||
doc_ids_set = set(doc_ids)
|
||
missing_doc_ids = doc_ids_set - found_doc_ids
|
||
if missing_doc_ids and updates:
|
||
logging.debug(f"[batch_update_metadata] Inserting new metadata for documents without metadata rows: {missing_doc_ids}")
|
||
for doc_id in missing_doc_ids:
|
||
# Apply updates to create new metadata
|
||
meta = {}
|
||
_apply_updates(meta)
|
||
if meta:
|
||
# Only insert if there's actual metadata to add
|
||
cls.update_document_metadata(doc_id, meta)
|
||
updated_docs += 1
|
||
logging.debug(f"[batch_update_metadata] Inserted metadata for doc_id: {doc_id}, meta: {meta}")
|
||
|
||
logging.debug(f"[batch_update_metadata] KB: {kb_id}, doc_ids: {doc_ids}, updated: {updated_docs}")
|
||
return updated_docs
|
||
|
||
except Exception as e:
|
||
logging.error(f"Error in batch_update_metadata for KB {kb_id}: {e}")
|
||
return 0
|