Files
ragflow/common/metadata_utils.py
qinling0210 f1d2383572 Push metadata filters down to Infinity (#14974)
### What problem does this PR solve?

Push metadata filters down to Infinity

### Type of change

- [x] Refactoring
2026-05-18 14:22:04 +08:00

428 lines
16 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import ast
import logging
from typing import Any, Callable, Dict
import json_repair
def convert_conditions(metadata_condition):
    """Translate API-style metadata conditions into filter dicts.

    Args:
        metadata_condition: Mapping with an optional ``"conditions"`` list.
            Each condition must provide ``"name"``, ``"comparison_operator"``
            and ``"value"``. ``None`` is treated as "no conditions".

    Returns:
        A list of ``{"op": ..., "key": ..., "value": ...}`` dicts, one per
        condition and in the same order. Operators without an entry in the
        mapping below are passed through unchanged.

    Raises:
        KeyError: If a condition lacks one of the three required fields.
    """
    if metadata_condition is None:
        metadata_condition = {}
    # The comparison symbols are spelled as escapes so they cannot be lost
    # when the file is re-encoded; every operator must map to a distinct,
    # non-empty symbol or the filters become indistinguishable.
    op_mapping = {
        "is": "=",
        "not is": "\u2260",  # ≠
        ">=": "\u2265",  # ≥
        "<=": "\u2264",  # ≤
        "!=": "\u2260",  # ≠
    }
    return [
        {
            "op": op_mapping.get(cond["comparison_operator"], cond["comparison_operator"]),
            "key": cond["name"],
            "value": cond["value"],
        }
        # ``or []`` also covers an explicit ``"conditions": None``.
        for cond in metadata_condition.get("conditions") or []
    ]
def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
    """Select the documents whose metadata satisfies ``filters``.

    Args:
        metas: Index shaped ``{key: {value: doc_ids}}`` where each ``doc_ids``
            is an iterable of hashable document ids.
        filters: Dicts with ``"key"``, ``"op"`` and ``"value"``. Supported
            operators: ``contains``, ``not contains``, ``in``, ``not in``,
            ``start with``, ``end with``, ``empty``, ``not empty``, ``=``,
            ``>``, ``<`` and the symbols ≠ / ≥ / ≤ (their ASCII spellings
            ``!=`` / ``>=`` / ``<=`` are accepted as aliases).
        logic: ``"and"`` intersects the per-filter results; any other value
            unions them.

    Returns:
        A list of matching document ids (no duplicates, unspecified order).
        Empty when ``filters`` is empty or nothing matches. A filter whose
        key is absent from ``metas`` matches nothing.
    """
    # Escapes keep the symbols intact regardless of how the file is encoded.
    ne, ge, le = "\u2260", "\u2265", "\u2264"  # ≠, ≥, ≤
    ascii_aliases = {"!=": ne, ">=": ge, "<=": le}
    comparison_ops = ("=", ne, ">", "<", ge, le)

    def normalize_string_values(value):
        """Lower-case a string, or the string items of a list."""
        if isinstance(value, str):
            return value.lower()
        if isinstance(value, list):
            return [item.lower() if isinstance(item, str) else item for item in value]
        return value

    def looks_like_date(text):
        """Strict ``YYYY-MM-DD`` shape check (digits and dashes only)."""
        return (
            len(text) == 10
            and text[4] == "-"
            and text[7] == "-"
            and text[:4].isdigit()
            and text[5:7].isdigit()
            and text[8:10].isdigit()
        )

    def is_match(op, lhs, rhs):
        """Evaluate one operator; may raise for incompatible operand types."""
        multi = isinstance(lhs, list)
        if op == "contains":
            return any(str(i).find(rhs) >= 0 for i in lhs) if multi else str(lhs).find(rhs) >= 0
        if op == "not contains":
            return all(str(i).find(rhs) == -1 for i in lhs) if multi else str(lhs).find(rhs) == -1
        if op == "in":
            return all(i in rhs for i in lhs) if multi else lhs in rhs
        if op == "not in":
            return all(i not in rhs for i in lhs) if multi else lhs not in rhs
        if op == "start with":
            text = "".join([str(i).lower() for i in lhs]) if multi else str(lhs).lower()
            return text.startswith(str(rhs).lower())
        if op == "end with":
            text = "".join([str(i).lower() for i in lhs]) if multi else str(lhs).lower()
            return text.endswith(str(rhs).lower())
        if op == "empty":
            return not lhs
        if op == "not empty":
            return bool(lhs)
        if op == "=":
            return lhs == rhs
        if op == ne:
            return lhs != rhs
        if op == ">":
            return lhs > rhs
        if op == "<":
            return lhs < rhs
        if op == ge:
            return lhs >= rhs
        if op == le:
            return lhs <= rhs
        # Unknown operator: nothing matches.
        return False

    def filter_out(v2docs, op, value):
        """Collect the doc ids of every stored value that satisfies ``op``."""
        if isinstance(op, str):
            op = ascii_aliases.get(op, op)
        ids = []
        for stored, docids in v2docs.items():
            # Work on per-entry copies: rebinding ``value`` itself would leak
            # the parsed form into later entries and make the outcome depend
            # on dict iteration order.
            lhs, rhs = stored, value
            if op in comparison_ops:
                lhs_text = str(lhs).strip()
                rhs_text = str(rhs).strip()
                if looks_like_date(rhs_text):
                    if not looks_like_date(lhs_text):
                        # Date query against non-date data never matches.
                        continue
                    # ISO dates order correctly as plain strings.
                    lhs, rhs = lhs_text, rhs_text
                else:
                    # Try to compare as Python literals (numbers etc.); when
                    # the stored side does not parse, both sides stay as-is.
                    try:
                        if isinstance(lhs, list):
                            lhs = lhs[0]
                        lhs = ast.literal_eval(lhs)
                        rhs = ast.literal_eval(rhs)
                    except Exception:
                        pass
                    if isinstance(lhs, str):
                        lhs = lhs.lower()
                    if isinstance(rhs, str):
                        rhs = rhs.lower()
            else:
                lhs = normalize_string_values(lhs)
                rhs = normalize_string_values(rhs)

            try:
                matched = bool(is_match(op, lhs, rhs))
            except Exception:
                # Deliberate best effort: operands that cannot be compared
                # (e.g. str vs int ordering) count as "no match".
                matched = False
            if matched:
                ids.extend(docids)
        return ids

    # ``None`` means "no filter applied yet"; an empty set is a real result
    # and must not be overwritten by the next filter's hits.
    result = None
    for flt in filters:
        key = flt["key"]
        if key in metas:
            hits = set(filter_out(metas[key], flt["op"], flt["value"]))
        else:
            # Key not found in metas: treat as no match.
            hits = set()

        if result is None:
            result = hits
        elif logic == "and":
            result &= hits
        else:
            result |= hits

        if logic == "and" and not result:
            logging.debug(f"meta_filter filters={filters}, logic={logic}, early return []")
            return []

    doc_ids = result if result is not None else set()
    logging.debug(f"meta_filter filters={filters}, logic={logic}, returning doc_ids={list(doc_ids)}")
    return list(doc_ids)
async def apply_meta_data_filter(
meta_data_filter: dict | None,
metas: dict | None = None,
question: str = "",
chat_mdl: Any = None,
base_doc_ids: list[str] | None = None,
manual_value_resolver: Callable[[dict], dict] | None = None,
kb_ids: list[str] | None = None,
metas_loader: Callable[[], dict] | None = None,
) -> list[str] | None:
"""
Apply metadata filtering rules and return the filtered doc_ids.
meta_data_filter supports three modes:
- auto: generate filter conditions via LLM (gen_meta_filter)
- semi_auto: generate conditions using selected metadata keys only
- manual: directly filter based on provided conditions
When ``kb_ids`` is supplied, metadata filters are pushed down to the doc metadata
index (ES/Infinity) via ``DocMetadataService.filter_doc_ids_by_metadata`` instead
of being evaluated in Python over ``metas``. The in-memory ``meta_filter`` path
remains the fallback so callers without a KB scope, or backends without push-down
support, behave exactly as before.
``metas`` may be supplied eagerly or via ``metas_loader``. The loader is
only invoked when the metadata dict is actually needed — i.e. for the LLM
context in ``auto`` / ``semi_auto`` modes, or as the in-memory fallback
when push-down can't service a request. ``manual`` mode that lands on the
push-down path therefore skips the expensive
``get_flatted_meta_by_kbs`` round-trip entirely.
Returns:
list of doc_ids, ["-999"] when manual filters yield no result, or None
when auto/semi_auto filters return empty.
"""
from rag.prompts.generator import gen_meta_filter # move from the top of the file to avoid circular import
doc_ids = list(base_doc_ids) if base_doc_ids else []
if not meta_data_filter:
return doc_ids
method = meta_data_filter.get("method")
# Memoised metadata loader. ``_get_metas`` materialises the dict at most
# once per call; downstream branches that never reach an in-memory eval
# leave the loader untouched.
cached_metas: dict | None = metas
def _get_metas() -> dict:
nonlocal cached_metas
if cached_metas is None:
cached_metas = metas_loader() if metas_loader else {}
return cached_metas
def _run_metadata_filter(conditions: list[dict], logic: str) -> list[str]:
"""Run conditions through ES/Infinity push-down when possible, in-memory otherwise."""
if conditions and kb_ids:
try:
from api.db.services.doc_metadata_service import DocMetadataService
doc_ids = DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
logging.debug(f"Doc ids filtered by metadata: {doc_ids}")
if doc_ids is not None:
return doc_ids
except Exception as e:
logging.error(f"Metadata filter push down errored: {e}")
# In-memory fallback
logging.debug("Metadata filter falls back to in-memory filter")
return meta_filter(_get_metas(), conditions, logic)
if method == "auto":
filters: dict = await gen_meta_filter(chat_mdl, _get_metas(), question)
logging.debug(f"Metadata filter(auto) generated: {filters}")
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
return None
elif method == "semi_auto":
selected_keys = []
constraints = {}
for item in meta_data_filter.get("semi_auto", []):
if isinstance(item, str):
selected_keys.append(item)
elif isinstance(item, dict):
key = item.get("key")
op = item.get("op")
selected_keys.append(key)
if op:
constraints[key] = op
if selected_keys:
current_metas = _get_metas()
filtered_metas = {key: current_metas[key] for key in selected_keys if key in current_metas}
if filtered_metas:
filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
logging.debug(f"Metadata filter(semi_auto) generated: {filters}")
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
return None
elif method == "manual":
filters = meta_data_filter.get("manual", [])
if manual_value_resolver:
filters = [manual_value_resolver(flt) for flt in filters]
logging.debug(f"Metadata filter(manual): {filters}")
doc_ids.extend(_run_metadata_filter(filters, meta_data_filter.get("logic", "and")))
if filters and not doc_ids:
doc_ids = ["-999"]
logging.debug(f"apply_meta_data_filter meta_filter={meta_data_filter}, returning doc_ids={doc_ids}")
return doc_ids
def _try_meta_pushdown(
kb_ids: list[str],
conditions: list[dict],
logic: str,
) -> list[str] | None:
"""Attempt the ES push-down path; return ``None`` to fall back in-memory.
Lazy-imports ``DocMetadataService`` so this module stays usable in
environments where the API/db layer hasn't been wired up (e.g. unit tests
that exercise ``meta_filter`` directly).
"""
try:
from api.db.services.doc_metadata_service import DocMetadataService
except Exception as e:
logging.debug(f"[apply_meta_data_filter] push-down disabled, import failed: {e}")
return None
try:
return DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
except Exception as e:
logging.warning(f"[apply_meta_data_filter] push-down errored, falling back: {e}")
return None
def dedupe_list(values: list) -> list:
    """Drop repeated items, comparing them by their ``str()`` form.

    The first item seen for each string form is kept and the original order
    is preserved, so ``1`` and ``"1"`` count as duplicates of each other.
    """
    first_by_text: dict = {}
    for element in values:
        # ``setdefault`` leaves an already-recorded first occurrence alone.
        first_by_text.setdefault(str(element), element)
    return list(first_by_text.values())
def update_metadata_to(metadata, meta):
    """Merge ``meta`` into ``metadata`` in place and return ``metadata``.

    ``meta`` may be a dict or a JSON string (parsed with ``json_repair``);
    anything else, or a string that fails to parse, leaves ``metadata``
    untouched. Only string values and lists of strings are merged:

    - list values are reduced to their string items and de-duplicated; lists
      with no string items are skipped;
    - a new key is simply stored;
    - an existing list is extended/appended and de-duplicated;
    - an existing non-list value is overwritten.
    """
    if not meta:
        return metadata

    incoming = meta
    if isinstance(incoming, str):
        try:
            incoming = json_repair.loads(incoming)
        except Exception:
            logging.error("Meta data format error.")
            return metadata
    if not isinstance(incoming, dict):
        return metadata

    for field, new_value in incoming.items():
        if isinstance(new_value, list):
            strings_only = [entry for entry in new_value if isinstance(entry, str)]
            if not strings_only:
                continue
            new_value = dedupe_list(strings_only)
        elif not isinstance(new_value, str):
            # Neither a list nor a string: not mergeable.
            continue

        if field not in metadata:
            metadata[field] = new_value
            continue

        existing = metadata[field]
        if not isinstance(existing, list):
            metadata[field] = new_value
            continue

        if isinstance(new_value, list):
            existing.extend(new_value)
        else:
            existing.append(new_value)
        metadata[field] = dedupe_list(existing)
    return metadata
def metadata_schema(metadata: dict | list | None) -> Dict[str, Any]:
if not metadata:
return {}
properties = {}
for item in metadata:
key = item.get("key")
if not key:
continue
prop_schema = {
"description": item.get("description", "")
}
if "enum" in item and item["enum"]:
prop_schema["enum"] = item["enum"]
prop_schema["type"] = "string"
properties[key] = prop_schema
json_schema = {
"type": "object",
"properties": properties,
}
json_schema["additionalProperties"] = False
return json_schema
def _is_json_schema(obj: dict) -> bool:
if not isinstance(obj, dict):
return False
if "$schema" in obj:
return True
return obj.get("type") == "object" and isinstance(obj.get("properties"), dict)
def _is_metadata_list(obj: list) -> bool:
if not isinstance(obj, list) or not obj:
return False
for item in obj:
if not isinstance(item, dict):
return False
key = item.get("key")
if not isinstance(key, str) or not key:
return False
if "enum" in item and not isinstance(item["enum"], list):
return False
if "description" in item and not isinstance(item["description"], str):
return False
if "descriptions" in item and not isinstance(item["descriptions"], str):
return False
return True
def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
if isinstance(obj, dict) and _is_json_schema(obj):
return obj
if isinstance(obj, list) and _is_metadata_list(obj):
normalized = []
for item in obj:
description = item.get("description", item.get("descriptions", ""))
normalized_item = {
"key": item.get("key"),
"description": description,
}
if "enum" in item:
normalized_item["enum"] = item["enum"]
normalized.append(normalized_item)
return metadata_schema(normalized)
return {}