#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import ast
import logging
from typing import Any, Callable, Dict

import json_repair

# Caller-facing operator spellings mapped to the symbols ``meta_filter`` understands.
_OPERATOR_ALIASES = {
    "is": "=",
    "not is": "≠",
    ">=": "≥",
    "<=": "≤",
    "!=": "≠",
}

# Operators whose operands are coerced (date / literal) before comparing.
_COMPARISON_OPERATORS = frozenset({"=", "≠", ">", "<", "≥", "≤"})


def convert_conditions(metadata_condition):
    """Translate a ``metadata_condition`` payload into ``meta_filter`` conditions.

    Args:
        metadata_condition: Dict with an optional ``"conditions"`` list whose
            entries carry ``"comparison_operator"``, ``"name"`` and ``"value"``;
            ``None`` is treated as an empty dict.

    Returns:
        A list of ``{"op", "key", "value"}`` dicts. Operators found in the alias
        table are rewritten; any other operator is passed through unchanged.

    Raises:
        KeyError: If an entry lacks one of the three required keys.
    """
    if metadata_condition is None:
        metadata_condition = {}
    return [
        {
            "op": _OPERATOR_ALIASES.get(cond["comparison_operator"], cond["comparison_operator"]),
            "key": cond["name"],
            "value": cond["value"],
        }
        for cond in metadata_condition.get("conditions", [])
    ]


def _is_iso_date(text: str) -> bool:
    """Return True when *text* has the strict ``YYYY-MM-DD`` shape (10 chars)."""
    return (
        len(text) == 10
        and text[4] == "-"
        and text[7] == "-"
        and text[:4].isdigit()
        and text[5:7].isdigit()
        and text[8:10].isdigit()
    )


def _lower_strings(value):
    """Lower-case a string, or every string inside a list; leave other values alone."""
    if isinstance(value, str):
        return value.lower()
    if isinstance(value, list):
        return [item.lower() if isinstance(item, str) else item for item in value]
    return value


def _as_items(data) -> list:
    """Wrap a scalar in a one-element list so scalars and lists share one code path."""
    return data if isinstance(data, list) else [data]


def _joined_lower(data) -> str:
    """Concatenate the lower-cased string form of a scalar or of every list item."""
    return "".join(str(item).lower() for item in _as_items(data))


# operator -> predicate(stored_value, query_value)
_MATCHERS: Dict[str, Callable[[Any, Any], Any]] = {
    "contains": lambda lhs, rhs: any(rhs in str(item) for item in _as_items(lhs)),
    "not contains": lambda lhs, rhs: all(rhs not in str(item) for item in _as_items(lhs)),
    "in": lambda lhs, rhs: all(item in rhs for item in _as_items(lhs)),
    "not in": lambda lhs, rhs: all(item not in rhs for item in _as_items(lhs)),
    "start with": lambda lhs, rhs: _joined_lower(lhs).startswith(str(rhs).lower()),
    "end with": lambda lhs, rhs: _joined_lower(lhs).endswith(str(rhs).lower()),
    "empty": lambda lhs, rhs: not lhs,
    "not empty": lambda lhs, rhs: bool(lhs),
    "=": lambda lhs, rhs: lhs == rhs,
    "≠": lambda lhs, rhs: lhs != rhs,
    ">": lambda lhs, rhs: lhs > rhs,
    "<": lambda lhs, rhs: lhs < rhs,
    "≥": lambda lhs, rhs: lhs >= rhs,
    "≤": lambda lhs, rhs: lhs <= rhs,
}


def _coerce_operands(raw, value):
    """Best-effort conversion of both operands to Python literals for comparison.

    The stored value is evaluated first, then the query value; if either step
    fails, whatever was converted so far is kept. Remaining strings are
    lower-cased so string comparison is case-insensitive.
    """
    lhs, rhs = raw, value
    try:
        if isinstance(lhs, list):
            lhs = lhs[0]
        lhs = ast.literal_eval(lhs)
        rhs = ast.literal_eval(rhs)
    except Exception:
        # Deliberately best-effort: operands that are not literals (plain
        # words, already-converted numbers, empty lists, ...) stay as they are.
        pass
    if isinstance(lhs, str):
        lhs = lhs.lower()
    if isinstance(rhs, str):
        rhs = rhs.lower()
    return lhs, rhs


def _filter_docs_by_condition(v2docs: dict, operator: str, value: Any) -> list:
    """Collect the doc ids whose stored metadata value satisfies one condition.

    Args:
        v2docs: Mapping of stored metadata value -> iterable of doc ids.
        operator: One of the keys of ``_MATCHERS``; anything else matches nothing.
        value: The query value to compare against.

    Returns:
        A list of doc ids (may contain duplicates across values).
    """
    matcher = _MATCHERS.get(operator)
    if matcher is None:
        return []

    is_comparison = operator in _COMPARISON_OPERATORS
    value_text = str(value).strip()
    value_is_date = is_comparison and _is_iso_date(value_text)

    ids = []
    for raw, docids in v2docs.items():
        # Operands are derived from the *original* query value on every
        # iteration, so one key's coercion can never leak into the next and the
        # result does not depend on dict ordering.
        if not is_comparison:
            lhs, rhs = _lower_strings(raw), _lower_strings(value)
        elif value_is_date:
            raw_text = str(raw).strip()
            if not _is_iso_date(raw_text):
                # A date query never matches non-date data.
                continue
            # ISO dates order correctly as plain strings.
            lhs, rhs = raw_text, value_text
        else:
            lhs, rhs = _coerce_operands(raw, value)

        try:
            matched = matcher(lhs, rhs)
        except Exception:
            # Incomparable operands (e.g. str vs int ordering) simply do not match.
            matched = False

        if matched:
            ids.extend(docids)
    return ids


def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
    """Evaluate metadata conditions in memory and return the matching doc ids.

    Args:
        metas: Mapping ``key -> {stored value -> doc ids}``.
        filters: Conditions shaped like ``{"key", "op", "value"}``.
        logic: ``"and"`` intersects the per-condition results; any other value
            unions them.

    Returns:
        A list of doc ids in unspecified order. With ``"and"`` logic the result
        is empty as soon as any condition (including the first) matches nothing.
        A condition whose key is absent from ``metas`` matches nothing.
    """
    use_and = logic == "and"
    # ``None`` means "no condition applied yet", which is distinct from an
    # empty intersection; conflating the two let a later condition resurrect
    # results after an earlier AND-condition had matched nothing.
    matched_ids = None

    for condition in filters:
        key = condition["key"]
        if key in metas:
            ids = set(_filter_docs_by_condition(metas[key], condition["op"], condition["value"]))
        else:
            ids = set()

        if matched_ids is None:
            matched_ids = ids
        elif use_and:
            matched_ids &= ids
        else:
            matched_ids |= ids

        if use_and and not matched_ids:
            logging.debug("meta_filter filters=%s, logic=%s, early return []", filters, logic)
            return []

    result = list(matched_ids) if matched_ids else []
    logging.debug("meta_filter filters=%s, logic=%s, returning doc_ids=%s", filters, logic, result)
    return result


async def apply_meta_data_filter(
    meta_data_filter: dict | None,
    metas: dict | None = None,
    question: str = "",
    chat_mdl: Any = None,
    base_doc_ids: list[str] | None = None,
    manual_value_resolver: Callable[[dict], dict] | None = None,
    kb_ids: list[str] | None = None,
    metas_loader: Callable[[], dict] | None = None,
) -> list[str] | None:
    """
    Apply metadata filtering rules and return the filtered doc_ids.

    meta_data_filter supports three modes:
    - auto: generate filter conditions via LLM (gen_meta_filter)
    - semi_auto: generate conditions using selected metadata keys only
    - manual: directly filter based on provided conditions

    When ``kb_ids`` is supplied, non-empty conditions are first pushed down to
    the doc metadata index via
    ``DocMetadataService.filter_doc_ids_by_meta_pushdown`` (see
    ``_try_meta_pushdown``). If push-down is unavailable, fails, or returns
    ``None``, the in-memory ``meta_filter`` is used instead.

    ``metas`` may be supplied eagerly or via ``metas_loader``. The loader is
    invoked at most once, and only when the metadata dict is actually needed:
    for the LLM context in ``auto`` / ``semi_auto`` modes, or for the in-memory
    fallback. An empty condition list never triggers a load.

    Returns:
        list of doc_ids, ["-999"] when manual filters yield no result, or None
        when auto/semi_auto filters return empty.
    """
    doc_ids = list(base_doc_ids) if base_doc_ids else []
    if not meta_data_filter:
        return doc_ids

    method = meta_data_filter.get("method")

    # Memoised metadata: materialised on first use only.
    cached_metas: dict | None = metas

    def _get_metas() -> dict:
        nonlocal cached_metas
        if cached_metas is None:
            cached_metas = (metas_loader() if metas_loader else None) or {}
        return cached_metas

    def _run_metadata_filter(conditions: list[dict], logic: str) -> list[str]:
        """Run conditions through push-down when possible, in-memory otherwise."""
        if not conditions:
            # Nothing to evaluate; avoid loading metadata just to return [].
            return []
        if kb_ids:
            pushed = _try_meta_pushdown(kb_ids, conditions, logic)
            logging.debug("Doc ids filtered by metadata: %s", pushed)
            if pushed is not None:
                return pushed
        logging.debug("Metadata filter falls back to in-memory filter")
        return meta_filter(_get_metas(), conditions, logic)

    if method in ("auto", "semi_auto"):
        # Imported lazily (not at module top) to avoid a circular import, and
        # only for the modes that actually call the LLM.
        from rag.prompts.generator import gen_meta_filter

    if method == "auto":
        filters: dict = await gen_meta_filter(chat_mdl, _get_metas(), question)
        logging.debug("Metadata filter(auto) generated: %s", filters)
        doc_ids.extend(_run_metadata_filter(filters.get("conditions") or [], filters.get("logic", "and")))
        if not doc_ids:
            return None
    elif method == "semi_auto":
        selected_keys = []
        constraints = {}
        for item in meta_data_filter.get("semi_auto", []):
            if isinstance(item, str):
                selected_keys.append(item)
            elif isinstance(item, dict):
                key = item.get("key")
                op = item.get("op")
                selected_keys.append(key)
                if op:
                    constraints[key] = op

        if selected_keys:
            current_metas = _get_metas()
            filtered_metas = {key: current_metas[key] for key in selected_keys if key in current_metas}
            if filtered_metas:
                filters = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
                logging.debug("Metadata filter(semi_auto) generated: %s", filters)
                doc_ids.extend(_run_metadata_filter(filters.get("conditions") or [], filters.get("logic", "and")))
                if not doc_ids:
                    return None
    elif method == "manual":
        manual_filters = meta_data_filter.get("manual") or []
        if manual_value_resolver:
            manual_filters = [manual_value_resolver(flt) for flt in manual_filters]
        logging.debug("Metadata filter(manual): %s", manual_filters)
        doc_ids.extend(_run_metadata_filter(manual_filters, meta_data_filter.get("logic", "and")))
        if manual_filters and not doc_ids:
            # Sentinel id: filters were given but nothing matched.
            doc_ids = ["-999"]

    logging.debug("apply_meta_data_filter meta_filter=%s, returning doc_ids=%s", meta_data_filter, doc_ids)
    return doc_ids


def _try_meta_pushdown(
    kb_ids: list[str],
    conditions: list[dict],
    logic: str,
) -> list[str] | None:
    """Attempt the push-down path; return ``None`` to fall back in-memory.

    Lazy-imports ``DocMetadataService`` so this module stays usable in
    environments where the API/db layer hasn't been wired up (e.g. unit tests
    that exercise ``meta_filter`` directly). Import failures and errors raised
    by the service are logged and reported as ``None``.
    """
    try:
        from api.db.services.doc_metadata_service import DocMetadataService
    except Exception as e:
        logging.debug(f"[apply_meta_data_filter] push-down disabled, import failed: {e}")
        return None
    try:
        return DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
    except Exception as e:
        logging.warning(f"[apply_meta_data_filter] push-down errored, falling back: {e}")
        return None


def dedupe_list(values: list) -> list:
    """Return *values* without duplicates, keeping first occurrences in order.

    Items are compared by their ``str()`` form, so ``1`` and ``"1"`` count as
    the same entry and unhashable items are supported.
    """
    seen = set()
    deduped = []
    for item in values:
        key = str(item)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(item)
    return deduped


def update_metadata_to(metadata, meta):
    """Merge *meta* into *metadata* in place and return *metadata*.

    Args:
        metadata: Target dict, mutated in place.
        meta: A dict, or a JSON string parsed with ``json_repair.loads``.
            Anything falsy, unparsable, or not a dict leaves *metadata* as is.

    Merge rules per key: list values keep only their string items (empty lists
    are skipped) and are de-duplicated; values that are neither list nor string
    are ignored. A new key is set directly. If the existing value is a list the
    new value is appended/extended and the result de-duplicated; otherwise the
    existing value is overwritten.
    """
    if not meta:
        return metadata
    if isinstance(meta, str):
        try:
            meta = json_repair.loads(meta)
        except Exception:
            logging.error("Meta data format error.")
            return metadata
    if not isinstance(meta, dict):
        return metadata

    for k, v in meta.items():
        if isinstance(v, list):
            v = [vv for vv in v if isinstance(vv, str)]
            if not v:
                continue
            v = dedupe_list(v)
        if not isinstance(v, (list, str)):
            continue

        if k not in metadata:
            metadata[k] = v
            continue

        if isinstance(metadata[k], list):
            if isinstance(v, list):
                metadata[k].extend(v)
            else:
                metadata[k].append(v)
            metadata[k] = dedupe_list(metadata[k])
        else:
            metadata[k] = v
    return metadata


def metadata_schema(metadata: dict | list | None) -> Dict[str, Any]:
    """Build a JSON-schema object from a list of metadata field descriptors.

    Each descriptor is read through ``.get("key")``, ``.get("description")``
    and an optional ``"enum"``; descriptors without a key are skipped. A
    non-empty enum also sets ``"type": "string"`` on the property.

    Returns:
        ``{}`` for falsy input, otherwise an object schema with
        ``additionalProperties`` set to ``False``.
    """
    # NOTE(review): the annotation allows a dict, but iterating a dict yields
    # its keys, which have no ``.get`` -- confirm whether dict input is expected.
    if not metadata:
        return {}

    properties = {}
    for item in metadata:
        key = item.get("key")
        if not key:
            continue

        prop_schema = {"description": item.get("description", "")}
        if "enum" in item and item["enum"]:
            prop_schema["enum"] = item["enum"]
            prop_schema["type"] = "string"

        properties[key] = prop_schema

    return {
        "type": "object",
        "properties": properties,
        "additionalProperties": False,
    }


def _is_json_schema(obj: dict) -> bool:
    """Return True if *obj* is a dict with ``$schema``, or an object schema with dict ``properties``."""
    if not isinstance(obj, dict):
        return False
    if "$schema" in obj:
        return True
    return obj.get("type") == "object" and isinstance(obj.get("properties"), dict)


def _is_metadata_list(obj: list) -> bool:
    """Return True if *obj* is a non-empty list of well-formed field descriptors.

    Every item must be a dict with a non-empty string ``"key"``; when present,
    ``"enum"`` must be a list and ``"description"`` / ``"descriptions"`` must
    be strings.
    """
    if not isinstance(obj, list) or not obj:
        return False
    for item in obj:
        if not isinstance(item, dict):
            return False
        key = item.get("key")
        if not isinstance(key, str) or not key:
            return False
        if "enum" in item and not isinstance(item["enum"], list):
            return False
        if "description" in item and not isinstance(item["description"], str):
            return False
        if "descriptions" in item and not isinstance(item["descriptions"], str):
            return False
    return True


def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
    """Normalise *obj* to a JSON schema.

    A dict that already is a JSON schema is returned unchanged. A valid
    descriptor list is converted via ``metadata_schema``, accepting
    ``"descriptions"`` as a fallback spelling of ``"description"``. Anything
    else yields ``{}``.
    """
    if isinstance(obj, dict) and _is_json_schema(obj):
        return obj
    if isinstance(obj, list) and _is_metadata_list(obj):
        normalized = []
        for item in obj:
            description = item.get("description", item.get("descriptions", ""))
            normalized_item = {
                "key": item.get("key"),
                "description": description,
            }
            if "enum" in item:
                normalized_item["enum"] = item["enum"]
            normalized.append(normalized_item)
        return metadata_schema(normalized)
    return {}