From 406b36a4529dfa765b10d32a355fa8d74cbb7dee Mon Sep 17 00:00:00 2001 From: Shiyao Huang <102647710+Shiyao-Huang@users.noreply.github.com> Date: Wed, 6 May 2026 14:28:25 +0800 Subject: [PATCH] fix(#14389): normalize list metadata values for in filters (#14410) ## Summary - normalize string items for list-valued metadata filters in `meta_filter` - fix `in` / `not in` case asymmetry when document metadata is lowercased but filter list values are not - add regression tests that cover the original issue scenario using uppercase list values ## Validation - `PYTHONPATH=external/ragflow pytest external/ragflow/test/unit_test/common/test_metadata_filter_operators.py -q` ## Notes - I commented on #14389 before opening this PR to claim the issue. - The new tests use `value=["F2", "F11"]` so they fail on the old implementation and pass with this fix. - This also benefits other non-comparison operators that flow through the same normalization path. Co-authored-by: copizza Co-authored-by: Wang Qi --- common/metadata_utils.py | 17 +++++++++-------- .../common/test_metadata_filter_operators.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/common/metadata_utils.py b/common/metadata_utils.py index f767b3bd5..79db193eb 100644 --- a/common/metadata_utils.py +++ b/common/metadata_utils.py @@ -42,6 +42,13 @@ def convert_conditions(metadata_condition): def meta_filter(metas: dict, filters: list[dict], logic: str = "and"): doc_ids = set([]) + def normalize_string_values(value): + if isinstance(value, str): + return value.lower() + if isinstance(value, list): + return [item.lower() if isinstance(item, str) else item for item in value] + return value + def filter_out(v2docs, operator, value): ids = [] for input, docids in v2docs.items(): @@ -96,14 +103,8 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"): value = value.lower() else: # Non-comparison operators: maintain original logic - if isinstance(input, str): - input = input.lower() - elif operator in ("in", "not in") and isinstance(input, list): - input = [x.lower() if isinstance(x, str) else x for x in input] - if isinstance(value, str): - value = value.lower() - elif operator in ("in", "not in") and isinstance(value, list): - value = [x.lower() if isinstance(x, str) else x for x in value] + input = normalize_string_values(input) + value = normalize_string_values(value) matched = False try: diff --git a/test/unit_test/common/test_metadata_filter_operators.py b/test/unit_test/common/test_metadata_filter_operators.py index 90ee64e31..23f4c2b64 100644 --- a/test/unit_test/common/test_metadata_filter_operators.py +++ b/test/unit_test/common/test_metadata_filter_operators.py @@ -33,6 +33,20 @@ def test_not_in_operator(): assert meta_filter(metas, filters) == ["doc3"] +def test_in_operator_with_list_value_is_case_insensitive(): + metas = {"product": {"F2": ["doc1"], "F11": ["doc2"], "G1": ["doc3"]}} + filters = [{"key": "product", "op": "in", "value": ["F2", "F11"]}] + + assert set(meta_filter(metas, filters)) == {"doc1", "doc2"} + + +def test_not_in_operator_with_list_value_is_case_insensitive(): + metas = {"product": {"F2": ["doc1"], "F11": ["doc2"], "G1": ["doc3"]}} + filters = [{"key": "product", "op": "not in", "value": ["F2", "F11"]}] + + assert meta_filter(metas, filters) == ["doc3"] + + def test_start_with(): # returns chunk where the metadata starts with the value metas = {"name": {"prefix_value": ["doc1"], "other": ["doc2"]}}