Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns identically — every column is both vectorized (embedded in chunk text) and stored as filterable metadata. There's no way for users to control which columns should be searchable by semantic meaning versus which should only be filterable attributes.

For example, when ingesting a news-articles CSV with columns like title, content, country, category, source, etc., the embedding includes metadata fields like country: Brazil and source: Reuters in the chunk text, which dilutes the semantic quality of the embedding without adding retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns / metadata_columns, but this capability was missing for file-based table ingestion. This PR adds column-level control (indexing / metadata / both; the legacy role value vectorize is still accepted and normalized to indexing) for the table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-06-08 08:07:21 +08:00 · 2026-05-11 07:06:04 +05:00
parent 889aba6a32
commit 3c4d1da98f
13 changed files with 1270 additions and 36 deletions
--- a/api/utils/validation_utils.py
+++ b/api/utils/validation_utils.py
@ -377,6 +377,9 @@ class AutoMetadataConfig(Base):
    built_in_metadata: Annotated[list[AutoMetadataField], Field(default_factory=list)]


+TableColumnRole = Literal["indexing", "metadata", "both"]
+
+
 class ParserConfig(Base):
    auto_keywords: Annotated[int, Field(default=0, ge=0, le=32)]
    auto_questions: Annotated[int, Field(default=0, ge=0, le=10)]
@ -393,6 +396,25 @@ class ParserConfig(Base):
    task_page_size: Annotated[int | None, Field(default=None, ge=1)]
    pages: Annotated[list[list[int]] | None, Field(default=None)]
    ext: Annotated[dict, Field(default={})]
+    # Table parser column-role settings. Absence (no mode/roles configured) => all columns "both".
+    # Table parser: "auto" = all columns both (default), "manual" = use table_column_roles. None → treated as "auto".
+    table_column_mode: Annotated[Literal["auto", "manual"] | None, Field(default=None)]
+    # Table parser: column name -> "indexing" | "metadata" | "both". Used only when table_column_mode == "manual".
+    table_column_roles: Annotated[dict[str, TableColumnRole] | None, Field(default=None)]
+    # Table parser: list of column names (set by backend after first parse; used by frontend for role selector).
+    table_column_names: Annotated[list[str] | None, Field(default=None)]
+
+    @field_validator("table_column_roles", mode="before")
+    @classmethod
+    def legacy_vectorize_table_column_role(cls, v: Any) -> Any:
+        """Normalize legacy role value *vectorize* to *indexing* (chunk text + full-text search)."""
+        if v is None or not isinstance(v, dict):
+            return v
+        out: dict[str, Any] = {}
+        for key, val in v.items():
+            k = key if isinstance(key, str) else str(key)
+            out[k] = "indexing" if val == "vectorize" else val
+        return out


 class UpdateDocumentReq(Base):