ragflow/common/data_source/dingtalk_ai_table_connector.py

"""DingTalk AI Table connector for RAGFlow. By the way, "notable" is a reference to the DingTalk AI Table.

This connector ingests records from DingTalk AI Table as documents.
It first retrieves all sheets from a specified table, then fetches all records
from each sheet.

API Documentation:
- GetAllSheets: https://open.dingtalk.com/document/development/api-notable-getallsheets
- ListRecords: https://open.dingtalk.com/document/development/api-notable-listrecords
"""

import json
import logging
from datetime import datetime, timezone
from typing import Any

from alibabacloud_dingtalk.notable_1_0.client import Client as NotableClient
from alibabacloud_dingtalk.notable_1_0 import models as notable_models
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient

from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource
from common.data_source.exceptions import ConnectorMissingCredentialError, ConnectorValidationError
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync
from common.data_source.models import Document, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument

logger = logging.getLogger(__name__)

# Document ID prefix for DingTalk Notable
_DINGTALK_AI_TABLE_DOC_ID_PREFIX = "dingtalk_ai_table:"


class DingTalkAITableClientNotSetUpError(PermissionError):
    """Exception raised when DingTalk Notable client is not initialized."""

    def __init__(self) -> None:
        super().__init__("DingTalk Notable client is not set up. Did you forget to call load_credentials()?")


class DingTalkAITableConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
    """
    DingTalk AI Table (Notable) connector for accessing table records.

    This connector:
    1. Retrieves all sheets from a specified Notable table using GetAllSheets API
    2. For each sheet, fetches all records using ListRecords API with pagination
    3. Converts each record into a Document for RAGFlow ingestion

    Required credentials:
    - access_token: DingTalk access token (x-acs-dingtalk-access-token)
    - operator_id: User's unionId for API calls

    Configuration:
    - table_id: The Notable table ID (e.g., 'qnYxxx')
    """

    def __init__(
        self,
        table_id: str,
        operator_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """
        Initialize the DingTalk Notable connector.

        Args:
            table_id: The Notable table ID
            operator_id: User's unionId for API calls
            batch_size: Number of records per batch for document generation
        """
        self.table_id = table_id
        self.operator_id = operator_id
        self.batch_size = batch_size
        self._client: NotableClient | None = None
        self._access_token: str | None = None

    def _document_id(self, sheet_id: str, record_id: str) -> str:
        return f"{_DINGTALK_AI_TABLE_DOC_ID_PREFIX}{self.table_id}:{sheet_id}:{record_id}"

    def _create_client(self) -> NotableClient:
        """Create DingTalk Notable API client."""
        config = open_api_models.Config()
        config.protocol = "https"
        config.region_id = "central"
        return NotableClient(config)

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """
        Load DingTalk credentials.

        Args:
            credentials: Dictionary containing 'access_token'

        Returns:
            None
        """
        access_token = credentials.get("access_token")
        if not access_token:
            raise ConnectorMissingCredentialError("DingTalk access_token is required")

        self._access_token = access_token
        self._client = self._create_client()
        return None

    @property
    def client(self) -> NotableClient:
        """Get the DingTalk AITable client."""
        if self._client is None:
            raise DingTalkAITableClientNotSetUpError()
        return self._client

    @property
    def access_token(self) -> str:
        """Get the access token."""
        if self._access_token is None:
            raise ConnectorMissingCredentialError("DingTalk access_token not loaded")
        return self._access_token

    def validate_connector_settings(self) -> None:
        """Validate DingTalk connector settings by trying to get all sheets."""
        if self._client is None or self._access_token is None:
            raise ConnectorMissingCredentialError("DingTalk Notable")

        try:
            # Try to get sheets to validate credentials
            headers = notable_models.GetAllSheetsHeaders()
            headers.x_acs_dingtalk_access_token = self._access_token

            request = notable_models.GetAllSheetsRequest(
                operator_id=self.operator_id,
            )

            self.client.get_all_sheets_with_options(
                self.table_id,
                request,
                headers,
                util_models.RuntimeOptions(),
            )
        except Exception as e:
            logger.exception("[DingTalk Notable]: Failed to validate credentials")
            raise ConnectorValidationError(f"DingTalk Notable credential validation failed: {e}")

    def _get_all_sheets(self) -> list[dict[str, Any]]:
        """
        Retrieve all sheets from the Notable table.

        Returns:
            List of sheet information dictionaries
        """
        headers = notable_models.GetAllSheetsHeaders()
        headers.x_acs_dingtalk_access_token = self._access_token

        request = notable_models.GetAllSheetsRequest(
            operator_id=self.operator_id,
        )

        try:
            response = self.client.get_all_sheets_with_options(
                self.table_id,
                request,
                headers,
                util_models.RuntimeOptions(),
            )

            sheets = []
            if response.body and response.body.value:
                for sheet in response.body.value:
                    sheets.append(
                        {
                            "id": sheet.id,
                            "name": sheet.name,
                        }
                    )

            logger.info(f"[DingTalk Notable]: Found {len(sheets)} sheets in table {self.table_id}")
            return sheets

        except Exception as e:
            logger.exception(f"[DingTalk Notable]: Failed to get sheets: {e}")
            raise

    def _list_records(
        self,
        sheet_id: str,
        next_token: str | None = None,
        max_results: int = 100,
    ) -> tuple[list[dict[str, Any]], str | None]:
        """
        List records from a specific sheet with pagination.

        Args:
            sheet_id: The sheet ID
            next_token: Token for pagination
            max_results: Maximum number of results per page

        Returns:
            Tuple of (records list, next_token or None if no more)
        """
        headers = notable_models.ListRecordsHeaders()
        headers.x_acs_dingtalk_access_token = self._access_token

        request = notable_models.ListRecordsRequest(
            operator_id=self.operator_id,
            max_results=max_results,
            next_token=next_token or "",
        )

        try:
            response = self.client.list_records_with_options(
                self.table_id,
                sheet_id,
                request,
                headers,
                util_models.RuntimeOptions(),
            )

            records = []
            new_next_token = None

            if response.body:
                if response.body.records:
                    for record in response.body.records:
                        records.append(
                            {
                                "id": record.id,
                                "fields": record.fields,
                            }
                        )
                if response.body.next_token:
                    new_next_token = response.body.next_token

            return records, new_next_token

        except Exception as e:
            if not UtilClient.empty(getattr(e, "code", None)) and not UtilClient.empty(getattr(e, "message", None)):
                logger.error(f"[DingTalk AITable]: API error - code: {e.code}, message: {e.message}")
            raise

    def _get_all_records(self, sheet_id: str) -> list[dict[str, Any]]:
        """
        Retrieve all records from a sheet with pagination.

        Args:
            sheet_id: The sheet ID

        Returns:
            List of all records
        """
        all_records = []
        next_token = None

        while True:
            records, next_token = self._list_records(
                sheet_id=sheet_id,
                next_token=next_token,
            )
            all_records.extend(records)

            if not next_token:
                break

        logger.info(f"[DingTalk Notable]: Retrieved {len(all_records)} records from sheet {sheet_id}")
        return all_records

    def _convert_record_to_document(
        self,
        record: dict[str, Any],
        sheet_id: str,
        sheet_name: str,
    ) -> Document:
        """
        Convert a Notable record to a Document.

        Args:
            record: The record dictionary
            sheet_id: The sheet ID
            sheet_name: The sheet name

        Returns:
            Document object
        """
        record_id = record.get("id", "unknown")
        fields = record.get("fields", {})

        doc_id = self._document_id(sheet_id, str(record_id))

        # Convert fields to JSON string for blob content
        content = json.dumps(fields, ensure_ascii=False, indent=2)
        blob = content.encode("utf-8")

        # Create semantic identifier from record fields
        # Try to find a meaningful title/name field
        semantic_identifier = f"{sheet_name} - Record {record_id}"

        # Try to find a title-like field
        for key, value in fields.items():
            if isinstance(value, str) and len(value) > 0 and len(value) < 100:
                semantic_identifier = f"{sheet_name} - {value[:50]}"
                break

        # Metadata
        metadata: dict[str, str | list[str]] = {
            "table_id": self.table_id,
            "sheet_id": sheet_id,
            "sheet_name": sheet_name,
            "record_id": record_id,
        }

        # Create document
        doc = Document(
            id=doc_id,
            source=DocumentSource.DINGTALK_AI_TABLE,
            semantic_identifier=semantic_identifier,
            extension=".json",
            blob=blob,
            size_bytes=len(blob),
            doc_updated_at=datetime.now(timezone.utc),
            metadata=metadata,
        )

        return doc

    def retrieve_all_slim_docs_perm_sync(
        self,
        callback: Any = None,
    ) -> GenerateSlimDocumentOutput:
        """
        Enumerate current record IDs for all sheets without building document blobs.

        IDs match :meth:`_convert_record_to_document` / full ingest.
        """
        del callback
        logger.info(
            "[DingTalk Notable]: slim snapshot table_id=%s operator_id=%s",
            self.table_id,
            self.operator_id,
        )
        sheets = self._get_all_sheets()
        batch: list[SlimDocument] = []
        for sheet in sheets:
            sheet_id = sheet["id"]
            next_token: str | None = None
            while True:
                records, next_token = self._list_records(
                    sheet_id=sheet_id,
                    next_token=next_token,
                )
                for record in records:
                    rid = record.get("id")
                    if not rid:
                        continue
                    batch.append(SlimDocument(id=self._document_id(sheet_id, str(rid))))
                    if len(batch) >= self.batch_size:
                        yield batch
                        batch = []
                if not next_token:
                    break
        if batch:
            yield batch

    def _yield_documents_from_table(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
    ) -> GenerateDocumentsOutput:
        """
        Yield documents from all sheets in the table.

        Args:
            start: Optional start timestamp for filtering
            end: Optional end timestamp for filtering

        Yields:
            Lists of Document objects
        """
        # Get all sheets
        sheets = self._get_all_sheets()

        batch: list[Document] = []

        for sheet in sheets:
            sheet_id = sheet["id"]
            sheet_name = sheet["name"]

            # Get all records from this sheet
            records = self._get_all_records(sheet_id)

            for record in records:
                doc = self._convert_record_to_document(
                    record=record,
                    sheet_id=sheet_id,
                    sheet_name=sheet_name,
                )

                # Apply time filtering if specified
                if start is not None or end is not None:
                    doc_time = doc.doc_updated_at.timestamp() if doc.doc_updated_at else None
                    if doc_time is not None:
                        if start is not None and doc_time < start:
                            continue
                        if end is not None and doc_time > end:
                            continue

                batch.append(doc)

                if len(batch) >= self.batch_size:
                    yield batch
                    batch = []

        if batch:
            yield batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """
        Load all documents from the DingTalk Notable table.

        Yields:
            Lists of Document objects
        """
        return self._yield_documents_from_table()

    def poll_source(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
    ) -> GenerateDocumentsOutput:
        """
        Poll for documents within a time range.

        Args:
            start: Start timestamp
            end: End timestamp

        Yields:
            Lists of Document objects
        """
        return self._yield_documents_from_table(start=start, end=end)


if __name__ == "__main__":
    import os

    logging.basicConfig(level=logging.DEBUG)

    # Example usage
    table_id = os.environ.get("DINGTALK_AI_TABLE_BASE_ID", "")
    operator_id = os.environ.get("DINGTALK_OPERATOR_ID", "")
    access_token = os.environ.get("DINGTALK_ACCESS_TOKEN", "")

    if not all([table_id, operator_id, access_token]):
        print("Please set DINGTALK_AI_TABLE_BASE_ID, DINGTALK_OPERATOR_ID, and DINGTALK_ACCESS_TOKEN environment variables")
        exit(1)

    connector = DingTalkAITableConnector(
        table_id=table_id,
        operator_id=operator_id,
    )
    connector.load_credentials({"access_token": access_token})

    try:
        connector.validate_connector_settings()
        print("Connector settings validated successfully")
    except Exception as e:
        print(f"Validation failed: {e}")
        exit(1)

    document_batches = connector.load_from_state()
    try:
        first_batch = next(document_batches)
        print(f"Loaded {len(first_batch)} documents in first batch.")
        for doc in first_batch[:5]:  # Print first 5 docs
            print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")
            print(f"  Metadata: {doc.metadata}")
    except StopIteration:
        print("No documents available in DingTalk Notable table.")