# ragflow/example/sdk/retrieval_example.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""
The example demonstrates the RAG retrieval flow using the Python SDK.
It shows how to perform semantic search across one or more datasets.
"""

from ragflow_sdk import RAGFlow
import sys
import time
import os

# Connection settings for the RAGFlow server. Each value is read from the
# environment and falls back to the literal default below when the variable
# is not set.
HOST_ADDRESS = os.getenv("RAGFLOW_HOST_ADDRESS", default="http://127.0.0.1")
API_KEY = os.getenv(
    "RAGFLOW_API_KEY",
    default="ragflow-IzZmY1MGVhYTBhMjExZWZiYTdjMDI0Mm",
)

# Main flow of the example: create a dataset, upload and parse one document,
# run two retrieval queries against it, then delete the dataset.
#
# Exit codes: 0 on success, -1 on a parsing timeout or any error.
# The dataset created here is deleted on every exit path (success, timeout,
# or exception) so repeated runs do not leave example datasets on the server.

MAX_WAIT = 120  # seconds to wait for parsing before giving up
POLL_INTERVAL = 2  # seconds between status checks

# NOTE(review): the SDK reference documents `Document.run` as one of
# "UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"; the numeric strings are
# accepted as well in case a server reports raw task-status codes
# ("2"/"3"/"4" presumed to mean cancel/done/fail) -- confirm against the
# deployed RAGFlow version.
PARSE_DONE_STATES = {"DONE", "3"}
PARSE_FAILED_STATES = {"FAIL", "4", "CANCEL", "2"}

try:
    rag = RAGFlow(api_key=API_KEY, base_url=HOST_ADDRESS)
    dataset = None

    try:
        # 1. Create a dataset
        print("Creating dataset...")
        dataset = rag.create_dataset(name="retrieval_example_dataset")

        # 2. Upload and parse a document to have content for retrieval
        print("Uploading and parsing document...")
        content = "RAGFlow is an open-source RAG engine based on deep document understanding. It features a streamlined RAG workflow for businesses of any size."
        docs = dataset.upload_documents([{"display_name": "ragflow_info.txt", "blob": content.encode('utf-8')}])
        if not docs:
            raise RuntimeError("Upload returned no documents.")
        doc = docs[0]

        # Wait for parsing to complete with timeout. A monotonic deadline is
        # used so that time spent inside the status requests counts towards
        # the limit, not only the time spent sleeping.
        print("Parsing document...")
        dataset.async_parse_documents([doc.id])
        deadline = time.monotonic() + MAX_WAIT
        while time.monotonic() < deadline:
            matches = dataset.list_documents(id=doc.id)
            if not matches:
                raise RuntimeError(f"Document {doc.id} not found while waiting for parsing.")
            doc_status = matches[0]
            progress = doc_status.progress or 0.0

            # Stop immediately on failure instead of waiting out the timeout.
            # NOTE(review): a negative progress is presumed to signal a failed
            # parse -- confirm against the server behaviour.
            if doc_status.run in PARSE_FAILED_STATES or progress < 0:
                raise RuntimeError(f"Document parsing failed (run={doc_status.run}, progress={progress}).")

            # Finished either when the status says so or when progress has
            # reached 100% (the latter keeps the original progress check).
            if doc_status.run in PARSE_DONE_STATES or progress >= 1.0:
                break

            print(f"Parsing progress: {progress:.2f}")
            time.sleep(POLL_INTERVAL)
        else:
            # Loop ended without `break`: the deadline passed first.
            # SystemExit is not an `Exception`, so it skips the outer handler
            # but still runs the `finally` cleanup below.
            print("Parsing timed out.")
            sys.exit(-1)
        print("Document parsed and ready for retrieval.")

        # 3. Perform retrieval (Semantic Search)
        print("\n--- Performing Retrieval ---")
        question = "What is RAGFlow?"
        print(f"Question: {question}")

        # Retrieve relevant chunks from one or more datasets
        chunks = rag.retrieve(
            dataset_ids=[dataset.id],
            question=question,
            top_k=5,
            similarity_threshold=0.1,
        )

        print(f"Found {len(chunks)} relevant chunks:")
        for i, chunk in enumerate(chunks, start=1):
            print(f"\nChunk {i}:")
            print(f"Content: {chunk.content[:200]}...")
            print(f"Similarity Score: {chunk.similarity:.4f}")
            print(f"Source Document: {chunk.document_name}")

        # 4. Perform retrieval with additional parameters
        print("\n--- Performing Retrieval with Keyword Search ---")
        chunks = rag.retrieve(
            dataset_ids=[dataset.id],
            question="workflow for businesses",
            top_k=3,
            keyword=True,  # Enable keyword search in addition to semantic search
        )
        for i, chunk in enumerate(chunks, start=1):
            print(f"Chunk {i}: {chunk.content[:100]}... (Score: {chunk.similarity:.4f})")

    finally:
        # Cleanup runs on success, timeout and error alike. A cleanup failure
        # is reported but must not mask the original outcome.
        if dataset is not None:
            print("\nCleaning up...")
            try:
                rag.delete_datasets(ids=[dataset.id])
            except Exception as cleanup_error:
                print(f"Cleanup failed: {cleanup_error}")

    print("Retrieval example done.")
    sys.exit(0)

except Exception as e:
    print(f"An error occurred: {e}")
    sys.exit(-1)