mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-19 11:45:10 +08:00
### What problem does this PR solve? This PR adds a dedicated HTTP benchmark CLI for RAGFlow chat and retrieval endpoints so we can measure latency/QPS. ### Type of change - [x] Documentation Update - [x] Other (please describe): Adds a CLI benchmarking tool for chat/retrieval latency/QPS --------- Co-authored-by: Liu An <asiro@qq.com>
106 lines
3.2 KiB
Python
106 lines
3.2 KiB
Python
from typing import Dict, List, Optional
|
|
|
|
|
|
def _fmt_seconds(value: Optional[float]) -> str:
|
|
if value is None:
|
|
return "n/a"
|
|
return f"{value:.4f}s"
|
|
|
|
|
|
def _fmt_ms(value: Optional[float]) -> str:
|
|
if value is None:
|
|
return "n/a"
|
|
return f"{value * 1000.0:.2f}ms"
|
|
|
|
|
|
def _fmt_qps(qps: Optional[float]) -> str:
|
|
if qps is None or qps <= 0:
|
|
return "n/a"
|
|
return f"{qps:.2f}"
|
|
|
|
|
|
def _calc_qps(total_duration_s: Optional[float], total_requests: int) -> Optional[float]:
|
|
if total_duration_s is None or total_duration_s <= 0:
|
|
return None
|
|
return total_requests / total_duration_s
|
|
|
|
|
|
def render_report(lines: List[str]) -> str:
    """Join report lines into a single newline-terminated string.

    Args:
        lines: The report lines, in output order.

    Returns:
        The lines joined with ``"\\n"``, with leading and trailing whitespace
        of the joined text stripped, followed by exactly one ``"\\n"``.
        An empty list yields ``"\\n"``.
    """
    return "\n".join(lines).strip() + "\n"
|
|
|
|
|
|
def chat_report(
    *,
    interface: str,
    concurrency: int,
    total_duration_s: Optional[float],
    iterations: int,
    success: int,
    failure: int,
    model: str,
    total_stats: Dict[str, Optional[float]],
    first_token_stats: Dict[str, Optional[float]],
    errors: List[str],
    created: Dict[str, str],
) -> str:
    """Build the plain-text report for a chat benchmark run.

    Args:
        interface: Value printed on the ``Interface`` line.
        concurrency: Value printed on the ``Concurrency`` line.
        total_duration_s: Duration of the whole run; printed via
            ``_fmt_seconds`` and used as the QPS denominator.
        iterations: Value printed on the ``Iterations`` line; also the QPS
            numerator (``success`` is not used for QPS).
        success: Value printed on the ``Success`` line.
        failure: Value printed on the ``Failure`` line.
        model: Value printed on the ``Model`` line.
        total_stats: Latency statistics for the full response, read under the
            keys ``avg``, ``min``, ``p50``, ``p90`` and ``p95``.
        first_token_stats: Latency statistics for the first token, read under
            the same keys.
        errors: Error messages; only the first five are printed, joined
            with ``"; "``. The line is omitted when this is empty.
        created: Extra ``key: value`` lines appended after the header, in
            the mapping's iteration order.

    Returns:
        The report text as produced by ``render_report``.
    """

    def _latency_summary(stats: Dict[str, Optional[float]]) -> str:
        # .get() so an empty or partial stats dict prints "n/a" for the
        # missing figures instead of raising KeyError and losing the report.
        return ", ".join(
            f"{name}={_fmt_ms(stats.get(name))}"
            for name in ("avg", "min", "p50", "p90", "p95")
        )

    lines = [
        f"Interface: {interface}",
        f"Concurrency: {concurrency}",
        f"Iterations: {iterations}",
        f"Success: {success}",
        f"Failure: {failure}",
        f"Model: {model}",
    ]
    lines.extend(f"{key}: {value}" for key, value in created.items())
    lines.extend(
        [
            f"Latency (total): {_latency_summary(total_stats)}",
            f"Latency (first token): {_latency_summary(first_token_stats)}",
            f"Total Duration: {_fmt_seconds(total_duration_s)}",
            f"QPS (requests / total duration): {_fmt_qps(_calc_qps(total_duration_s, iterations))}",
        ]
    )
    if errors:
        # Cap the error list so a failing run does not flood the report.
        lines.append("Errors: " + "; ".join(errors[:5]))
    return render_report(lines)
|
|
|
|
|
|
def retrieval_report(
    *,
    interface: str,
    concurrency: int,
    total_duration_s: Optional[float],
    iterations: int,
    success: int,
    failure: int,
    stats: Dict[str, Optional[float]],
    errors: List[str],
    created: Dict[str, str],
) -> str:
    """Build the plain-text report for a retrieval benchmark run.

    Args:
        interface: Value printed on the ``Interface`` line.
        concurrency: Value printed on the ``Concurrency`` line.
        total_duration_s: Duration of the whole run; printed via
            ``_fmt_seconds`` and used as the QPS denominator.
        iterations: Value printed on the ``Iterations`` line; also the QPS
            numerator (``success`` is not used for QPS).
        success: Value printed on the ``Success`` line.
        failure: Value printed on the ``Failure`` line.
        stats: Latency statistics, read under the keys ``avg``, ``min``,
            ``p50``, ``p90`` and ``p95``.
        errors: Error messages; only the first five are printed, joined
            with ``"; "``. The line is omitted when this is empty.
        created: Extra ``key: value`` lines appended after the header, in
            the mapping's iteration order.

    Returns:
        The report text as produced by ``render_report``.
    """
    # .get() so an empty or partial stats dict prints "n/a" for the missing
    # figures instead of raising KeyError and losing the report.
    latency_summary = ", ".join(
        f"{name}={_fmt_ms(stats.get(name))}"
        for name in ("avg", "min", "p50", "p90", "p95")
    )

    lines = [
        f"Interface: {interface}",
        f"Concurrency: {concurrency}",
        f"Iterations: {iterations}",
        f"Success: {success}",
        f"Failure: {failure}",
    ]
    lines.extend(f"{key}: {value}" for key, value in created.items())
    lines.extend(
        [
            f"Latency: {latency_summary}",
            f"Total Duration: {_fmt_seconds(total_duration_s)}",
            f"QPS (requests / total duration): {_fmt_qps(_calc_qps(total_duration_s, iterations))}",
        ]
    )
    if errors:
        # Cap the error list so a failing run does not flood the report.
        lines.append("Errors: " + "; ".join(errors[:5]))
    return render_report(lines)
|