chore: improve clear workflow_run task (#31124)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
非法操作
2026-01-19 10:58:57 +08:00
committed by GitHub
parent 13d648cf7b
commit b0545635b8
6 changed files with 119 additions and 128 deletions

View File

@@ -862,8 +862,27 @@ def clear_free_plan_tenant_expired_logs(days: int, batch: int, tenant_ids: list[
@click.command("clean-workflow-runs", help="Clean expired workflow runs and related data for free tenants.")
@click.option("--days", default=30, show_default=True, help="Delete workflow runs created before N days ago.")
@click.option(
"--before-days",
"--days",
default=30,
show_default=True,
type=click.IntRange(min=0),
help="Delete workflow runs created before N days ago.",
)
@click.option("--batch-size", default=200, show_default=True, help="Batch size for selecting workflow runs.")
@click.option(
"--from-days-ago",
default=None,
type=click.IntRange(min=0),
help="Lower bound in days ago (older). Must be paired with --to-days-ago.",
)
@click.option(
"--to-days-ago",
default=None,
type=click.IntRange(min=0),
help="Upper bound in days ago (newer). Must be paired with --from-days-ago.",
)
@click.option(
"--start-from",
type=click.DateTime(formats=["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]),
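Worth noting: the new decorator passes two flags to a single click.option, so --days keeps working as a legacy alias of --before-days, and Click infers the parameter name before_days (matching the function signature below) from the first long flag. A minimal standalone sketch of that aliasing behavior; the command name and help text here are illustrative:

import click


@click.command("demo-clean")
@click.option(
    "--before-days",
    "--days",  # legacy alias; both flags feed the same parameter
    default=30,
    show_default=True,
    type=click.IntRange(min=0),
    help="Delete records created before N days ago.",
)
def demo_clean(before_days: int):
    # Click derives the parameter name "before_days" from the first long flag.
    click.echo(f"cutoff: {before_days} days")


if __name__ == "__main__":
    demo_clean()

Invoking demo-clean --days 7 and demo-clean --before-days 7 are then equivalent.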
@@ -882,8 +901,10 @@ def clear_free_plan_tenant_expired_logs(days: int, batch: int, tenant_ids: list[
help="Preview cleanup results without deleting any workflow run data.",
)
def clean_workflow_runs(
days: int,
before_days: int,
batch_size: int,
from_days_ago: int | None,
to_days_ago: int | None,
start_from: datetime.datetime | None,
end_before: datetime.datetime | None,
dry_run: bool,
@@ -894,11 +915,24 @@ def clean_workflow_runs(
if (start_from is None) ^ (end_before is None):
raise click.UsageError("--start-from and --end-before must be provided together.")
if (from_days_ago is None) ^ (to_days_ago is None):
raise click.UsageError("--from-days-ago and --to-days-ago must be provided together.")
if from_days_ago is not None and to_days_ago is not None:
if start_from or end_before:
raise click.UsageError("Choose either day offsets or explicit dates, not both.")
if from_days_ago <= to_days_ago:
raise click.UsageError("--from-days-ago must be greater than --to-days-ago.")
now = datetime.datetime.now()
start_from = now - datetime.timedelta(days=from_days_ago)
end_before = now - datetime.timedelta(days=to_days_ago)
before_days = 0
start_time = datetime.datetime.now(datetime.UTC)
click.echo(click.style(f"Starting workflow run cleanup at {start_time.isoformat()}.", fg="white"))
WorkflowRunCleanup(
days=days,
days=before_days,
batch_size=batch_size,
start_from=start_from,
end_before=end_before,
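The validation uses ^ (XOR) so that supplying exactly one option of a pair fails fast, and the day offsets are converted into the same start_from/end_before window the explicit-date path uses. A condensed sketch of that logic outside Click, with resolve_window as an illustrative name:

import datetime


def resolve_window(
    from_days_ago: int | None,
    to_days_ago: int | None,
) -> tuple[datetime.datetime, datetime.datetime] | None:
    # XOR: exactly one of the pair being set is a usage error.
    if (from_days_ago is None) ^ (to_days_ago is None):
        raise ValueError("--from-days-ago and --to-days-ago must be provided together.")
    if from_days_ago is None or to_days_ago is None:
        return None
    if from_days_ago <= to_days_ago:
        raise ValueError("--from-days-ago must be greater than --to-days-ago.")
    now = datetime.datetime.now()
    # "from" is the older bound, so it subtracts the larger day count.
    return now - datetime.timedelta(days=from_days_ago), now - datetime.timedelta(days=to_days_ago)

resolve_window(40, 10) yields a window covering runs between 40 and 10 days old; resolve_window(40, None) raises.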

View File

@@ -0,0 +1,35 @@
"""change workflow node execution workflow_run index
Revision ID: 288345cd01d1
Revises: 3334862ee907
Create Date: 2026-01-16 17:15:00.000000
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "288345cd01d1"
down_revision = "3334862ee907"
branch_labels = None
depends_on = None
def upgrade():
with op.batch_alter_table("workflow_node_executions", schema=None) as batch_op:
batch_op.drop_index("workflow_node_execution_workflow_run_idx")
batch_op.create_index(
"workflow_node_execution_workflow_run_id_idx",
["workflow_run_id"],
unique=False,
)
def downgrade():
with op.batch_alter_table("workflow_node_executions", schema=None) as batch_op:
batch_op.drop_index("workflow_node_execution_workflow_run_id_idx")
batch_op.create_index(
"workflow_node_execution_workflow_run_idx",
["tenant_id", "app_id", "workflow_id", "triggered_from", "workflow_run_id"],
unique=False,
)
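The migration swaps the five-column composite index for a single-column index on workflow_run_id, which matches the new run-id-only filters in the repository. A quick sanity check after alembic upgrade head, sketched with SQLAlchemy's inspector (the connection URL is a placeholder):

from sqlalchemy import create_engine, inspect

# Placeholder URL; point it at the actual database.
engine = create_engine("postgresql://user:pass@localhost/dify")
index_names = {ix["name"] for ix in inspect(engine).get_indexes("workflow_node_executions")}
assert "workflow_node_execution_workflow_run_id_idx" in index_names
assert "workflow_node_execution_workflow_run_idx" not in index_names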

View File

@@ -781,11 +781,7 @@ class WorkflowNodeExecutionModel(Base): # This model is expected to have `offlo
return (
PrimaryKeyConstraint("id", name="workflow_node_execution_pkey"),
Index(
"workflow_node_execution_workflow_run_idx",
"tenant_id",
"app_id",
"workflow_id",
"triggered_from",
"workflow_node_execution_workflow_run_id_idx",
"workflow_run_id",
),
Index(

View File

@@ -13,6 +13,8 @@ from collections.abc import Sequence
from datetime import datetime
from typing import Protocol
from sqlalchemy.orm import Session
from core.workflow.repositories.workflow_node_execution_repository import WorkflowNodeExecutionRepository
from models.workflow import WorkflowNodeExecutionModel
@@ -130,6 +132,18 @@ class DifyAPIWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository, Pr
"""
...
def count_by_runs(self, session: Session, run_ids: Sequence[str]) -> tuple[int, int]:
"""
Count node executions and offloads for the given workflow run ids.
"""
...
def delete_by_runs(self, session: Session, run_ids: Sequence[str]) -> tuple[int, int]:
"""
Delete node executions and offloads for the given workflow run ids.
"""
...
def delete_executions_by_app(
self,
tenant_id: str,
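Both new protocol methods return a (node_executions, offloads) pair, which lets callers preview counts before committing to deletion. A usage sketch against any implementation of the protocol; preview_then_delete is an illustrative helper, and transaction handling by the caller is an assumption here:

from collections.abc import Sequence

from sqlalchemy.orm import Session

from repositories.api_workflow_node_execution_repository import DifyAPIWorkflowNodeExecutionRepository


def preview_then_delete(
    repo: DifyAPIWorkflowNodeExecutionRepository,
    session: Session,
    run_ids: Sequence[str],
    dry_run: bool = True,
) -> tuple[int, int]:
    # Both methods report (node_executions, offloads).
    if dry_run:
        return repo.count_by_runs(session, run_ids)
    return repo.delete_by_runs(session, run_ids)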

View File

@@ -7,17 +7,15 @@ using SQLAlchemy 2.0 style queries for WorkflowNodeExecutionModel operations.
from collections.abc import Sequence
from datetime import datetime
from typing import TypedDict, cast
from typing import cast
from sqlalchemy import asc, delete, desc, func, select, tuple_
from sqlalchemy import asc, delete, desc, func, select
from sqlalchemy.engine import CursorResult
from sqlalchemy.orm import Session, sessionmaker
from models.enums import WorkflowRunTriggeredFrom
from models.workflow import (
WorkflowNodeExecutionModel,
WorkflowNodeExecutionOffload,
WorkflowNodeExecutionTriggeredFrom,
)
from repositories.api_workflow_node_execution_repository import DifyAPIWorkflowNodeExecutionRepository
@@ -49,26 +47,6 @@ class DifyAPISQLAlchemyWorkflowNodeExecutionRepository(DifyAPIWorkflowNodeExecut
"""
self._session_maker = session_maker
@staticmethod
def _map_run_triggered_from_to_node_triggered_from(triggered_from: str) -> str:
"""
Map workflow run triggered_from values to workflow node execution triggered_from values.
"""
if triggered_from in {
WorkflowRunTriggeredFrom.APP_RUN.value,
WorkflowRunTriggeredFrom.DEBUGGING.value,
WorkflowRunTriggeredFrom.SCHEDULE.value,
WorkflowRunTriggeredFrom.PLUGIN.value,
WorkflowRunTriggeredFrom.WEBHOOK.value,
}:
return WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN.value
if triggered_from in {
WorkflowRunTriggeredFrom.RAG_PIPELINE_RUN.value,
WorkflowRunTriggeredFrom.RAG_PIPELINE_DEBUGGING.value,
}:
return WorkflowNodeExecutionTriggeredFrom.RAG_PIPELINE_RUN.value
return ""
def get_node_last_execution(
self,
tenant_id: str,
@@ -316,51 +294,16 @@ class DifyAPISQLAlchemyWorkflowNodeExecutionRepository(DifyAPIWorkflowNodeExecut
session.commit()
return result.rowcount
class RunContext(TypedDict):
run_id: str
tenant_id: str
app_id: str
workflow_id: str
triggered_from: str
@staticmethod
def delete_by_runs(session: Session, runs: Sequence[RunContext]) -> tuple[int, int]:
def delete_by_runs(self, session: Session, run_ids: Sequence[str]) -> tuple[int, int]:
"""
Delete node executions (and offloads) for the given workflow runs using indexed columns.
Uses the composite index on (tenant_id, app_id, workflow_id, triggered_from, workflow_run_id)
by filtering on those columns with tuple IN.
Delete node executions (and offloads) for the given workflow runs using workflow_run_id.
"""
if not runs:
if not run_ids:
return 0, 0
tuple_values = [
(
run["tenant_id"],
run["app_id"],
run["workflow_id"],
DifyAPISQLAlchemyWorkflowNodeExecutionRepository._map_run_triggered_from_to_node_triggered_from(
run["triggered_from"]
),
run["run_id"],
)
for run in runs
]
node_execution_ids = session.scalars(
select(WorkflowNodeExecutionModel.id).where(
tuple_(
WorkflowNodeExecutionModel.tenant_id,
WorkflowNodeExecutionModel.app_id,
WorkflowNodeExecutionModel.workflow_id,
WorkflowNodeExecutionModel.triggered_from,
WorkflowNodeExecutionModel.workflow_run_id,
).in_(tuple_values)
)
).all()
if not node_execution_ids:
return 0, 0
run_ids = list(run_ids)
run_id_filter = WorkflowNodeExecutionModel.workflow_run_id.in_(run_ids)
node_execution_ids = select(WorkflowNodeExecutionModel.id).where(run_id_filter)
offloads_deleted = (
cast(
@@ -377,55 +320,32 @@ class DifyAPISQLAlchemyWorkflowNodeExecutionRepository(DifyAPIWorkflowNodeExecut
node_executions_deleted = (
cast(
CursorResult,
session.execute(
delete(WorkflowNodeExecutionModel).where(WorkflowNodeExecutionModel.id.in_(node_execution_ids))
),
session.execute(delete(WorkflowNodeExecutionModel).where(run_id_filter)),
).rowcount
or 0
)
return node_executions_deleted, offloads_deleted
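The deletion order matters: offload rows are located through a subquery over node execution ids, so they must be removed while those parent rows still exist; only then are the executions themselves deleted by workflow_run_id. A self-contained toy version of the same two-step pattern (Execution and Offload are stand-in models, not the real ones):

from sqlalchemy import ForeignKey, create_engine, delete, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Execution(Base):  # stand-in for WorkflowNodeExecutionModel
    __tablename__ = "executions"
    id: Mapped[int] = mapped_column(primary_key=True)
    run_id: Mapped[str]


class Offload(Base):  # stand-in for WorkflowNodeExecutionOffload
    __tablename__ = "offloads"
    id: Mapped[int] = mapped_column(primary_key=True)
    execution_id: Mapped[int] = mapped_column(ForeignKey("executions.id"))


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    run_ids = ["run-1", "run-2"]
    # Children first: the id subquery must evaluate while parent rows still exist.
    execution_ids = select(Execution.id).where(Execution.run_id.in_(run_ids))
    session.execute(delete(Offload).where(Offload.execution_id.in_(execution_ids)))
    session.execute(delete(Execution).where(Execution.run_id.in_(run_ids)))
    session.commit()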
@staticmethod
def count_by_runs(session: Session, runs: Sequence[RunContext]) -> tuple[int, int]:
def count_by_runs(self, session: Session, run_ids: Sequence[str]) -> tuple[int, int]:
"""
Count node executions (and offloads) for the given workflow runs using indexed columns.
Count node executions (and offloads) for the given workflow runs using workflow_run_id.
"""
if not runs:
if not run_ids:
return 0, 0
tuple_values = [
(
run["tenant_id"],
run["app_id"],
run["workflow_id"],
DifyAPISQLAlchemyWorkflowNodeExecutionRepository._map_run_triggered_from_to_node_triggered_from(
run["triggered_from"]
),
run["run_id"],
)
for run in runs
]
tuple_filter = tuple_(
WorkflowNodeExecutionModel.tenant_id,
WorkflowNodeExecutionModel.app_id,
WorkflowNodeExecutionModel.workflow_id,
WorkflowNodeExecutionModel.triggered_from,
WorkflowNodeExecutionModel.workflow_run_id,
).in_(tuple_values)
run_ids = list(run_ids)
run_id_filter = WorkflowNodeExecutionModel.workflow_run_id.in_(run_ids)
node_executions_count = (
session.scalar(select(func.count()).select_from(WorkflowNodeExecutionModel).where(tuple_filter)) or 0
session.scalar(select(func.count()).select_from(WorkflowNodeExecutionModel).where(run_id_filter)) or 0
)
node_execution_ids = select(WorkflowNodeExecutionModel.id).where(run_id_filter)
offloads_count = (
session.scalar(
select(func.count())
.select_from(WorkflowNodeExecutionOffload)
.join(
WorkflowNodeExecutionModel,
WorkflowNodeExecutionOffload.node_execution_id == WorkflowNodeExecutionModel.id,
)
.where(tuple_filter)
.where(WorkflowNodeExecutionOffload.node_execution_id.in_(node_execution_ids))
)
or 0
)
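The count path makes the same simplification: instead of joining offloads to executions and filtering on the composite tuple, it filters node_execution_id against an id subquery, which the new single-column index can serve. Sketched against the toy models from the previous example:

from sqlalchemy import func, select
from sqlalchemy.orm import Session

# Reusing the toy Execution/Offload models and engine from the sketch above.
with Session(engine) as session:
    run_ids = ["run-1", "run-2"]
    execution_ids = select(Execution.id).where(Execution.run_id.in_(run_ids))
    executions_count = (
        session.scalar(select(func.count()).select_from(Execution).where(Execution.run_id.in_(run_ids))) or 0
    )
    # IN-subquery filter replaces the old join + composite tuple filter.
    offloads_count = (
        session.scalar(select(func.count()).select_from(Offload).where(Offload.execution_id.in_(execution_ids))) or 0
    )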

View File

@@ -10,9 +10,7 @@ from enums.cloud_plan import CloudPlan
from extensions.ext_database import db
from models.workflow import WorkflowRun
from repositories.api_workflow_run_repository import APIWorkflowRunRepository
from repositories.sqlalchemy_api_workflow_node_execution_repository import (
DifyAPISQLAlchemyWorkflowNodeExecutionRepository,
)
from repositories.factory import DifyAPIRepositoryFactory
from repositories.sqlalchemy_workflow_trigger_log_repository import SQLAlchemyWorkflowTriggerLogRepository
from services.billing_service import BillingService, SubscriptionPlan
@@ -92,9 +90,12 @@ class WorkflowRunCleanup:
paid_or_skipped = len(run_rows) - len(free_runs)
if not free_runs:
skipped_message = (
f"[batch #{batch_index}] skipped (no sandbox runs in batch, {paid_or_skipped} paid/unknown)"
)
click.echo(
click.style(
f"[batch #{batch_index}] skipped (no sandbox runs in batch, {paid_or_skipped} paid/unknown)",
skipped_message,
fg="yellow",
)
)
@@ -255,21 +256,6 @@ class WorkflowRunCleanup:
trigger_repo = SQLAlchemyWorkflowTriggerLogRepository(session)
return trigger_repo.count_by_run_ids(run_ids)
@staticmethod
def _build_run_contexts(
runs: Sequence[WorkflowRun],
) -> list[DifyAPISQLAlchemyWorkflowNodeExecutionRepository.RunContext]:
return [
{
"run_id": run.id,
"tenant_id": run.tenant_id,
"app_id": run.app_id,
"workflow_id": run.workflow_id,
"triggered_from": run.triggered_from,
}
for run in runs
]
@staticmethod
def _empty_related_counts() -> dict[str, int]:
return {
@@ -293,9 +279,15 @@ class WorkflowRunCleanup:
)
def _count_node_executions(self, session: Session, runs: Sequence[WorkflowRun]) -> tuple[int, int]:
run_contexts = self._build_run_contexts(runs)
return DifyAPISQLAlchemyWorkflowNodeExecutionRepository.count_by_runs(session, run_contexts)
run_ids = [run.id for run in runs]
repo = DifyAPIRepositoryFactory.create_api_workflow_node_execution_repository(
session_maker=sessionmaker(bind=session.get_bind(), expire_on_commit=False)
)
return repo.count_by_runs(session, run_ids)
def _delete_node_executions(self, session: Session, runs: Sequence[WorkflowRun]) -> tuple[int, int]:
run_contexts = self._build_run_contexts(runs)
return DifyAPISQLAlchemyWorkflowNodeExecutionRepository.delete_by_runs(session, run_contexts)
run_ids = [run.id for run in runs]
repo = DifyAPIRepositoryFactory.create_api_workflow_node_execution_repository(
session_maker=sessionmaker(bind=session.get_bind(), expire_on_commit=False)
)
return repo.delete_by_runs(session, run_ids)
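The service no longer reaches for the concrete SQLAlchemy repository; it asks DifyAPIRepositoryFactory for one, rebuilding a sessionmaker from the live session's bind so the repository shares the same engine while the passed-in session keeps owning the transaction. The call shape, condensed (delete_runs_in_session is an illustrative wrapper):

from sqlalchemy.orm import Session, sessionmaker

from repositories.factory import DifyAPIRepositoryFactory


def delete_runs_in_session(session: Session, run_ids: list[str]) -> tuple[int, int]:
    # Reuse the caller's engine so the repository talks to the same database.
    repo = DifyAPIRepositoryFactory.create_api_workflow_node_execution_repository(
        session_maker=sessionmaker(bind=session.get_bind(), expire_on_commit=False)
    )
    # The active session is still passed in, so the deletes join its transaction.
    return repo.delete_by_runs(session, run_ids)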