feat(skills-ui): show replay eval coverage

This commit is contained in:
2026-06-08 13:38:10 +08:00
parent b9171998b9
commit 9e2c02a333
3 changed files with 33 additions and 2 deletions

View File

@ -50,6 +50,7 @@ from beaver.services.user_file_resolver import (
build_file_auth_context, build_file_auth_context,
) )
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.learning.replay import ReplayRunner
from beaver.skills.catalog.utils import parse_frontmatter from beaver.skills.catalog.utils import parse_frontmatter
from .deps import get_agent_service from .deps import get_agent_service
@ -2080,7 +2081,8 @@ def create_app(
@app.post("/api/skills/candidates/{candidate_id}/draft") @app.post("/api/skills/candidates/{candidate_id}/draft")
async def synthesize_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]: async def synthesize_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
agent_service = get_agent_service(request) agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot() loop = agent_service.create_loop()
loaded = loop.boot()
try: try:
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr] candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
if candidate.draft_skill_name and candidate.draft_id: if candidate.draft_skill_name and candidate.draft_id:
@ -2099,6 +2101,7 @@ def create_app(
draft.skill_name, draft.skill_name,
draft.draft_id, draft.draft_id,
provider_bundle=provider_bundle, provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
) )
except ValueError as exc: except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc raise HTTPException(status_code=404, detail=str(exc)) from exc
@ -2107,7 +2110,8 @@ def create_app(
@app.post("/api/skills/candidates/{candidate_id}/regenerate") @app.post("/api/skills/candidates/{candidate_id}/regenerate")
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]: async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
agent_service = get_agent_service(request) agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot() loop = agent_service.create_loop()
loaded = loop.boot()
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001 provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
try: try:
draft = await loaded.skill_learning_pipeline.regenerate_draft( # type: ignore[union-attr] draft = await loaded.skill_learning_pipeline.regenerate_draft( # type: ignore[union-attr]
@ -2120,6 +2124,7 @@ def create_app(
draft.skill_name, draft.skill_name,
draft.draft_id, draft.draft_id,
provider_bundle=provider_bundle, provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
) )
except ValueError as exc: except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc raise HTTPException(status_code=404, detail=str(exc)) from exc

View File

@ -1088,6 +1088,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
/> />
</div> </div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3"> <div className="mt-3 grid gap-2 sm:grid-cols-3">
<ReadableFact icon={<CheckCircle2 className="h-4 w-4" />} label={t('改进', 'Improved')} value={String(report.improved_count)} /> <ReadableFact icon={<CheckCircle2 className="h-4 w-4" />} label={t('改进', 'Improved')} value={String(report.improved_count)} />
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('回退', 'Regressed')} value={String(report.regression_count)} /> <ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('回退', 'Regressed')} value={String(report.regression_count)} />
@ -1135,6 +1141,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div> </div>
</div> </div>
)} )}
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
) : null}
{report.preservation_report ? (
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
) : null}
<div className="mt-3 text-xs text-muted-foreground">{formatDateTime(report.created_at)}</div> <div className="mt-3 text-xs text-muted-foreground">{formatDateTime(report.created_at)}</div>
<RawDetails title={t('原始评估报告', 'Raw eval report')} payload={report} /> <RawDetails title={t('原始评估报告', 'Raw eval report')} payload={report} />
</div> </div>
@ -1387,6 +1399,11 @@ function formatScore(value: number): string {
return value.toFixed(2); return value.toFixed(2);
} }
/**
 * Formats a fractional ratio as a whole-number percentage string.
 *
 * The value is multiplied by 100 and rounded with `Math.round`, so `0.5`
 * becomes `'50%'` and `0.256` becomes `'26%'`. No clamping is applied:
 * values above 1 or below 0 are rendered as-is (e.g. `1.5` -> `'150%'`).
 *
 * @param value - Ratio to format. May be omitted, `null`, or a non-finite
 *   number, in which case the fallback `'0%'` is returned.
 * @returns The rounded percentage followed by `%`, or `'0%'` when `value`
 *   is not a finite number.
 */
function formatPercent(value?: number | null): string {
  // Number.isFinite rejects NaN as well as +/-Infinity, so a non-finite
  // input can never surface as "Infinity%" or "NaN%" in the UI.
  if (typeof value !== 'number' || !Number.isFinite(value)) return '0%';
  return `${Math.round(value * 100)}%`;
}
function formatSignedScore(value: number): string { function formatSignedScore(value: number): string {
if (!Number.isFinite(value)) return '-'; if (!Number.isFinite(value)) return '-';
return `${value >= 0 ? '+' : ''}${value.toFixed(2)}`; return `${value >= 0 ? '+' : ''}${value.toFixed(2)}`;

View File

@ -985,6 +985,15 @@ export interface SkillDraftEvalReport {
cases: Array<Record<string, unknown>>; cases: Array<Record<string, unknown>>;
status: string; status: string;
created_at: string; created_at: string;
eval_version?: string;
mode?: 'heuristic' | 'replay' | string;
execution_coverage?: number;
surrogate_coverage?: number;
blocked_coverage?: number;
confidence?: 'low' | 'medium' | 'high' | string;
case_reports?: Array<Record<string, unknown>>;
tool_mode_summary?: Record<string, unknown>;
preservation_report?: Record<string, unknown> | null;
} }
export interface SkillDraft { export interface SkillDraft {