feat(skills-ui): show replay eval coverage

This commit is contained in:
2026-06-08 13:38:10 +08:00
parent b9171998b9
commit 9e2c02a333
3 changed files with 33 additions and 2 deletions

View File

@ -50,6 +50,7 @@ from beaver.services.user_file_resolver import (
build_file_auth_context,
)
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
from beaver.skills.learning.replay import ReplayRunner
from beaver.skills.catalog.utils import parse_frontmatter
from .deps import get_agent_service
@ -2080,7 +2081,8 @@ def create_app(
@app.post("/api/skills/candidates/{candidate_id}/draft")
async def synthesize_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot()
loop = agent_service.create_loop()
loaded = loop.boot()
try:
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
if candidate.draft_skill_name and candidate.draft_id:
@ -2099,6 +2101,7 @@ def create_app(
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
@ -2107,7 +2110,8 @@ def create_app(
@app.post("/api/skills/candidates/{candidate_id}/regenerate")
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
agent_service = get_agent_service(request)
loaded = agent_service.create_loop().boot()
loop = agent_service.create_loop()
loaded = loop.boot()
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
try:
draft = await loaded.skill_learning_pipeline.regenerate_draft( # type: ignore[union-attr]
@ -2120,6 +2124,7 @@ def create_app(
draft.skill_name,
draft.draft_id,
provider_bundle=provider_bundle,
replay_runner=ReplayRunner(agent_loop=loop),
)
except ValueError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc

View File

@ -1088,6 +1088,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
/>
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
</div>
<div className="mt-3 grid gap-2 sm:grid-cols-3">
<ReadableFact icon={<CheckCircle2 className="h-4 w-4" />} label={t('改进', 'Improved')} value={String(report.improved_count)} />
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('回退', 'Regressed')} value={String(report.regression_count)} />
@ -1135,6 +1141,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
</div>
</div>
)}
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
) : null}
{report.preservation_report ? (
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
) : null}
<div className="mt-3 text-xs text-muted-foreground">{formatDateTime(report.created_at)}</div>
<RawDetails title={t('原始评估报告', 'Raw eval report')} payload={report} />
</div>
@ -1387,6 +1399,11 @@ function formatScore(value: number): string {
return value.toFixed(2);
}
/**
 * Formats a fractional ratio as a whole-number percentage string.
 *
 * @param value - Ratio where `1` corresponds to `100%`; may be absent.
 * @returns The rounded percentage (e.g. `0.126` yields `"13%"`), or `"0%"`
 *   when `value` is not a number or the resulting percentage is not finite
 *   (`undefined`, `null`, `NaN`, `Infinity`, `-Infinity`, or overflow).
 */
function formatPercent(value?: number | null): string {
  if (typeof value !== 'number') return '0%';
  const percent = Math.round(value * 100);
  // Checking the scaled result (not just the input) also covers NaN and
  // overflow from the multiplication, so "Infinity%" / "NaN%" never render.
  return Number.isFinite(percent) ? `${percent}%` : '0%';
}
function formatSignedScore(value: number): string {
if (!Number.isFinite(value)) return '-';
return `${value >= 0 ? '+' : ''}${value.toFixed(2)}`;

View File

@ -985,6 +985,15 @@ export interface SkillDraftEvalReport {
cases: Array<Record<string, unknown>>;
status: string;
created_at: string;
eval_version?: string;
mode?: 'heuristic' | 'replay' | string;
execution_coverage?: number;
surrogate_coverage?: number;
blocked_coverage?: number;
confidence?: 'low' | 'medium' | 'high' | string;
case_reports?: Array<Record<string, unknown>>;
tool_mode_summary?: Record<string, unknown>;
preservation_report?: Record<string, unknown> | null;
}
export interface SkillDraft {