feat(skills-ui): show replay eval coverage
This commit is contained in:
@ -50,6 +50,7 @@ from beaver.services.user_file_resolver import (
|
||||
build_file_auth_context,
|
||||
)
|
||||
from beaver.skills.learning import SkillLearningWorker, SkillLearningWorkerConfig
|
||||
from beaver.skills.learning.replay import ReplayRunner
|
||||
from beaver.skills.catalog.utils import parse_frontmatter
|
||||
|
||||
from .deps import get_agent_service
|
||||
@ -2080,7 +2081,8 @@ def create_app(
|
||||
@app.post("/api/skills/candidates/{candidate_id}/draft")
|
||||
async def synthesize_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
|
||||
agent_service = get_agent_service(request)
|
||||
loaded = agent_service.create_loop().boot()
|
||||
loop = agent_service.create_loop()
|
||||
loaded = loop.boot()
|
||||
try:
|
||||
candidate = loaded.skill_learning_pipeline.get_candidate(candidate_id) # type: ignore[union-attr]
|
||||
if candidate.draft_skill_name and candidate.draft_id:
|
||||
@ -2099,6 +2101,7 @@ def create_app(
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=ReplayRunner(agent_loop=loop),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
@ -2107,7 +2110,8 @@ def create_app(
|
||||
@app.post("/api/skills/candidates/{candidate_id}/regenerate")
|
||||
async def regenerate_skill_draft(candidate_id: str, request: Request) -> dict[str, Any]:
|
||||
agent_service = get_agent_service(request)
|
||||
loaded = agent_service.create_loop().boot()
|
||||
loop = agent_service.create_loop()
|
||||
loaded = loop.boot()
|
||||
provider_bundle = agent_service._make_provider_bundle_for_task(loaded, {}) # noqa: SLF001
|
||||
try:
|
||||
draft = await loaded.skill_learning_pipeline.regenerate_draft( # type: ignore[union-attr]
|
||||
@ -2120,6 +2124,7 @@ def create_app(
|
||||
draft.skill_name,
|
||||
draft.draft_id,
|
||||
provider_bundle=provider_bundle,
|
||||
replay_runner=ReplayRunner(agent_loop=loop),
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
|
||||
@ -1088,6 +1088,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="mt-3 grid gap-2 sm:grid-cols-3">
|
||||
<MetricTile label={t('执行覆盖', 'Execution')} value={formatPercent(report.execution_coverage)} />
|
||||
<MetricTile label={t('替代评估', 'Surrogate')} value={formatPercent(report.surrogate_coverage)} />
|
||||
<MetricTile label={t('置信度', 'Confidence')} value={report.confidence || 'low'} />
|
||||
</div>
|
||||
|
||||
<div className="mt-3 grid gap-2 sm:grid-cols-3">
|
||||
<ReadableFact icon={<CheckCircle2 className="h-4 w-4" />} label={t('改进', 'Improved')} value={String(report.improved_count)} />
|
||||
<ReadableFact icon={<XCircle className="h-4 w-4" />} label={t('回退', 'Regressed')} value={String(report.regression_count)} />
|
||||
@ -1135,6 +1141,12 @@ function EvalReportPanel({ report }: { report?: SkillDraftEvalReport | null }) {
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{Array.isArray(report.case_reports) && report.case_reports.length > 0 ? (
|
||||
<RawDetails title={t('Replay case reports', 'Replay case reports')} payload={report.case_reports} />
|
||||
) : null}
|
||||
{report.preservation_report ? (
|
||||
<RawDetails title={t('Preservation report', 'Preservation report')} payload={report.preservation_report} />
|
||||
) : null}
|
||||
<div className="mt-3 text-xs text-muted-foreground">{formatDateTime(report.created_at)}</div>
|
||||
<RawDetails title={t('原始评估报告', 'Raw eval report')} payload={report} />
|
||||
</div>
|
||||
@ -1387,6 +1399,11 @@ function formatScore(value: number): string {
|
||||
return value.toFixed(2);
|
||||
}
|
||||
|
||||
/**
 * Format a fractional value as a whole-number percentage string.
 *
 * The input is multiplied by 100 and rounded to the nearest integer,
 * so `0.5` becomes `"50%"`.
 *
 * @param value - Fraction to format; may be missing or null.
 * @returns The rounded percentage followed by `%`, or `"0%"` when the
 *   value is not a finite number (undefined, null, NaN, or ±Infinity).
 */
function formatPercent(value?: number | null): string {
  // The typeof check narrows the type; Number.isFinite rejects NaN and
  // ±Infinity so the UI never shows "NaN%" or "Infinity%".
  if (typeof value !== 'number' || !Number.isFinite(value)) return '0%';
  return `${Math.round(value * 100)}%`;
}
|
||||
|
||||
function formatSignedScore(value: number): string {
|
||||
if (!Number.isFinite(value)) return '-';
|
||||
return `${value >= 0 ? '+' : ''}${value.toFixed(2)}`;
|
||||
|
||||
@ -985,6 +985,15 @@ export interface SkillDraftEvalReport {
|
||||
cases: Array<Record<string, unknown>>;
|
||||
status: string;
|
||||
created_at: string;
|
||||
eval_version?: string;
|
||||
mode?: 'heuristic' | 'replay' | string;
|
||||
execution_coverage?: number;
|
||||
surrogate_coverage?: number;
|
||||
blocked_coverage?: number;
|
||||
confidence?: 'low' | 'medium' | 'high' | string;
|
||||
case_reports?: Array<Record<string, unknown>>;
|
||||
tool_mode_summary?: Record<string, unknown>;
|
||||
preservation_report?: Record<string, unknown> | null;
|
||||
}
|
||||
|
||||
export interface SkillDraft {
|
||||
|
||||
Reference in New Issue
Block a user