Files
beaver_project/app-instance/backend/nanobot/cron/service.py
2026-03-13 16:40:08 +08:00

584 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Cron 调度服务(持久化 + 计算下一次触发 + 定时执行)。
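This module is nanobot's scheduled-task core. Its responsibilities are:
1. Data layer: persist job state to `jobs.json` and keep a `CronStore` cache in memory;
2. Scheduling layer: compute each job's next trigger time from its `at / every / cron` rule;
3. Execution layer: call the `on_job` callback when a job is due (usually injected by the gateway and forwarded to the agent);
4. Management layer: expose a public API for adding, removing, listing, enabling/disabling and manually triggering jobs.
Key design points:
- Single-timer model: only one `asyncio.Task`, waiting for the nearest trigger point, is kept at any time,
  which avoids the resource growth of "one sleeping coroutine per job";
- Lazy-loaded storage: the file is read on first access only; afterwards the in-memory object is authoritative and write operations flush it back to disk;
- Fault tolerance first: configuration/parsing errors degrade to "no jobs" or "not schedulable" instead of crashing the main service.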
"""
import asyncio
import json
import re
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Callable, Coroutine, Literal
from loguru import logger
from nanobot.cron.types import (
CronAction,
CronExecutionResult,
CronJob,
CronJobState,
CronPayload,
CronSchedule,
CronStore,
)
def _now_ms() -> int:
    """Return the current Unix timestamp in milliseconds (system wall clock)."""
    # Wall-clock time.time() is used on purpose: cron semantics are about
    # real-world instants ("today at 9:00"), which a monotonic clock cannot
    # express directly.
    seconds_since_epoch = time.time()
    return int(seconds_since_epoch * 1000)
def _compute_next_run(schedule: CronSchedule, now_ms: int) -> int | None:
    """Compute the next run time of a schedule as a millisecond timestamp.

    Args:
        schedule: The schedule to evaluate (`at`, `every` or `cron` kind).
        now_ms: Reference "now" in Unix milliseconds; all results are relative
            to this value so the function is predictable for a given input.

    Returns:
        The next trigger time in Unix milliseconds, or None when the job cannot
        currently be scheduled (invalid parameters, an `at` time that is not in
        the future, or a cron expression / timezone that fails to evaluate).
    """
    if schedule.kind == "at":
        # One-shot schedule: only valid while the target is still in the future.
        return schedule.at_ms if schedule.at_ms and schedule.at_ms > now_ms else None

    if schedule.kind == "every":
        if not schedule.every_ms or schedule.every_ms <= 0:
            return None
        # Fixed interval: next trigger is simply "now + interval". No alignment
        # to wall-clock boundaries is attempted, so long-running executions
        # cause phase drift over time; in exchange the behaviour stays simple.
        return now_ms + schedule.every_ms

    if schedule.kind == "cron" and schedule.expr:
        try:
            # Imported lazily so the module still loads when croniter is absent.
            from croniter import croniter
            from zoneinfo import ZoneInfo

            # Without an explicit tz, fall back to the system's local timezone.
            tz = ZoneInfo(schedule.tz) if schedule.tz else datetime.now().astimezone().tzinfo
            base_dt = datetime.fromtimestamp(now_ms / 1000, tz=tz)
            next_dt = croniter(schedule.expr, base_dt).get_next(datetime)
            return int(next_dt.timestamp() * 1000)
        except Exception as exc:
            # A single bad job must not take the scheduler down, so the error is
            # not propagated. It is logged, though: otherwise a typo in the
            # expression or a missing croniter install makes the job silently
            # never run.
            logger.warning(
                "Cron: cannot compute next run for expr '{}' (tz={}): {}",
                schedule.expr,
                schedule.tz,
                exc,
            )
            return None

    return None
def _validate_schedule_for_add(schedule: CronSchedule) -> None:
    """Validate a schedule before a job is created from it.

    Only the timezone field is checked here.

    Raises:
        ValueError: if `tz` is set on a non-cron schedule, or if the timezone
            name cannot be resolved.
    """
    if not schedule.tz:
        # Nothing timezone-related to verify.
        return

    # Only cron expressions understand a timezone; tz on at/every is a
    # configuration mistake.
    if schedule.kind != "cron":
        raise ValueError("tz can only be used with cron schedules")

    try:
        from zoneinfo import ZoneInfo

        ZoneInfo(schedule.tz)
    except Exception:
        # Hide the zoneinfo traceback; the caller only needs the bad name.
        raise ValueError(f"unknown timezone '{schedule.tz}'") from None
# Phrases in a job's response text that are treated as "the daily quota is
# used up". Order: Chinese phrasings first, then English ones (matched
# case-insensitively).
_DAILY_LIMIT_PATTERNS = [
    *(
        re.compile(phrase)
        for phrase in (
            r"今日.*已达.*上限",
            r"已达\d+支上限",
            r"停止介绍",
        )
    ),
    *(
        re.compile(phrase, re.IGNORECASE)
        for phrase in (
            r"daily (?:cap|limit).*(?:reached|hit)",
            r"today.*(?:reached|hit).*(?:cap|limit)",
        )
    ),
]
def _looks_like_daily_limit_reached(response: str | None) -> bool:
    """Return True when the response text matches a known daily-limit phrase.

    Empty, whitespace-only and None responses never match.
    """
    text = (response or "").strip()
    if not text:
        return False
    for matcher in _DAILY_LIMIT_PATTERNS:
        if matcher.search(text):
            return True
    return False
def _next_daily_cycle_start_ms(job: CronJob, now_ms: int) -> int:
    """Return tomorrow's anchor time (local timezone) as Unix milliseconds.

    The anchor is the time of day at which the job was created; when the job
    has no creation timestamp, the time of day of `now_ms` is used instead.
    The result is always on the calendar day after `now_ms`.
    """
    local_tz = datetime.now().astimezone().tzinfo

    # Time of day (with tzinfo) that every daily cycle starts at.
    anchor_ms = job.created_at_ms or now_ms
    anchor_time = datetime.fromtimestamp(anchor_ms / 1000, tz=local_tz).timetz()

    # Calendar day following "now" in the local timezone.
    today = datetime.fromtimestamp(now_ms / 1000, tz=local_tz).date()
    tomorrow = today + timedelta(days=1)

    next_start = datetime.combine(tomorrow, anchor_time)
    return int(next_start.timestamp() * 1000)
def _schedule_from_action(action: CronAction) -> CronSchedule:
    """Build the schedule described by a reschedule action.

    Fields are checked in priority order: `every_seconds`, then `cron_expr`
    (with its optional `tz`), then `at` (an ISO-8601 datetime string).

    Raises:
        ValueError: if none of the schedule fields is set.
    """
    interval_s = action.every_seconds
    if interval_s is not None:
        return CronSchedule(kind="every", every_ms=interval_s * 1000)

    if action.cron_expr:
        return CronSchedule(kind="cron", expr=action.cron_expr, tz=action.tz)

    if not action.at:
        raise ValueError("reschedule action requires exactly one schedule field")

    target = datetime.fromisoformat(action.at)
    return CronSchedule(kind="at", at_ms=int(target.timestamp() * 1000))
@dataclass
class _ActionOutcome:
removed: bool = False
explicit_next_run: bool = False
managed_next_run_at_ms: int | None = None
# What an `on_job` callback may return: a plain response string, a structured
# CronExecutionResult, or None.
_CronCallbackResult = str | CronExecutionResult | None
class CronService:
    """Service object that manages and executes scheduled jobs.

    Runtime model (inside the event loop):
    1. ``start()`` loads the store, recomputes the next-run times and arms the
       single timer;
    2. when the timer wakes up, ``_on_timer()`` collects the due jobs and runs
       them one after another;
    3. after every state change the store is saved and the timer re-armed so
       that persisted data and scheduling stay consistent.

    Concurrency assumptions:
    - all calls are expected to come from the same asyncio event-loop thread;
    - no explicit locking is done, so cross-thread use is not safe;
    - sharing across threads/processes would need a file lock or a move to a
      transactional database model.
    """

    def __init__(
        self,
        store_path: Path,
        on_job: Callable[[CronJob], Coroutine[Any, Any, _CronCallbackResult]] | None = None,
    ):
        """Create the service without touching the disk or the event loop.

        Args:
            store_path: JSON file the jobs are persisted to.
            on_job: Coroutine callback invoked for each due job. May be None
                when the service is only used to manage jobs.
        """
        self.store_path = store_path
        self.on_job = on_job
        # Lazily loaded on first access.
        self._store: CronStore | None = None
        # The one timer task waiting for the nearest trigger point.
        self._timer_task: asyncio.Task | None = None
        # Service switch; once stop() clears it the timer callback short-circuits.
        self._running = False
        # True while the timer task is executing due jobs; see _arm_timer().
        self._timer_busy = False

    # ========== Persistence ==========

    @staticmethod
    def _job_from_dict(raw: dict[str, Any]) -> CronJob:
        """Rebuild one job from its JSON dict, defaulting optional fields."""
        schedule = raw["schedule"]
        payload = raw["payload"]
        state = raw.get("state", {})
        return CronJob(
            id=raw["id"],
            name=raw["name"],
            enabled=raw.get("enabled", True),
            schedule=CronSchedule(
                kind=schedule["kind"],
                at_ms=schedule.get("atMs"),
                every_ms=schedule.get("everyMs"),
                expr=schedule.get("expr"),
                tz=schedule.get("tz"),
            ),
            payload=CronPayload(
                kind=payload.get("kind", "agent_turn"),
                message=payload.get("message", ""),
                session_key=payload.get("sessionKey"),
                deliver=payload.get("deliver", False),
                channel=payload.get("channel"),
                to=payload.get("to"),
            ),
            state=CronJobState(
                next_run_at_ms=state.get("nextRunAtMs"),
                last_run_at_ms=state.get("lastRunAtMs"),
                last_status=state.get("lastStatus"),
                last_error=state.get("lastError"),
            ),
            created_at_ms=raw.get("createdAtMs", 0),
            updated_at_ms=raw.get("updatedAtMs", 0),
            delete_after_run=raw.get("deleteAfterRun", False),
        )

    @staticmethod
    def _job_to_dict(job: CronJob) -> dict[str, Any]:
        """Serialize one job to the camelCase JSON layout used on disk."""
        return {
            "id": job.id,
            "name": job.name,
            "enabled": job.enabled,
            "schedule": {
                "kind": job.schedule.kind,
                "atMs": job.schedule.at_ms,
                "everyMs": job.schedule.every_ms,
                "expr": job.schedule.expr,
                "tz": job.schedule.tz,
            },
            "payload": {
                "kind": job.payload.kind,
                "message": job.payload.message,
                "sessionKey": job.payload.session_key,
                "deliver": job.payload.deliver,
                "channel": job.payload.channel,
                "to": job.payload.to,
            },
            "state": {
                "nextRunAtMs": job.state.next_run_at_ms,
                "lastRunAtMs": job.state.last_run_at_ms,
                "lastStatus": job.state.last_status,
                "lastError": job.state.last_error,
            },
            "createdAtMs": job.created_at_ms,
            "updatedAtMs": job.updated_at_ms,
            "deleteAfterRun": job.delete_after_run,
        }

    def _load_store(self) -> CronStore:
        """Return the in-memory store, reading it from disk on first use.

        A missing file yields an empty store. A file that cannot be read or
        parsed is logged and also yields an empty store, so the service keeps
        running.
        """
        if self._store is not None:
            # Already loaded: the in-memory object is authoritative.
            return self._store

        if not self.store_path.exists():
            # First run, nothing persisted yet.
            self._store = CronStore()
            return self._store

        try:
            data = json.loads(self.store_path.read_text(encoding="utf-8"))
            # Lenient read: fields missing in older files get defaults.
            jobs = [self._job_from_dict(raw) for raw in data.get("jobs", [])]
            self._store = CronStore(jobs=jobs)
        except Exception as e:
            logger.warning("Failed to load cron store: {}", e)
            self._store = CronStore()
        return self._store

    def _save_store(self) -> None:
        """Write the current in-memory snapshot to `store_path`.

        Does nothing when the store has not been loaded. The JSON is written to
        a sibling temporary file and then swapped in with `Path.replace`, so an
        interrupted write cannot leave a truncated store file behind.
        """
        if self._store is None:
            return
        # Create the parent directory on first save.
        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "version": self._store.version,
            "jobs": [self._job_to_dict(job) for job in self._store.jobs],
        }
        tmp_path = self.store_path.with_name(self.store_path.name + ".tmp")
        tmp_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
        tmp_path.replace(self.store_path)

    # ========== Lifecycle ==========

    async def start(self) -> None:
        """Start the service and arm the timer for the next wake-up.

        Calling it again is allowed: next-run times are recomputed and the
        timer is re-armed.
        """
        self._running = True
        self._load_store()
        # Recompute on every start so stale persisted times are not reused.
        self._recompute_next_runs()
        self._save_store()
        self._arm_timer()
        logger.info("Cron service started with {} jobs", len(self._store.jobs if self._store else []))

    def stop(self) -> None:
        """Stop the service and cancel the pending timer, without waiting for it."""
        self._running = False
        # The cancelled task no longer owns the "busy" state; clearing it lets
        # an immediate start() arm a fresh timer.
        self._timer_busy = False
        if self._timer_task:
            self._timer_task.cancel()
            self._timer_task = None

    # ========== Scheduling ==========

    def _recompute_next_runs(self) -> None:
        """Recompute the next trigger time of every enabled job."""
        if self._store is None:
            return
        now = _now_ms()
        for job in self._store.jobs:
            if job.enabled:
                job.state.next_run_at_ms = _compute_next_run(job.schedule, now)

    def _get_next_wake_ms(self) -> int | None:
        """Return the earliest next-run time among enabled jobs, or None."""
        if self._store is None:
            return None
        times = [
            j.state.next_run_at_ms
            for j in self._store.jobs
            if j.enabled and j.state.next_run_at_ms
        ]
        # None tells the caller that no timer needs to be armed.
        return min(times) if times else None

    def _arm_timer(self) -> None:
        """Reset the single timer so it waits for the nearest trigger point."""
        if self._timer_busy:
            # The timer task is currently executing due jobs. Cancelling it from
            # here (for example because a running job's callback added, removed
            # or toggled a job) would raise CancelledError inside that job.
            # _on_timer() re-arms as soon as the batch is done, so returning
            # early loses nothing.
            return
        if self._timer_task:
            self._timer_task.cancel()
        next_wake = self._get_next_wake_ms()
        if not next_wake or not self._running:
            return
        delay_s = max(0, next_wake - _now_ms()) / 1000

        async def tick() -> None:
            # If the timer is cancelled while sleeping, CancelledError simply
            # ends this task.
            await asyncio.sleep(delay_s)
            if self._running:
                await self._on_timer()

        self._timer_task = asyncio.create_task(tick())

    async def _on_timer(self) -> None:
        """Run every due job sequentially, persist the result and re-arm."""
        if self._store is None:
            return
        now = _now_ms()
        due_jobs = [
            j for j in self._store.jobs
            if j.enabled and j.state.next_run_at_ms and now >= j.state.next_run_at_ms
        ]
        # Sequential execution keeps state updates predictable and logs easy to
        # read; the cost is that one slow job delays the ones after it.
        self._timer_busy = True
        try:
            for job in due_jobs:
                await self._execute_job(job)
        finally:
            self._timer_busy = False
        # Persist even when nothing was due so disk and memory stay in sync. A
        # failed write must not prevent re-arming, otherwise the scheduler
        # would stop firing until the process is restarted.
        try:
            self._save_store()
        except Exception as e:
            logger.error("Cron: failed to persist store after timer run: {}", e)
        self._arm_timer()

    # ========== Execution ==========

    @staticmethod
    def _coerce_execution_result(
        callback_result: _CronCallbackResult,
    ) -> CronExecutionResult:
        """Normalize legacy string callbacks into the structured execution result."""
        if isinstance(callback_result, CronExecutionResult):
            return callback_result
        return CronExecutionResult(response=callback_result)

    def _apply_structured_action(self, job: CronJob, action: CronAction) -> _ActionOutcome:
        """Apply one structured cron control decision to the current job.

        Supported actions: `none`, `remove`, `disable`, `complete_today` and
        `reschedule`. Unknown actions are logged and ignored.

        Raises:
            ValueError: for a `reschedule` action whose schedule is missing or
                fails validation.
        """
        normalized = (action.action or "none").strip().lower()
        reason = action.reason or "no reason provided"

        if normalized == "none":
            return _ActionOutcome()

        if normalized == "remove":
            self._store.jobs = [item for item in self._store.jobs if item.id != job.id]
            logger.info("Cron: removed job '{}' via structured action ({})", job.name, reason)
            return _ActionOutcome(removed=True)

        if normalized == "disable":
            job.enabled = False
            job.state.next_run_at_ms = None
            logger.info("Cron: disabled job '{}' via structured action ({})", job.name, reason)
            return _ActionOutcome(explicit_next_run=True)

        if normalized == "complete_today":
            managed_next_run_at_ms = _next_daily_cycle_start_ms(job, _now_ms())
            logger.info(
                "Cron: job '{}' completed today's batch via structured action ({}), next cycle at {}",
                job.name,
                reason,
                managed_next_run_at_ms,
            )
            return _ActionOutcome(managed_next_run_at_ms=managed_next_run_at_ms)

        if normalized == "reschedule":
            # Build and validate first so a bad action leaves the job untouched.
            schedule = _schedule_from_action(action)
            _validate_schedule_for_add(schedule)
            job.schedule = schedule
            job.enabled = True
            job.delete_after_run = schedule.kind == "at"
            job.state.next_run_at_ms = _compute_next_run(schedule, _now_ms())
            logger.info("Cron: rescheduled job '{}' via structured action ({})", job.name, reason)
            return _ActionOutcome(explicit_next_run=True)

        logger.warning("Cron: unknown structured action '{}' for job '{}'", normalized, job.name)
        return _ActionOutcome()

    async def _execute_job(self, job: CronJob) -> None:
        """Run one job through `on_job` and update its run state.

        Exceptions raised by the callback or while applying its action are
        recorded on the job (`last_status` / `last_error`) and not re-raised.
        """
        start_ms = _now_ms()
        logger.info("Cron: executing job '{}' ({})", job.name, job.id)
        managed_next_run_at_ms: int | None = None
        removed_by_action = False
        explicit_next_run = False

        try:
            result = CronExecutionResult()
            if self.on_job:
                # Business hook injected by the owner of the service.
                result = self._coerce_execution_result(await self.on_job(job))

            if result.action is not None:
                action_outcome = self._apply_structured_action(job, result.action)
                removed_by_action = action_outcome.removed
                explicit_next_run = action_outcome.explicit_next_run
                managed_next_run_at_ms = action_outcome.managed_next_run_at_ms
            elif job.schedule.kind == "every" and _looks_like_daily_limit_reached(result.response):
                managed_next_run_at_ms = _next_daily_cycle_start_ms(job, _now_ms())
                logger.info(
                    "Cron: job '{}' reached daily terminal state, snoozed until {}",
                    job.name,
                    managed_next_run_at_ms,
                )

            # Any run that did not raise counts as a success, whatever it returned.
            job.state.last_status = "ok"
            job.state.last_error = None
            logger.info("Cron: job '{}' completed", job.name)
        except Exception as e:
            # A failing job only affects itself; the scheduler keeps going.
            job.state.last_status = "error"
            job.state.last_error = str(e)
            logger.error("Cron: job '{}' failed: {}", job.name, e)

        job.state.last_run_at_ms = start_ms
        job.updated_at_ms = _now_ms()

        if removed_by_action or explicit_next_run:
            # The action already decided the job's fate / next run.
            return

        if managed_next_run_at_ms is not None:
            # Terminal state for today: skip the remaining frequent triggers and
            # resume at the start of the next daily cycle.
            job.state.next_run_at_ms = managed_next_run_at_ms
            return

        if job.schedule.kind == "at":
            # One-shot job: delete or disable it so it cannot fire again.
            if job.delete_after_run:
                self._store.jobs = [j for j in self._store.jobs if j.id != job.id]
            else:
                # Kept (disabled) for later inspection or manual re-enabling.
                job.enabled = False
                job.state.next_run_at_ms = None
        else:
            # Recurring job: compute the next trigger for the next timer round.
            job.state.next_run_at_ms = _compute_next_run(job.schedule, _now_ms())

    # ========== Public API ==========

    def list_jobs(self, include_disabled: bool = False) -> list[CronJob]:
        """List jobs sorted by next run time; disabled jobs only on request.

        Jobs without a next run time sort last.
        """
        store = self._load_store()
        jobs = store.jobs if include_disabled else [j for j in store.jobs if j.enabled]
        return sorted(jobs, key=lambda j: j.state.next_run_at_ms or float("inf"))

    def add_job(
        self,
        name: str,
        schedule: CronSchedule,
        message: str,
        payload_kind: Literal["system_event", "agent_turn"] = "agent_turn",
        session_key: str | None = None,
        deliver: bool = False,
        channel: str | None = None,
        to: str | None = None,
        delete_after_run: bool = False,
    ) -> CronJob:
        """Create a new enabled job, persist it and re-arm the timer.

        Raises:
            ValueError: if the schedule fails validation.
        """
        store = self._load_store()
        # Validate early so the caller gets a clear error before anything is stored.
        _validate_schedule_for_add(schedule)
        now = _now_ms()
        job = CronJob(
            id=str(uuid.uuid4())[:8],
            name=name,
            enabled=True,
            schedule=schedule,
            payload=CronPayload(
                kind=payload_kind,
                message=message,
                session_key=session_key,
                deliver=deliver,
                channel=channel,
                to=to,
            ),
            state=CronJobState(next_run_at_ms=_compute_next_run(schedule, now)),
            created_at_ms=now,
            updated_at_ms=now,
            delete_after_run=delete_after_run,
        )
        store.jobs.append(job)
        # Persist and re-arm right away so memory, disk and timer do not drift.
        self._save_store()
        self._arm_timer()
        logger.info("Cron: added job '{}' ({})", name, job.id)
        return job

    def remove_job(self, job_id: str) -> bool:
        """Delete the job with the given id; return True if one was removed."""
        store = self._load_store()
        before = len(store.jobs)
        store.jobs = [j for j in store.jobs if j.id != job_id]
        removed = len(store.jobs) < before
        if removed:
            self._save_store()
            self._arm_timer()
            logger.info("Cron: removed job {}", job_id)
        return removed

    def enable_job(self, job_id: str, enabled: bool = True) -> CronJob | None:
        """Enable or disable a job and update its next run time accordingly.

        Returns the updated job, or None when no job has that id.
        """
        store = self._load_store()
        for job in store.jobs:
            if job.id == job_id:
                job.enabled = enabled
                job.updated_at_ms = _now_ms()
                if enabled:
                    job.state.next_run_at_ms = _compute_next_run(job.schedule, _now_ms())
                else:
                    job.state.next_run_at_ms = None
                self._save_store()
                self._arm_timer()
                return job
        return None

    async def run_job(self, job_id: str, force: bool = False) -> bool:
        """Trigger a job manually.

        Disabled jobs are skipped unless `force=True`. Returns True when the
        job was found and executed, False otherwise.
        """
        store = self._load_store()
        for job in store.jobs:
            if job.id == job_id:
                if not force and not job.enabled:
                    return False
                await self._execute_job(job)
                self._save_store()
                self._arm_timer()
                return True
        return False

    def status(self) -> dict:
        """Return a summary: running flag, job count and next wake-up time."""
        store = self._load_store()
        return {
            "enabled": self._running,
            "jobs": len(store.jobs),
            "next_wake_at_ms": self._get_next_wake_ms(),
        }