4 Commits

Author SHA1 Message Date
lpf
557633b698 Refresh agent topology state after task restart 2026-03-07 01:53:18 +08:00
lpf
823f96be5a Stop planned task spam after cancellation 2026-03-06 21:31:21 +08:00
lpf
9d0ab54a97 Recover running subagent tasks after restart 2026-03-06 20:05:03 +08:00
lpf
ee9326b2f2 Reduce planned task progress noise 2026-03-06 19:57:54 +08:00
6 changed files with 262 additions and 10 deletions

View File

@@ -9,6 +9,7 @@ package agent
import (
"context"
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"math"
@@ -544,6 +545,11 @@ func (al *AgentLoop) processInbound(ctx context.Context, msg bus.InboundMessage)
response, err := al.processPlannedMessage(ctx, msg)
if err != nil {
if errors.Is(err, context.Canceled) {
al.audit.Record(al.getTrigger(msg), msg.Channel, msg.SessionKey, true, err)
al.appendTaskAudit(taskID, msg, started, err, true)
return
}
response = fmt.Sprintf("Error processing message: %v", err)
}

View File

@@ -4,7 +4,9 @@ import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"regexp"
@@ -26,6 +28,7 @@ type plannedTaskResult struct {
Index int
Task plannedTask
Output string
Err error
ErrText string
}
@@ -123,6 +126,11 @@ func splitPlannedSegments(content string) []string {
func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage, tasks []plannedTask) (string, error) {
results := make([]plannedTaskResult, len(tasks))
var wg sync.WaitGroup
var progressMu sync.Mutex
completed := 0
failed := 0
milestones := plannedProgressMilestones(len(tasks))
notified := make(map[int]struct{}, len(milestones))
for i, task := range tasks {
wg.Add(1)
go func(index int, t plannedTask) {
@@ -137,15 +145,33 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
subMsg.Metadata["planned_task_index"] = fmt.Sprintf("%d", t.Index)
subMsg.Metadata["planned_task_total"] = fmt.Sprintf("%d", len(tasks))
out, err := al.processMessage(ctx, subMsg)
res := plannedTaskResult{Index: index, Task: t, Output: strings.TrimSpace(out)}
res := plannedTaskResult{Index: index, Task: t, Output: strings.TrimSpace(out), Err: err}
if err != nil {
res.ErrText = err.Error()
}
results[index] = res
al.publishPlannedTaskProgress(msg, len(tasks), res)
progressMu.Lock()
completed++
if res.ErrText != "" && !isPlannedTaskCancellation(ctx, res) {
failed++
}
snapshotCompleted := completed
snapshotFailed := failed
shouldNotify := shouldPublishPlannedTaskProgress(ctx, len(tasks), snapshotCompleted, res, milestones, notified)
if shouldNotify && res.ErrText == "" {
notified[snapshotCompleted] = struct{}{}
}
progressMu.Unlock()
if shouldNotify {
al.publishPlannedTaskProgress(msg, len(tasks), snapshotCompleted, snapshotFailed, res)
}
}(i, task)
}
wg.Wait()
if err := ctx.Err(); err != nil {
return "", err
}
var b strings.Builder
b.WriteString(fmt.Sprintf("已自动拆解为 %d 个任务并执行:\n\n", len(results)))
for _, r := range results {
@@ -163,7 +189,63 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
return strings.TrimSpace(b.String()), nil
}
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total int, res plannedTaskResult) {
func plannedProgressMilestones(total int) []int {
if total <= 3 {
return nil
}
points := []float64{0.33, 0.66}
out := make([]int, 0, len(points))
seen := map[int]struct{}{}
for _, p := range points {
step := int(math.Round(float64(total) * p))
if step <= 0 || step >= total {
continue
}
if _, ok := seen[step]; ok {
continue
}
seen[step] = struct{}{}
out = append(out, step)
}
return out
}
func shouldPublishPlannedTaskProgress(ctx context.Context, total, completed int, res plannedTaskResult, milestones []int, notified map[int]struct{}) bool {
if total <= 1 {
return false
}
if isPlannedTaskCancellation(ctx, res) {
return false
}
if strings.TrimSpace(res.ErrText) != "" {
return true
}
if completed >= total {
return false
}
for _, step := range milestones {
if completed != step {
continue
}
if _, ok := notified[step]; ok {
return false
}
return true
}
return false
}
func isPlannedTaskCancellation(ctx context.Context, res plannedTaskResult) bool {
if res.Err != nil && errors.Is(res.Err, context.Canceled) {
return true
}
if strings.EqualFold(strings.TrimSpace(res.ErrText), context.Canceled.Error()) {
return true
}
return ctx != nil && errors.Is(ctx.Err(), context.Canceled)
}
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total, completed, failed int, res plannedTaskResult) {
if al == nil || al.bus == nil || total <= 1 {
return
}
@@ -184,7 +266,7 @@ func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total in
body = "(无输出)"
}
body = summarizePlannedTaskProgressBody(body, 6, 320)
content := fmt.Sprintf("进度 %d/%d任务%d已%s\n%s", idx, total, idx, status, body)
content := fmt.Sprintf("阶段进度 %d/%d(失败 %d\n最近任务%d 已%s\n%s", completed, total, failed, idx, status, body)
al.bus.PublishOutbound(bus.OutboundMessage{
Channel: msg.Channel,
ChatID: msg.ChatID,

View File

@@ -0,0 +1,63 @@
package agent
import (
"context"
"errors"
"testing"
)
func TestPlannedProgressMilestones(t *testing.T) {
t.Parallel()
got := plannedProgressMilestones(12)
if len(got) != 2 || got[0] != 4 || got[1] != 8 {
t.Fatalf("unexpected milestones: %#v", got)
}
}
func TestShouldPublishPlannedTaskProgress(t *testing.T) {
t.Parallel()
milestones := plannedProgressMilestones(12)
notified := map[int]struct{}{}
if shouldPublishPlannedTaskProgress(context.Background(), 12, 1, plannedTaskResult{}, milestones, notified) {
t.Fatalf("did not expect early success notification")
}
if !shouldPublishPlannedTaskProgress(context.Background(), 12, 4, plannedTaskResult{}, milestones, notified) {
t.Fatalf("expected milestone notification")
}
notified[4] = struct{}{}
if shouldPublishPlannedTaskProgress(context.Background(), 12, 4, plannedTaskResult{}, milestones, notified) {
t.Fatalf("did not expect duplicate milestone notification")
}
if !shouldPublishPlannedTaskProgress(context.Background(), 12, 5, plannedTaskResult{ErrText: "boom"}, milestones, notified) {
t.Fatalf("expected failure notification")
}
if shouldPublishPlannedTaskProgress(context.Background(), 3, 3, plannedTaskResult{}, plannedProgressMilestones(3), map[int]struct{}{}) {
t.Fatalf("did not expect final success notification")
}
if shouldPublishPlannedTaskProgress(context.Background(), 12, 5, plannedTaskResult{Err: context.Canceled, ErrText: context.Canceled.Error()}, milestones, notified) {
t.Fatalf("did not expect cancellation notification")
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
if shouldPublishPlannedTaskProgress(ctx, 12, 5, plannedTaskResult{Err: errors.New("worker exited after parent stop"), ErrText: "worker exited after parent stop"}, milestones, notified) {
t.Fatalf("did not expect notification after parent cancellation")
}
}
func TestIsPlannedTaskCancellation(t *testing.T) {
t.Parallel()
if !isPlannedTaskCancellation(context.Background(), plannedTaskResult{Err: context.Canceled, ErrText: context.Canceled.Error()}) {
t.Fatalf("expected direct context cancellation to be detected")
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
if !isPlannedTaskCancellation(ctx, plannedTaskResult{Err: errors.New("worker exited after parent stop"), ErrText: "worker exited after parent stop"}) {
t.Fatalf("expected canceled parent context to suppress planned task result")
}
if isPlannedTaskCancellation(context.Background(), plannedTaskResult{Err: errors.New("boom"), ErrText: "boom"}) {
t.Fatalf("did not expect non-cancellation error to be suppressed")
}
}

View File

@@ -51,6 +51,7 @@ type SubagentTask struct {
type SubagentManager struct {
tasks map[string]*SubagentTask
cancelFuncs map[string]context.CancelFunc
recoverableTaskIDs []string
archiveAfterMinute int64
mu sync.RWMutex
provider providers.LLMProvider
@@ -99,9 +100,13 @@ func NewSubagentManager(provider providers.LLMProvider, workspace string, bus *b
if runStore != nil {
for _, task := range runStore.List() {
mgr.tasks[task.ID] = task
if task.Status == "running" {
mgr.recoverableTaskIDs = append(mgr.recoverableTaskIDs, task.ID)
}
}
mgr.nextID = runStore.NextIDSeed()
}
go mgr.resumeRecoveredTasks()
return mgr
}
@@ -534,6 +539,7 @@ func (sm *SubagentManager) SetRunFunc(f SubagentRunFunc) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.runFunc = f
go sm.resumeRecoveredTasks()
}
func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
@@ -542,6 +548,38 @@ func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
return sm.profileStore
}
func (sm *SubagentManager) resumeRecoveredTasks() {
if sm == nil {
return
}
sm.mu.Lock()
if sm.runFunc == nil && sm.provider == nil {
sm.mu.Unlock()
return
}
taskIDs := append([]string(nil), sm.recoverableTaskIDs...)
sm.recoverableTaskIDs = nil
toResume := make([]*SubagentTask, 0, len(taskIDs))
for _, taskID := range taskIDs {
task, ok := sm.tasks[taskID]
if !ok || task == nil || task.Status != "running" {
continue
}
task.Updated = time.Now().UnixMilli()
sm.persistTaskLocked(task, "recovered", "auto-resumed after restart")
toResume = append(toResume, task)
}
sm.mu.Unlock()
for _, task := range toResume {
taskCtx, cancel := context.WithCancel(context.Background())
sm.mu.Lock()
sm.cancelFuncs[task.ID] = cancel
sm.mu.Unlock()
go sm.runTask(taskCtx, task)
}
}
func (sm *SubagentManager) NextTaskSequence() int {
sm.mu.RLock()
defer sm.mu.RUnlock()

View File

@@ -205,6 +205,55 @@ func TestSubagentManagerRestoresPersistedRuns(t *testing.T) {
time.Sleep(100 * time.Millisecond)
}
func TestSubagentManagerAutoRecoversRunningTaskAfterRestart(t *testing.T) {
workspace := t.TempDir()
block := make(chan struct{})
manager := NewSubagentManager(nil, workspace, nil)
manager.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
<-block
return "should-not-complete-here", nil
})
_, err := manager.Spawn(context.Background(), SubagentSpawnOptions{
Task: "recover me",
AgentID: "coder",
OriginChannel: "cli",
OriginChatID: "direct",
})
if err != nil {
t.Fatalf("spawn failed: %v", err)
}
time.Sleep(80 * time.Millisecond)
recovered := make(chan string, 1)
reloaded := NewSubagentManager(nil, workspace, nil)
reloaded.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
recovered <- task.ID
return "recovered-ok", nil
})
select {
case taskID := <-recovered:
if taskID != "subagent-1" {
t.Fatalf("expected recovered task id subagent-1, got %s", taskID)
}
case <-time.After(2 * time.Second):
t.Fatalf("expected running task to auto-recover after restart")
}
got, ok := reloaded.GetTask("subagent-1")
if !ok {
t.Fatalf("expected recovered task to exist")
}
if got.Status != "completed" || got.Result != "recovered-ok" {
t.Fatalf("unexpected recovered task: %+v", got)
}
close(block)
_ = waitSubagentDone(t, manager, 4*time.Second)
time.Sleep(100 * time.Millisecond)
}
func TestSubagentManagerPersistsEvents(t *testing.T) {
workspace := t.TempDir()
manager := NewSubagentManager(nil, workspace, nil)

View File

@@ -115,6 +115,8 @@ type AgentTaskStats = {
running: number;
failed: number;
waiting: number;
latestStatus: string;
latestUpdated: number;
active: Array<{ id: string; status: string; title: string }>;
};
@@ -199,13 +201,18 @@ function buildTaskStats(tasks: SubagentTask[]): Record<string, AgentTaskStats> {
const agentID = normalizeTitle(task.agent_id, '');
if (!agentID) return acc;
if (!acc[agentID]) {
acc[agentID] = { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
acc[agentID] = { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
}
const item = acc[agentID];
item.total += 1;
if (task.status === 'running') item.running += 1;
if (task.status === 'failed') item.failed += 1;
if (task.waiting_for_reply) item.waiting += 1;
const updatedAt = Math.max(task.updated || 0, task.created || 0);
if (updatedAt >= item.latestUpdated) {
item.latestUpdated = updatedAt;
item.latestStatus = normalizeTitle(task.status, '');
item.failed = task.status === 'failed' ? 1 : 0;
}
if (task.status === 'running' || task.waiting_for_reply) {
item.active.push({
id: task.id,
@@ -400,6 +407,13 @@ const Subagents: React.FC = () => {
load().catch(() => { });
}, [q, selectedAgentID]);
useEffect(() => {
const timer = window.setInterval(() => {
load().catch(() => { });
}, 5000);
return () => window.clearInterval(timer);
}, [q, selectedAgentID]);
const selected = useMemo(() => items.find((x) => x.id === selectedId) || null, [items, selectedId]);
const selectedRegistryItem = useMemo(
() => registryItems.find((x) => x.agent_id === selectedAgentID) || null,
@@ -489,7 +503,7 @@ const Subagents: React.FC = () => {
failed: 0,
};
const localMainStats = taskStats[normalizeTitle(localRoot.agent_id, 'main')] || { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
const localMainStats = taskStats[normalizeTitle(localRoot.agent_id, 'main')] || { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
const localMainTask = recentTaskByAgent[normalizeTitle(localRoot.agent_id, 'main')];
localBranchStats.running += localMainStats.running;
localBranchStats.failed += localMainStats.failed;
@@ -512,7 +526,7 @@ const Subagents: React.FC = () => {
`transport=${normalizeTitle(localRoot.transport, 'local')} type=${normalizeTitle(localRoot.type, 'router')}`,
localMainStats.active[0] ? `task: ${localMainStats.active[0].title}` : t('noLiveTasks'),
],
accent: 'bg-amber-400',
accent: localMainStats.running > 0 ? 'bg-emerald-500' : localMainStats.latestStatus === 'failed' ? 'bg-red-500' : 'bg-amber-400',
clickable: true,
scale,
onClick: () => {
@@ -525,7 +539,7 @@ const Subagents: React.FC = () => {
localChildren.forEach((child, idx) => {
const childX = localOriginX + idx * (cardWidth + clusterGap);
const childY = childStartY;
const stats = taskStats[normalizeTitle(child.agent_id, '')] || { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
const stats = taskStats[normalizeTitle(child.agent_id, '')] || { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
const task = recentTaskByAgent[normalizeTitle(child.agent_id, '')];
localBranchStats.running += stats.running;
localBranchStats.failed += stats.failed;
@@ -547,7 +561,7 @@ const Subagents: React.FC = () => {
`transport=${normalizeTitle(child.transport, 'local')} type=${normalizeTitle(child.type, 'worker')}`,
stats.active[0] ? `task: ${stats.active[0].title}` : task ? `last: ${summarizeTask(task.task, task.label)}` : t('noLiveTasks'),
],
accent: stats.running > 0 ? 'bg-emerald-500' : stats.failed > 0 ? 'bg-red-500' : 'bg-sky-400',
accent: stats.running > 0 ? 'bg-emerald-500' : stats.latestStatus === 'failed' ? 'bg-red-500' : 'bg-sky-400',
clickable: true,
scale,
onClick: () => {