3 Commits

Author  SHA1        Message                                           Date
lpf     557633b698  Refresh agent topology state after task restart   2026-03-07 01:53:18 +08:00
lpf     823f96be5a  Stop planned task spam after cancellation         2026-03-06 21:31:21 +08:00
lpf     9d0ab54a97  Recover running subagent tasks after restart      2026-03-06 20:05:03 +08:00
6 changed files with 169 additions and 16 deletions

View File

@@ -9,6 +9,7 @@ package agent
import (
"context"
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"math"
@@ -544,6 +545,11 @@ func (al *AgentLoop) processInbound(ctx context.Context, msg bus.InboundMessage)
response, err := al.processPlannedMessage(ctx, msg)
if err != nil {
+ if errors.Is(err, context.Canceled) {
+ al.audit.Record(al.getTrigger(msg), msg.Channel, msg.SessionKey, true, err)
+ al.appendTaskAudit(taskID, msg, started, err, true)
+ return
+ }
response = fmt.Sprintf("Error processing message: %v", err)
}
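In the hunk above, a cancelled run is audited and then dropped instead of producing an "Error processing message" reply. The detection relies on errors.Is, which matches the context.Canceled sentinel even through layers of wrapping; a minimal standalone sketch of that behavior (not this repo's code):

package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	// Wrapping with %w preserves the sentinel for errors.Is.
	err := fmt.Errorf("task aborted: %w", ctx.Err())
	fmt.Println(errors.Is(err, context.Canceled)) // true

	// A plain string comparison would miss the wrapped form, which is
	// why isPlannedTaskCancellation (next file) only falls back to
	// text matching as a last resort.
	fmt.Println(err.Error() == context.Canceled.Error()) // false
}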

View File

@@ -4,6 +4,7 @@ import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"os"
@@ -27,6 +28,7 @@ type plannedTaskResult struct {
Index int
Task plannedTask
Output string
+ Err error
ErrText string
}
@@ -143,19 +145,19 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
subMsg.Metadata["planned_task_index"] = fmt.Sprintf("%d", t.Index)
subMsg.Metadata["planned_task_total"] = fmt.Sprintf("%d", len(tasks))
out, err := al.processMessage(ctx, subMsg)
- res := plannedTaskResult{Index: index, Task: t, Output: strings.TrimSpace(out)}
+ res := plannedTaskResult{Index: index, Task: t, Output: strings.TrimSpace(out), Err: err}
if err != nil {
res.ErrText = err.Error()
}
results[index] = res
progressMu.Lock()
completed++
if res.ErrText != "" {
if res.ErrText != "" && !isPlannedTaskCancellation(ctx, res) {
failed++
}
snapshotCompleted := completed
snapshotFailed := failed
- shouldNotify := shouldPublishPlannedTaskProgress(len(tasks), snapshotCompleted, res, milestones, notified)
+ shouldNotify := shouldPublishPlannedTaskProgress(ctx, len(tasks), snapshotCompleted, res, milestones, notified)
if shouldNotify && res.ErrText == "" {
notified[snapshotCompleted] = struct{}{}
}
@@ -167,6 +169,9 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
}(i, task)
}
wg.Wait()
+ if err := ctx.Err(); err != nil {
+ return "", err
+ }
var b strings.Builder
b.WriteString(fmt.Sprintf("已自动拆解为 %d 个任务并执行:\n\n", len(results)))
for _, r := range results {
@@ -205,10 +210,13 @@ func plannedProgressMilestones(total int) []int {
return out
}
- func shouldPublishPlannedTaskProgress(total, completed int, res plannedTaskResult, milestones []int, notified map[int]struct{}) bool {
+ func shouldPublishPlannedTaskProgress(ctx context.Context, total, completed int, res plannedTaskResult, milestones []int, notified map[int]struct{}) bool {
if total <= 1 {
return false
}
+ if isPlannedTaskCancellation(ctx, res) {
+ return false
+ }
if strings.TrimSpace(res.ErrText) != "" {
return true
}
@@ -227,6 +235,16 @@ func shouldPublishPlannedTaskProgress(total, completed int, res plannedTaskResul
return false
}
+ func isPlannedTaskCancellation(ctx context.Context, res plannedTaskResult) bool {
+ if res.Err != nil && errors.Is(res.Err, context.Canceled) {
+ return true
+ }
+ if strings.EqualFold(strings.TrimSpace(res.ErrText), context.Canceled.Error()) {
+ return true
+ }
+ return ctx != nil && errors.Is(ctx.Err(), context.Canceled)
+ }
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total, completed, failed int, res plannedTaskResult) {
if al == nil || al.bus == nil || total <= 1 {
return
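The tests in the next file pin down the throttling behavior: with 12 tasks, completing 1 stays silent, completing 4 notifies, repeating 4 is suppressed via the notified set, and any non-cancellation failure always notifies. A minimal standalone sketch of that dedup-by-milestone pattern (the milestone spacing here is an assumption for illustration; the real plannedProgressMilestones body is not shown in this diff):

package main

import (
	"fmt"
	"math"
)

// milestones picks the completion counts worth announcing,
// here at each third of the total (assumed spacing).
func milestones(total int) map[int]struct{} {
	out := map[int]struct{}{}
	for _, f := range []float64{1.0 / 3, 2.0 / 3} {
		out[int(math.Round(f*float64(total)))] = struct{}{}
	}
	return out
}

// shouldNotify reports whether completed is an unseen milestone;
// the notified set suppresses duplicate announcements.
func shouldNotify(completed int, ms, notified map[int]struct{}) bool {
	if _, hit := ms[completed]; !hit {
		return false
	}
	if _, seen := notified[completed]; seen {
		return false
	}
	notified[completed] = struct{}{}
	return true
}

func main() {
	ms := milestones(12) // {4, 8}
	notified := map[int]struct{}{}
	fmt.Println(shouldNotify(1, ms, notified)) // false: not a milestone
	fmt.Println(shouldNotify(4, ms, notified)) // true: first milestone
	fmt.Println(shouldNotify(4, ms, notified)) // false: already announced
}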

View File

@@ -1,6 +1,10 @@
package agent
import "testing"
import (
"context"
"errors"
"testing"
)
func TestPlannedProgressMilestones(t *testing.T) {
t.Parallel()
@@ -16,20 +20,44 @@ func TestShouldPublishPlannedTaskProgress(t *testing.T) {
milestones := plannedProgressMilestones(12)
notified := map[int]struct{}{}
- if shouldPublishPlannedTaskProgress(12, 1, plannedTaskResult{}, milestones, notified) {
+ if shouldPublishPlannedTaskProgress(context.Background(), 12, 1, plannedTaskResult{}, milestones, notified) {
t.Fatalf("did not expect early success notification")
}
- if !shouldPublishPlannedTaskProgress(12, 4, plannedTaskResult{}, milestones, notified) {
+ if !shouldPublishPlannedTaskProgress(context.Background(), 12, 4, plannedTaskResult{}, milestones, notified) {
t.Fatalf("expected milestone notification")
}
notified[4] = struct{}{}
- if shouldPublishPlannedTaskProgress(12, 4, plannedTaskResult{}, milestones, notified) {
+ if shouldPublishPlannedTaskProgress(context.Background(), 12, 4, plannedTaskResult{}, milestones, notified) {
t.Fatalf("did not expect duplicate milestone notification")
}
- if !shouldPublishPlannedTaskProgress(12, 5, plannedTaskResult{ErrText: "boom"}, milestones, notified) {
+ if !shouldPublishPlannedTaskProgress(context.Background(), 12, 5, plannedTaskResult{ErrText: "boom"}, milestones, notified) {
t.Fatalf("expected failure notification")
}
- if shouldPublishPlannedTaskProgress(3, 3, plannedTaskResult{}, plannedProgressMilestones(3), map[int]struct{}{}) {
+ if shouldPublishPlannedTaskProgress(context.Background(), 3, 3, plannedTaskResult{}, plannedProgressMilestones(3), map[int]struct{}{}) {
t.Fatalf("did not expect final success notification")
}
+ if shouldPublishPlannedTaskProgress(context.Background(), 12, 5, plannedTaskResult{Err: context.Canceled, ErrText: context.Canceled.Error()}, milestones, notified) {
+ t.Fatalf("did not expect cancellation notification")
+ }
+ ctx, cancel := context.WithCancel(context.Background())
+ cancel()
+ if shouldPublishPlannedTaskProgress(ctx, 12, 5, plannedTaskResult{Err: errors.New("worker exited after parent stop"), ErrText: "worker exited after parent stop"}, milestones, notified) {
+ t.Fatalf("did not expect notification after parent cancellation")
+ }
}
+ func TestIsPlannedTaskCancellation(t *testing.T) {
+ t.Parallel()
+ if !isPlannedTaskCancellation(context.Background(), plannedTaskResult{Err: context.Canceled, ErrText: context.Canceled.Error()}) {
+ t.Fatalf("expected direct context cancellation to be detected")
+ }
+ ctx, cancel := context.WithCancel(context.Background())
+ cancel()
+ if !isPlannedTaskCancellation(ctx, plannedTaskResult{Err: errors.New("worker exited after parent stop"), ErrText: "worker exited after parent stop"}) {
+ t.Fatalf("expected canceled parent context to suppress planned task result")
+ }
+ if isPlannedTaskCancellation(context.Background(), plannedTaskResult{Err: errors.New("boom"), ErrText: "boom"}) {
+ t.Fatalf("did not expect non-cancellation error to be suppressed")
+ }
+ }

View File

@@ -51,6 +51,7 @@ type SubagentTask struct {
type SubagentManager struct {
tasks map[string]*SubagentTask
cancelFuncs map[string]context.CancelFunc
+ recoverableTaskIDs []string
archiveAfterMinute int64
mu sync.RWMutex
provider providers.LLMProvider
@@ -99,9 +100,13 @@ func NewSubagentManager(provider providers.LLMProvider, workspace string, bus *b
if runStore != nil {
for _, task := range runStore.List() {
mgr.tasks[task.ID] = task
if task.Status == "running" {
mgr.recoverableTaskIDs = append(mgr.recoverableTaskIDs, task.ID)
}
}
mgr.nextID = runStore.NextIDSeed()
}
+ go mgr.resumeRecoveredTasks()
return mgr
}
@@ -534,6 +539,7 @@ func (sm *SubagentManager) SetRunFunc(f SubagentRunFunc) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.runFunc = f
+ go sm.resumeRecoveredTasks()
}
func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
@@ -542,6 +548,38 @@ func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
return sm.profileStore
}
+ func (sm *SubagentManager) resumeRecoveredTasks() {
+ if sm == nil {
+ return
+ }
+ sm.mu.Lock()
+ if sm.runFunc == nil && sm.provider == nil {
+ sm.mu.Unlock()
+ return
+ }
+ taskIDs := append([]string(nil), sm.recoverableTaskIDs...)
+ sm.recoverableTaskIDs = nil
+ toResume := make([]*SubagentTask, 0, len(taskIDs))
+ for _, taskID := range taskIDs {
+ task, ok := sm.tasks[taskID]
+ if !ok || task == nil || task.Status != "running" {
+ continue
+ }
+ task.Updated = time.Now().UnixMilli()
+ sm.persistTaskLocked(task, "recovered", "auto-resumed after restart")
+ toResume = append(toResume, task)
+ }
+ sm.mu.Unlock()
+ for _, task := range toResume {
+ taskCtx, cancel := context.WithCancel(context.Background())
+ sm.mu.Lock()
+ sm.cancelFuncs[task.ID] = cancel
+ sm.mu.Unlock()
+ go sm.runTask(taskCtx, task)
+ }
+ }
func (sm *SubagentManager) NextTaskSequence() int {
sm.mu.RLock()
defer sm.mu.RUnlock()
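resumeRecoveredTasks is kicked off from both NewSubagentManager and SetRunFunc, so it has to tolerate being called twice. The diff achieves that by draining recoverableTaskIDs under the mutex: the slice is copied and nilled in one critical section, so each recovered ID is handed to runTask at most once. A minimal standalone sketch of that drain pattern (not this repo's code):

package main

import (
	"fmt"
	"sync"
)

type resumer struct {
	mu      sync.Mutex
	pending []string
}

// drain atomically takes ownership of the pending IDs, so repeated
// or concurrent callers never resume the same task twice.
func (r *resumer) drain() []string {
	r.mu.Lock()
	defer r.mu.Unlock()
	ids := r.pending
	r.pending = nil
	return ids
}

func main() {
	r := &resumer{pending: []string{"subagent-1", "subagent-2"}}
	fmt.Println(r.drain()) // [subagent-1 subagent-2]
	fmt.Println(r.drain()) // []  (second call is a no-op)
}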

View File

@@ -205,6 +205,55 @@ func TestSubagentManagerRestoresPersistedRuns(t *testing.T) {
time.Sleep(100 * time.Millisecond)
}
+ func TestSubagentManagerAutoRecoversRunningTaskAfterRestart(t *testing.T) {
+ workspace := t.TempDir()
+ block := make(chan struct{})
+ manager := NewSubagentManager(nil, workspace, nil)
+ manager.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
+ <-block
+ return "should-not-complete-here", nil
+ })
+ _, err := manager.Spawn(context.Background(), SubagentSpawnOptions{
+ Task: "recover me",
+ AgentID: "coder",
+ OriginChannel: "cli",
+ OriginChatID: "direct",
+ })
+ if err != nil {
+ t.Fatalf("spawn failed: %v", err)
+ }
+ time.Sleep(80 * time.Millisecond)
+ recovered := make(chan string, 1)
+ reloaded := NewSubagentManager(nil, workspace, nil)
+ reloaded.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
+ recovered <- task.ID
+ return "recovered-ok", nil
+ })
+ select {
+ case taskID := <-recovered:
+ if taskID != "subagent-1" {
+ t.Fatalf("expected recovered task id subagent-1, got %s", taskID)
+ }
+ case <-time.After(2 * time.Second):
+ t.Fatalf("expected running task to auto-recover after restart")
+ }
+ got, ok := reloaded.GetTask("subagent-1")
+ if !ok {
+ t.Fatalf("expected recovered task to exist")
+ }
+ if got.Status != "completed" || got.Result != "recovered-ok" {
+ t.Fatalf("unexpected recovered task: %+v", got)
+ }
+ close(block)
+ _ = waitSubagentDone(t, manager, 4*time.Second)
+ time.Sleep(100 * time.Millisecond)
+ }
func TestSubagentManagerPersistsEvents(t *testing.T) {
workspace := t.TempDir()
manager := NewSubagentManager(nil, workspace, nil)

View File

@@ -115,6 +115,8 @@ type AgentTaskStats = {
running: number;
failed: number;
waiting: number;
+ latestStatus: string;
+ latestUpdated: number;
active: Array<{ id: string; status: string; title: string }>;
};
@@ -199,13 +201,18 @@ function buildTaskStats(tasks: SubagentTask[]): Record<string, AgentTaskStats> {
const agentID = normalizeTitle(task.agent_id, '');
if (!agentID) return acc;
if (!acc[agentID]) {
- acc[agentID] = { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
+ acc[agentID] = { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
}
const item = acc[agentID];
item.total += 1;
if (task.status === 'running') item.running += 1;
if (task.status === 'failed') item.failed += 1;
if (task.waiting_for_reply) item.waiting += 1;
+ const updatedAt = Math.max(task.updated || 0, task.created || 0);
+ if (updatedAt >= item.latestUpdated) {
+ item.latestUpdated = updatedAt;
+ item.latestStatus = normalizeTitle(task.status, '');
+ item.failed = task.status === 'failed' ? 1 : 0;
+ }
if (task.status === 'running' || task.waiting_for_reply) {
item.active.push({
id: task.id,
@@ -400,6 +407,13 @@ const Subagents: React.FC = () => {
load().catch(() => { });
}, [q, selectedAgentID]);
+ useEffect(() => {
+ const timer = window.setInterval(() => {
+ load().catch(() => { });
+ }, 5000);
+ return () => window.clearInterval(timer);
+ }, [q, selectedAgentID]);
const selected = useMemo(() => items.find((x) => x.id === selectedId) || null, [items, selectedId]);
const selectedRegistryItem = useMemo(
() => registryItems.find((x) => x.agent_id === selectedAgentID) || null,
@@ -489,7 +503,7 @@ const Subagents: React.FC = () => {
failed: 0,
};
- const localMainStats = taskStats[normalizeTitle(localRoot.agent_id, 'main')] || { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
+ const localMainStats = taskStats[normalizeTitle(localRoot.agent_id, 'main')] || { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
const localMainTask = recentTaskByAgent[normalizeTitle(localRoot.agent_id, 'main')];
localBranchStats.running += localMainStats.running;
localBranchStats.failed += localMainStats.failed;
@@ -512,7 +526,7 @@ const Subagents: React.FC = () => {
`transport=${normalizeTitle(localRoot.transport, 'local')} type=${normalizeTitle(localRoot.type, 'router')}`,
localMainStats.active[0] ? `task: ${localMainStats.active[0].title}` : t('noLiveTasks'),
],
- accent: 'bg-amber-400',
+ accent: localMainStats.running > 0 ? 'bg-emerald-500' : localMainStats.latestStatus === 'failed' ? 'bg-red-500' : 'bg-amber-400',
clickable: true,
scale,
onClick: () => {
@@ -525,7 +539,7 @@ const Subagents: React.FC = () => {
localChildren.forEach((child, idx) => {
const childX = localOriginX + idx * (cardWidth + clusterGap);
const childY = childStartY;
- const stats = taskStats[normalizeTitle(child.agent_id, '')] || { total: 0, running: 0, failed: 0, waiting: 0, active: [] };
+ const stats = taskStats[normalizeTitle(child.agent_id, '')] || { total: 0, running: 0, failed: 0, waiting: 0, latestStatus: '', latestUpdated: 0, active: [] };
const task = recentTaskByAgent[normalizeTitle(child.agent_id, '')];
localBranchStats.running += stats.running;
localBranchStats.failed += stats.failed;
@@ -547,7 +561,7 @@ const Subagents: React.FC = () => {
`transport=${normalizeTitle(child.transport, 'local')} type=${normalizeTitle(child.type, 'worker')}`,
stats.active[0] ? `task: ${stats.active[0].title}` : task ? `last: ${summarizeTask(task.task, task.label)}` : t('noLiveTasks'),
],
- accent: stats.running > 0 ? 'bg-emerald-500' : stats.failed > 0 ? 'bg-red-500' : 'bg-sky-400',
+ accent: stats.running > 0 ? 'bg-emerald-500' : stats.latestStatus === 'failed' ? 'bg-red-500' : 'bg-sky-400',
clickable: true,
scale,
onClick: () => {