Recover running subagent tasks after restart

Reduce planned task progress noise
2026-05-09 05:37:29 +08:00 · 2026-03-06 20:05:03 +08:00 · 2026-03-06 19:57:54 +08:00
4 changed files with 189 additions and 3 deletions
--- a/pkg/agent/session_planner.go
+++ b/pkg/agent/session_planner.go
@@ -5,6 +5,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"path/filepath"
 	"regexp"
@@ -123,6 +124,11 @@ func splitPlannedSegments(content string) []string {
 func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage, tasks []plannedTask) (string, error) {
 	results := make([]plannedTaskResult, len(tasks))
 	var wg sync.WaitGroup
+	var progressMu sync.Mutex
+	completed := 0
+	failed := 0
+	milestones := plannedProgressMilestones(len(tasks))
+	notified := make(map[int]struct{}, len(milestones))
 	for i, task := range tasks {
 		wg.Add(1)
 		go func(index int, t plannedTask) {
@@ -142,7 +148,22 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
 				res.ErrText = err.Error()
 			}
 			results[index] = res
-			al.publishPlannedTaskProgress(msg, len(tasks), res)
+			progressMu.Lock()
+			completed++
+			if res.ErrText != "" {
+				failed++
+			}
+			snapshotCompleted := completed
+			snapshotFailed := failed
+			shouldNotify := shouldPublishPlannedTaskProgress(len(tasks), snapshotCompleted, res, milestones, notified)
+			if shouldNotify && res.ErrText == "" {
+				notified[snapshotCompleted] = struct{}{}
+			}
+			progressMu.Unlock()
+
+			if shouldNotify {
+				al.publishPlannedTaskProgress(msg, len(tasks), snapshotCompleted, snapshotFailed, res)
+			}
 		}(i, task)
 	}
 	wg.Wait()
@@ -163,7 +184,50 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
 	return strings.TrimSpace(b.String()), nil
 }

-func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total int, res plannedTaskResult) {
+func plannedProgressMilestones(total int) []int { // Returns the completed-task counts, near 33% and 66% of total, used as progress milestones.
+	if total <= 3 {
+		return nil // Three or fewer tasks: no intermediate milestones.
+	}
+	points := []float64{0.33, 0.66}
+	out := make([]int, 0, len(points))
+	seen := map[int]struct{}{} // Drops a duplicate if both fractions round to the same step.
+	for _, p := range points {
+		step := int(math.Round(float64(total) * p))
+		if step <= 0 || step >= total { // Keep only steps strictly between 0 and total.
+			continue
+		}
+		if _, ok := seen[step]; ok {
+			continue
+		}
+		seen[step] = struct{}{}
+		out = append(out, step)
+	}
+	return out
+}
+
+func shouldPublishPlannedTaskProgress(total, completed int, res plannedTaskResult, milestones []int, notified map[int]struct{}) bool { // Reports whether a progress message is due for res, given completed out of total tasks.
+	if total <= 1 {
+		return false // Plans with at most one task never publish progress.
+	}
+	if strings.TrimSpace(res.ErrText) != "" {
+		return true // A non-blank error text is always reported; this check runs before the final-completion check below.
+	}
+	if completed >= total {
+		return false // No progress message for a success that completes the whole plan.
+	}
+	for _, step := range milestones {
+		if completed != step {
+			continue
+		}
+		if _, ok := notified[step]; ok { // notified is only read here; recording the step is left to the caller.
+			return false
+		}
+		return true
+	}
+	return false
+}
+
+func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total, completed, failed int, res plannedTaskResult) {
 	if al == nil || al.bus == nil || total <= 1 {
 		return
 	}
@@ -184,7 +248,7 @@ func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total in
 		body = "(无输出)"
 	}
 	body = summarizePlannedTaskProgressBody(body, 6, 320)
-	content := fmt.Sprintf("进度 %d/%d：任务%d已%s\n%s", idx, total, idx, status, body)
+	content := fmt.Sprintf("阶段进度 %d/%d（失败 %d）\n最近任务：%d 已%s\n%s", completed, total, failed, idx, status, body)
 	al.bus.PublishOutbound(bus.OutboundMessage{
 		Channel: msg.Channel,
 		ChatID:  msg.ChatID,
--- a/pkg/agent/session_planner_progress_test.go
+++ b/pkg/agent/session_planner_progress_test.go
@@ -0,0 +1,35 @@
+package agent
+
+import "testing"
+
+func TestPlannedProgressMilestones(t *testing.T) { // Checks the milestone steps computed for a 12-task plan.
+	t.Parallel()
+
+	got := plannedProgressMilestones(12)
+	if len(got) != 2 || got[0] != 4 || got[1] != 8 { // round(12*0.33) = 4 and round(12*0.66) = 8.
+		t.Fatalf("unexpected milestones: %#v", got)
+	}
+}
+
+func TestShouldPublishPlannedTaskProgress(t *testing.T) { // Walks the publish decision through non-milestone, milestone, duplicate, failure and final-success cases.
+	t.Parallel()
+
+	milestones := plannedProgressMilestones(12)
+	notified := map[int]struct{}{}
+	if shouldPublishPlannedTaskProgress(12, 1, plannedTaskResult{}, milestones, notified) { // completed=1 is not a milestone.
+		t.Fatalf("did not expect early success notification")
+	}
+	if !shouldPublishPlannedTaskProgress(12, 4, plannedTaskResult{}, milestones, notified) { // completed=4 is the first milestone.
+		t.Fatalf("expected milestone notification")
+	}
+	notified[4] = struct{}{} // Record the milestone by hand; the function under test does not write to notified.
+	if shouldPublishPlannedTaskProgress(12, 4, plannedTaskResult{}, milestones, notified) {
+		t.Fatalf("did not expect duplicate milestone notification")
+	}
+	if !shouldPublishPlannedTaskProgress(12, 5, plannedTaskResult{ErrText: "boom"}, milestones, notified) { // An error result publishes even off a milestone.
+		t.Fatalf("expected failure notification")
+	}
+	if shouldPublishPlannedTaskProgress(3, 3, plannedTaskResult{}, plannedProgressMilestones(3), map[int]struct{}{}) { // completed == total with a successful result.
+		t.Fatalf("did not expect final success notification")
+	}
+}
--- a/pkg/tools/subagent.go
+++ b/pkg/tools/subagent.go
@@ -51,6 +51,7 @@ type SubagentTask struct {
 type SubagentManager struct {
 	tasks              map[string]*SubagentTask
 	cancelFuncs        map[string]context.CancelFunc
+	recoverableTaskIDs []string
 	archiveAfterMinute int64
 	mu                 sync.RWMutex
 	provider           providers.LLMProvider
@@ -99,9 +100,13 @@ func NewSubagentManager(provider providers.LLMProvider, workspace string, bus *b
 	if runStore != nil {
 		for _, task := range runStore.List() {
 			mgr.tasks[task.ID] = task
+			if task.Status == "running" {
+				mgr.recoverableTaskIDs = append(mgr.recoverableTaskIDs, task.ID)
+			}
 		}
 		mgr.nextID = runStore.NextIDSeed()
 	}
+	go mgr.resumeRecoveredTasks()
 	return mgr
 }

@@ -534,6 +539,7 @@ func (sm *SubagentManager) SetRunFunc(f SubagentRunFunc) {
 	sm.mu.Lock()
 	defer sm.mu.Unlock()
 	sm.runFunc = f
+	go sm.resumeRecoveredTasks()
 }

 func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
@@ -542,6 +548,38 @@ func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
 	return sm.profileStore
 }

+func (sm *SubagentManager) resumeRecoveredTasks() { // Relaunches the tasks listed in recoverableTaskIDs that are still marked "running".
+	if sm == nil {
+		return
+	}
+	sm.mu.Lock()
+	if sm.runFunc == nil && sm.provider == nil { // Neither is set: return early and leave recoverableTaskIDs untouched for a later call.
+		sm.mu.Unlock()
+		return
+	}
+	taskIDs := append([]string(nil), sm.recoverableTaskIDs...)
+	sm.recoverableTaskIDs = nil // Cleared under the lock, so a subsequent call finds nothing left to resume.
+	toResume := make([]*SubagentTask, 0, len(taskIDs))
+	for _, taskID := range taskIDs {
+		task, ok := sm.tasks[taskID]
+		if !ok || task == nil || task.Status != "running" { // Skip IDs that are missing or no longer marked running.
+			continue
+		}
+		task.Updated = time.Now().UnixMilli()
+		sm.persistTaskLocked(task, "recovered", "auto-resumed after restart") // Presumably records a "recovered" event for the task; helper not visible here, TODO confirm.
+		toResume = append(toResume, task)
+	}
+	sm.mu.Unlock()
+
+	for _, task := range toResume {
+		taskCtx, cancel := context.WithCancel(context.Background()) // Fresh background context; no caller context is available in this method.
+		sm.mu.Lock()
+		sm.cancelFuncs[task.ID] = cancel // NOTE(review): the lock was released after the status check above; confirm the task cannot change state in between.
+		sm.mu.Unlock()
+		go sm.runTask(taskCtx, task)
+	}
+}
+
 func (sm *SubagentManager) NextTaskSequence() int {
 	sm.mu.RLock()
 	defer sm.mu.RUnlock()
--- a/pkg/tools/subagent_runtime_control_test.go
+++ b/pkg/tools/subagent_runtime_control_test.go
@@ -205,6 +205,55 @@ func TestSubagentManagerRestoresPersistedRuns(t *testing.T) {
 	time.Sleep(100 * time.Millisecond)
 }

+func TestSubagentManagerAutoRecoversRunningTaskAfterRestart(t *testing.T) { // A second manager on the same workspace must re-run a task the first manager left running.
+	workspace := t.TempDir()
+	block := make(chan struct{}) // Closed at the end of the test to release the first manager's run function.
+	manager := NewSubagentManager(nil, workspace, nil)
+	manager.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
+		<-block
+		return "should-not-complete-here", nil
+	})
+
+	_, err := manager.Spawn(context.Background(), SubagentSpawnOptions{
+		Task:          "recover me",
+		AgentID:       "coder",
+		OriginChannel: "cli",
+		OriginChatID:  "direct",
+	})
+	if err != nil {
+		t.Fatalf("spawn failed: %v", err)
+	}
+	time.Sleep(80 * time.Millisecond) // Presumably lets the spawned task start and be stored as running; TODO confirm.
+
+	recovered := make(chan string, 1)
+	reloaded := NewSubagentManager(nil, workspace, nil) // Same workspace as manager; stands in for the process after a restart.
+	reloaded.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
+		recovered <- task.ID
+		return "recovered-ok", nil
+	})
+
+	select {
+	case taskID := <-recovered:
+		if taskID != "subagent-1" { // The test expects the single spawned task to carry this ID.
+			t.Fatalf("expected recovered task id subagent-1, got %s", taskID)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatalf("expected running task to auto-recover after restart")
+	}
+
+	got, ok := reloaded.GetTask("subagent-1")
+	if !ok {
+		t.Fatalf("expected recovered task to exist")
+	}
+	if got.Status != "completed" || got.Result != "recovered-ok" { // NOTE(review): checked right after the run func sends the ID, before it has necessarily returned; confirm this cannot flake.
+		t.Fatalf("unexpected recovered task: %+v", got)
+	}
+
+	close(block) // Unblock the first manager's run function.
+	_ = waitSubagentDone(t, manager, 4*time.Second)
+	time.Sleep(100 * time.Millisecond)
+}
+
 func TestSubagentManagerPersistsEvents(t *testing.T) {
 	workspace := t.TempDir()
 	manager := NewSubagentManager(nil, workspace, nil)
Author	SHA1	Message	Date
lpf	9d0ab54a97	Recover running subagent tasks after restart	2026-03-06 20:05:03 +08:00
lpf	ee9326b2f2	Reduce planned task progress noise	2026-03-06 19:57:54 +08:00