2 Commits

Author SHA1 Message Date
lpf
9d0ab54a97 Recover running subagent tasks after restart 2026-03-06 20:05:03 +08:00
lpf
ee9326b2f2 Reduce planned task progress noise 2026-03-06 19:57:54 +08:00
4 changed files with 189 additions and 3 deletions

View File

@@ -5,6 +5,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"regexp"
@@ -123,6 +124,11 @@ func splitPlannedSegments(content string) []string {
func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage, tasks []plannedTask) (string, error) {
results := make([]plannedTaskResult, len(tasks))
var wg sync.WaitGroup
var progressMu sync.Mutex
completed := 0
failed := 0
milestones := plannedProgressMilestones(len(tasks))
notified := make(map[int]struct{}, len(milestones))
for i, task := range tasks {
wg.Add(1)
go func(index int, t plannedTask) {
@@ -142,7 +148,22 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
res.ErrText = err.Error()
}
results[index] = res
al.publishPlannedTaskProgress(msg, len(tasks), res)
progressMu.Lock()
completed++
if res.ErrText != "" {
failed++
}
snapshotCompleted := completed
snapshotFailed := failed
shouldNotify := shouldPublishPlannedTaskProgress(len(tasks), snapshotCompleted, res, milestones, notified)
if shouldNotify && res.ErrText == "" {
notified[snapshotCompleted] = struct{}{}
}
progressMu.Unlock()
if shouldNotify {
al.publishPlannedTaskProgress(msg, len(tasks), snapshotCompleted, snapshotFailed, res)
}
}(i, task)
}
wg.Wait()
@@ -163,7 +184,50 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
return strings.TrimSpace(b.String()), nil
}
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total int, res plannedTaskResult) {
// plannedProgressMilestones returns the completed-task counts at which an
// intermediate progress message is due for a batch of total tasks.
//
// Batches of three or fewer tasks get no milestones (nil). Larger batches get
// the counts nearest to 33% and 66% of total, in that order, dropping any
// count that rounds to zero or below, reaches total, or repeats an earlier
// milestone.
func plannedProgressMilestones(total int) []int {
	if total <= 3 {
		return nil
	}

	fractions := [...]float64{0.33, 0.66}
	marks := make([]int, 0, len(fractions))
	for _, frac := range fractions {
		mark := int(math.Round(float64(total) * frac))
		if mark <= 0 || mark >= total {
			continue
		}
		// The candidate list is tiny, so a linear scan is enough to
		// reject a repeated milestone.
		repeated := false
		for _, prev := range marks {
			if prev == mark {
				repeated = true
				break
			}
		}
		if !repeated {
			marks = append(marks, mark)
		}
	}
	return marks
}
// shouldPublishPlannedTaskProgress reports whether the task that just
// finished warrants a progress message.
//
// The rules are evaluated in order:
//   - batches of at most one task never publish;
//   - a result whose ErrText is non-blank always publishes;
//   - the final completion (completed >= total) does not publish;
//   - otherwise a message is due only when completed equals one of
//     milestones and that milestone is not yet present in notified.
//
// The function only reads notified; recording a milestone is left to the
// caller.
func shouldPublishPlannedTaskProgress(total, completed int, res plannedTaskResult, milestones []int, notified map[int]struct{}) bool {
	switch {
	case total <= 1:
		return false
	case strings.TrimSpace(res.ErrText) != "":
		return true
	case completed >= total:
		return false
	}

	for _, step := range milestones {
		if completed == step {
			// Only the first matching milestone decides the outcome.
			_, already := notified[step]
			return !already
		}
	}
	return false
}
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total, completed, failed int, res plannedTaskResult) {
if al == nil || al.bus == nil || total <= 1 {
return
}
@@ -184,7 +248,7 @@ func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total in
body = "(无输出)"
}
body = summarizePlannedTaskProgressBody(body, 6, 320)
content := fmt.Sprintf("进度 %d/%d任务%d已%s\n%s", idx, total, idx, status, body)
content := fmt.Sprintf("阶段进度 %d/%d(失败 %d\n最近任务%d 已%s\n%s", completed, total, failed, idx, status, body)
al.bus.PublishOutbound(bus.OutboundMessage{
Channel: msg.Channel,
ChatID: msg.ChatID,

View File

@@ -0,0 +1,35 @@
package agent
import "testing"
// TestPlannedProgressMilestones checks that a 12-task batch produces exactly
// two milestones, at 4 and 8 completed tasks.
func TestPlannedProgressMilestones(t *testing.T) {
	t.Parallel()

	got := plannedProgressMilestones(12)
	want := [...]int{4, 8}
	// The length comparison short-circuits before any element is indexed.
	if len(got) == len(want) && got[0] == want[0] && got[1] == want[1] {
		return
	}
	t.Fatalf("unexpected milestones: %#v", got)
}
// TestShouldPublishPlannedTaskProgress walks a 12-task batch through the
// notification rules: no message for an early success, one message per
// milestone, a message for every failure, and none for the final success.
func TestShouldPublishPlannedTaskProgress(t *testing.T) {
	t.Parallel()

	const total = 12
	steps := plannedProgressMilestones(total)
	sent := make(map[int]struct{})
	success := plannedTaskResult{}
	// publish fixes the arguments shared by every check on the 12-task batch.
	publish := func(completed int, res plannedTaskResult) bool {
		return shouldPublishPlannedTaskProgress(total, completed, res, steps, sent)
	}

	if publish(1, success) {
		t.Fatalf("did not expect early success notification")
	}
	if !publish(4, success) {
		t.Fatalf("expected milestone notification")
	}

	// Mark the milestone as delivered, as the caller would.
	sent[4] = struct{}{}
	if publish(4, success) {
		t.Fatalf("did not expect duplicate milestone notification")
	}
	if !publish(5, plannedTaskResult{ErrText: "boom"}) {
		t.Fatalf("expected failure notification")
	}

	// A small batch finishing successfully stays silent.
	smallSteps := plannedProgressMilestones(3)
	if shouldPublishPlannedTaskProgress(3, 3, success, smallSteps, map[int]struct{}{}) {
		t.Fatalf("did not expect final success notification")
	}
}

View File

@@ -51,6 +51,7 @@ type SubagentTask struct {
type SubagentManager struct {
tasks map[string]*SubagentTask
cancelFuncs map[string]context.CancelFunc
recoverableTaskIDs []string
archiveAfterMinute int64
mu sync.RWMutex
provider providers.LLMProvider
@@ -99,9 +100,13 @@ func NewSubagentManager(provider providers.LLMProvider, workspace string, bus *b
if runStore != nil {
for _, task := range runStore.List() {
mgr.tasks[task.ID] = task
if task.Status == "running" {
mgr.recoverableTaskIDs = append(mgr.recoverableTaskIDs, task.ID)
}
}
mgr.nextID = runStore.NextIDSeed()
}
go mgr.resumeRecoveredTasks()
return mgr
}
@@ -534,6 +539,7 @@ func (sm *SubagentManager) SetRunFunc(f SubagentRunFunc) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.runFunc = f
go sm.resumeRecoveredTasks()
}
func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
@@ -542,6 +548,38 @@ func (sm *SubagentManager) ProfileStore() *SubagentProfileStore {
return sm.profileStore
}
// resumeRecoveredTasks relaunches every task queued in recoverableTaskIDs
// that is still present in sm.tasks with status "running".
//
// It is a no-op on a nil receiver. When neither runFunc nor provider is set,
// it returns without touching the queue, so a later call can still pick the
// tasks up. Otherwise the queue is drained under sm.mu; each eligible task has
// its Updated timestamp refreshed, is passed to persistTaskLocked with the
// "recovered" marker, and gets a fresh cancellable context whose cancel func
// is stored in sm.cancelFuncs. The runTask goroutines are started only after
// the lock has been released.
func (sm *SubagentManager) resumeRecoveredTasks() {
	if sm == nil {
		return
	}

	sm.mu.Lock()
	if sm.runFunc == nil && sm.provider == nil {
		sm.mu.Unlock()
		return
	}

	// Detach the queue while holding the lock so a concurrent call sees it
	// empty and cannot resume the same task twice.
	taskIDs := sm.recoverableTaskIDs
	sm.recoverableTaskIDs = nil

	launches := make([]func(), 0, len(taskIDs))
	for _, taskID := range taskIDs {
		task, ok := sm.tasks[taskID]
		if !ok || task == nil || task.Status != "running" {
			continue
		}
		task.Updated = time.Now().UnixMilli()
		sm.persistTaskLocked(task, "recovered", "auto-resumed after restart")

		// Register the cancel func in the same critical section that marks
		// the task as recovered: there is then no moment at which the task is
		// visible as resumed but has no entry in cancelFuncs, and the lock is
		// not re-acquired once per task.
		taskCtx, cancel := context.WithCancel(context.Background())
		sm.cancelFuncs[task.ID] = cancel
		launches = append(launches, func() { sm.runTask(taskCtx, task) })
	}
	sm.mu.Unlock()

	// Start the workers outside the lock, as before.
	for _, launch := range launches {
		go launch()
	}
}
func (sm *SubagentManager) NextTaskSequence() int {
sm.mu.RLock()
defer sm.mu.RUnlock()

View File

@@ -205,6 +205,55 @@ func TestSubagentManagerRestoresPersistedRuns(t *testing.T) {
time.Sleep(100 * time.Millisecond)
}
// TestSubagentManagerAutoRecoversRunningTaskAfterRestart verifies that a task
// left in the "running" state by one manager is picked up and run to
// completion by a second manager created on the same workspace.
func TestSubagentManagerAutoRecoversRunningTaskAfterRestart(t *testing.T) {
	workspace := t.TempDir()

	// The first manager's run func blocks so its task is still "running"
	// when the second manager is created.
	block := make(chan struct{})
	manager := NewSubagentManager(nil, workspace, nil)
	manager.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
		<-block
		return "should-not-complete-here", nil
	})
	_, err := manager.Spawn(context.Background(), SubagentSpawnOptions{
		Task:          "recover me",
		AgentID:       "coder",
		OriginChannel: "cli",
		OriginChatID:  "direct",
	})
	if err != nil {
		t.Fatalf("spawn failed: %v", err)
	}
	time.Sleep(80 * time.Millisecond)

	// Simulate a restart: a new manager on the same workspace.
	recovered := make(chan string, 1)
	reloaded := NewSubagentManager(nil, workspace, nil)
	reloaded.SetRunFunc(func(ctx context.Context, task *SubagentTask) (string, error) {
		recovered <- task.ID
		return "recovered-ok", nil
	})
	select {
	case taskID := <-recovered:
		if taskID != "subagent-1" {
			t.Fatalf("expected recovered task id subagent-1, got %s", taskID)
		}
	case <-time.After(2 * time.Second):
		t.Fatalf("expected running task to auto-recover after restart")
	}

	// The run func signals on recovered before it returns, so the manager may
	// not have recorded the result yet. Poll until the task is completed
	// instead of asserting on the first read.
	deadline := time.Now().Add(2 * time.Second)
	for {
		got, ok := reloaded.GetTask("subagent-1")
		if !ok {
			t.Fatalf("expected recovered task to exist")
		}
		if got.Status == "completed" {
			if got.Result != "recovered-ok" {
				t.Fatalf("unexpected recovered task: %+v", got)
			}
			break
		}
		if time.Now().After(deadline) {
			t.Fatalf("unexpected recovered task: %+v", got)
		}
		time.Sleep(10 * time.Millisecond)
	}

	// Release the first manager's blocked run func and let it wind down.
	close(block)
	_ = waitSubagentDone(t, manager, 4*time.Second)
	time.Sleep(100 * time.Millisecond)
}
func TestSubagentManagerPersistsEvents(t *testing.T) {
workspace := t.TempDir()
manager := NewSubagentManager(nil, workspace, nil)