feat: harden concurrency scheduling and task watchdog

This commit is contained in:
lpf
2026-03-05 11:32:06 +08:00
parent 0f3196f305
commit 2fbb98bccd
20 changed files with 1526 additions and 159 deletions

View File

@@ -955,10 +955,6 @@ func buildHeartbeatService(cfg *config.Config, msgBus *bus.MessageBus) *heartbea
func buildAutonomyEngine(cfg *config.Config, msgBus *bus.MessageBus) *autonomy.Engine {
a := cfg.Agents.Defaults.Autonomy
maxRoundsWithoutUser := a.MaxRoundsWithoutUser
if maxRoundsWithoutUser == 0 && cfg.Agents.Defaults.RuntimeControl.AutonomyMaxRoundsWithoutUser > 0 {
maxRoundsWithoutUser = cfg.Agents.Defaults.RuntimeControl.AutonomyMaxRoundsWithoutUser
}
idleRoundBudgetReleaseSec := a.IdleRoundBudgetReleaseSec
if idleRoundBudgetReleaseSec == 0 {
idleRoundBudgetReleaseSec = 1800
@@ -990,7 +986,7 @@ func buildAutonomyEngine(cfg *config.Config, msgBus *bus.MessageBus) *autonomy.E
NotifySameReasonCooldownSec: a.NotifySameReasonCooldownSec,
QuietHours: a.QuietHours,
UserIdleResumeSec: a.UserIdleResumeSec,
MaxRoundsWithoutUser: maxRoundsWithoutUser,
MaxRoundsWithoutUser: a.MaxRoundsWithoutUser,
TaskHistoryRetentionDays: a.TaskHistoryRetentionDays,
WaitingResumeDebounceSec: a.WaitingResumeDebounceSec,
IdleRoundBudgetReleaseSec: idleRoundBudgetReleaseSec,

View File

@@ -17,14 +17,14 @@
"enabled": false,
"tick_interval_sec": 30,
"min_run_interval_sec": 20,
"max_pending_duration_sec": 180,
"max_pending_duration_sec": 900,
"max_consecutive_stalls": 3,
"max_dispatch_per_tick": 2,
"max_dispatch_per_tick": 0,
"notify_cooldown_sec": 300,
"notify_same_reason_cooldown_sec": 900,
"quiet_hours": "23:00-08:00",
"user_idle_resume_sec": 20,
"max_rounds_without_user": 12,
"max_rounds_without_user": 0,
"task_history_retention_days": 3,
"waiting_resume_debounce_sec": 5,
"idle_round_budget_release_sec": 1800,
@@ -61,8 +61,8 @@
"autonomy_tick_interval_sec": 20,
"autonomy_min_run_interval_sec": 20,
"autonomy_idle_threshold_sec": 20,
"autonomy_max_rounds_without_user": 120,
"autonomy_max_pending_duration_sec": 180,
"autonomy_max_rounds_without_user": 0,
"autonomy_max_pending_duration_sec": 900,
"autonomy_max_consecutive_stalls": 3,
"autolearn_max_rounds_without_user": 200,
"run_state_ttl_seconds": 1800,

View File

@@ -734,23 +734,6 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
return fmt.Sprintf(tpl, lang), nil
}
// Update tool contexts
if tool, ok := al.tools.Get("message"); ok {
if mt, ok := tool.(*tools.MessageTool); ok {
mt.SetContext(msg.Channel, msg.ChatID)
}
}
if tool, ok := al.tools.Get("spawn"); ok {
if st, ok := tool.(*tools.SpawnTool); ok {
st.SetContext(msg.Channel, msg.ChatID)
}
}
if tool, ok := al.tools.Get("remind"); ok {
if rt, ok := tool.(*tools.RemindTool); ok {
rt.SetContext(msg.Channel, msg.ChatID)
}
}
history := al.sessions.GetHistory(msg.SessionKey)
summary := al.sessions.GetSummary(msg.SessionKey)
memoryRecallUsed := false
@@ -948,7 +931,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
"iteration": iteration,
})
result, err := al.tools.Execute(ctx, tc.Name, tc.Arguments)
execArgs := withToolContextArgs(tc.Name, tc.Arguments, msg.Channel, msg.ChatID)
result, err := al.tools.Execute(ctx, tc.Name, execArgs)
if err != nil {
result = fmt.Sprintf("Error: %v", err)
}
@@ -1168,18 +1152,6 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe
// Use the origin session for context
sessionKey := fmt.Sprintf("%s:%s", originChannel, originChatID)
// Update tool contexts to original channel/chatID
if tool, ok := al.tools.Get("message"); ok {
if mt, ok := tool.(*tools.MessageTool); ok {
mt.SetContext(originChannel, originChatID)
}
}
if tool, ok := al.tools.Get("spawn"); ok {
if st, ok := tool.(*tools.SpawnTool); ok {
st.SetContext(originChannel, originChatID)
}
}
// Build messages with the announce content
history := al.sessions.GetHistory(sessionKey)
summary := al.sessions.GetSummary(sessionKey)
@@ -1273,7 +1245,8 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe
al.sessions.AddMessageFull(sessionKey, assistantMsg)
for _, tc := range response.ToolCalls {
result, err := al.tools.Execute(ctx, tc.Name, tc.Arguments)
execArgs := withToolContextArgs(tc.Name, tc.Arguments, originChannel, originChatID)
result, err := al.tools.Execute(ctx, tc.Name, execArgs)
if err != nil {
result = fmt.Sprintf("Error: %v", err)
}
@@ -1657,6 +1630,42 @@ func truncateString(s string, maxLen int) string {
return s[:maxLen-3] + "..."
}
// withToolContextArgs returns a copy of args with default routing fields
// ("channel", "chat_id") filled in for messaging-capable tools, so each tool
// call is targeted at the conversation that triggered it.
//
// Rules:
//   - Only "message", "spawn" and "remind" receive injected context.
//   - Explicitly provided values are never overwritten.
//   - For "message", an explicit "to" target suppresses chat_id injection.
//   - An empty channel or chatID disables injection entirely.
func withToolContextArgs(toolName string, args map[string]interface{}, channel, chatID string) map[string]interface{} {
	if channel == "" || chatID == "" {
		return args
	}
	isContextTool := toolName == "message" || toolName == "spawn" || toolName == "remind"
	if !isContextTool {
		return args
	}
	out := make(map[string]interface{}, len(args)+2)
	for key, val := range args {
		out[key] = val
	}
	if _, exists := out["channel"]; !exists {
		out["channel"] = channel
	}
	injectChat := true
	if toolName == "message" {
		// An explicit "to" target means the model chose its own destination.
		if _, hasTo := out["to"]; hasTo {
			injectChat = false
		}
	}
	if injectChat {
		if _, exists := out["chat_id"]; !exists {
			out["chat_id"] = chatID
		}
	}
	return out
}
func shouldRecallMemory(text string, keywords []string) bool {
s := strings.ToLower(strings.TrimSpace(text))
if s == "" {

View File

@@ -0,0 +1,36 @@
package agent
import "testing"
// TestWithToolContextArgsInjectsDefaults verifies that a "message" tool call
// without explicit routing fields gets the session's channel and chat_id injected.
func TestWithToolContextArgsInjectsDefaults(t *testing.T) {
	args := map[string]interface{}{"message": "hello"}
	got := withToolContextArgs("message", args, "telegram", "chat-1")
	if got["channel"] != "telegram" {
		t.Fatalf("expected channel injected, got %v", got["channel"])
	}
	if got["chat_id"] != "chat-1" {
		t.Fatalf("expected chat_id injected, got %v", got["chat_id"])
	}
}
// TestWithToolContextArgsPreservesExplicitTarget verifies that an explicit
// "to" target on the message tool suppresses chat_id injection and survives unchanged.
func TestWithToolContextArgsPreservesExplicitTarget(t *testing.T) {
	args := map[string]interface{}{"message": "hello", "to": "target-2"}
	got := withToolContextArgs("message", args, "telegram", "chat-1")
	if _, ok := got["chat_id"]; ok {
		t.Fatalf("chat_id should not be injected when 'to' is provided")
	}
	if got["to"] != "target-2" {
		t.Fatalf("expected to preserved, got %v", got["to"])
	}
}
// TestWithToolContextArgsSkipsUnrelatedTools verifies that tools outside
// message/spawn/remind receive their argument map untouched.
func TestWithToolContextArgsSkipsUnrelatedTools(t *testing.T) {
	args := map[string]interface{}{"query": "x"}
	got := withToolContextArgs("memory_search", args, "telegram", "chat-1")
	if len(got) != len(args) {
		t.Fatalf("expected unchanged args for unrelated tool")
	}
	if _, ok := got["channel"]; ok {
		t.Fatalf("unexpected channel key for unrelated tool")
	}
}

View File

@@ -8,7 +8,6 @@ import (
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
@@ -140,11 +139,10 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
res.ErrText = err.Error()
}
results[index] = res
al.publishPlannedTaskProgress(msg, len(tasks), res)
}(i, task)
}
wg.Wait()
sort.SliceStable(results, func(i, j int) bool { return results[i].Task.Index < results[j].Task.Index })
var b strings.Builder
b.WriteString(fmt.Sprintf("已自动拆解为 %d 个任务并执行:\n\n", len(results)))
for _, r := range results {
@@ -162,6 +160,35 @@ func (al *AgentLoop) runPlannedTasks(ctx context.Context, msg bus.InboundMessage
return strings.TrimSpace(b.String()), nil
}
// publishPlannedTaskProgress sends an incremental progress notice to the chat
// that triggered a multi-task plan, one outbound message per finished task.
// Suppressed for single-task plans (nothing worth reporting) and for
// system/internal channels (no human recipient).
func (al *AgentLoop) publishPlannedTaskProgress(msg bus.InboundMessage, total int, res plannedTaskResult) {
	if al == nil || al.bus == nil || total <= 1 {
		return
	}
	if msg.Channel == "system" || msg.Channel == "internal" {
		return
	}
	// Prefer the planner-assigned index; fall back to slice position (1-based).
	idx := res.Task.Index
	if idx <= 0 {
		idx = res.Index + 1
	}
	status := "完成"
	body := strings.TrimSpace(res.Output)
	if res.ErrText != "" {
		status = "失败"
		body = strings.TrimSpace(res.ErrText)
	}
	if body == "" {
		body = "(无输出)"
	}
	// Flatten to one line and cap the length; NOTE(review): truncate is
	// presumably a sibling helper in this package — confirm byte vs rune semantics.
	body = truncate(strings.ReplaceAll(body, "\n", " "), 280)
	content := fmt.Sprintf("进度 %d/%d任务%d已%s\n%s", idx, total, idx, status, body)
	al.bus.PublishOutbound(bus.OutboundMessage{
		Channel: msg.Channel,
		ChatID:  msg.ChatID,
		Content: content,
	})
}
func (al *AgentLoop) enrichTaskContentWithMemoryAndEKG(ctx context.Context, task plannedTask) string {
base := strings.TrimSpace(task.Content)
if base == "" {

View File

@@ -4,9 +4,14 @@ import (
"context"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"testing"
"time"
"clawgo/pkg/bus"
"clawgo/pkg/config"
"clawgo/pkg/ekg"
"clawgo/pkg/providers"
)
@@ -54,6 +59,115 @@ func TestProcessPlannedMessage_AggregatesResults(t *testing.T) {
}
}
// probeProvider is an LLM-provider test double that records concurrency:
// it tracks the number of in-flight Chat calls and their high-water mark so
// tests can assert parallelism or mutual exclusion of planned tasks.
type probeProvider struct {
	mu            sync.Mutex
	inFlight      int           // currently executing Chat calls
	maxInFlight   int           // high-water mark of inFlight
	delayPerCall  time.Duration // artificial latency to widen overlap windows
	responseCount int           // total Chat calls served (embedded in response text)
}
// Chat simulates one LLM round-trip: it bumps the in-flight counter (updating
// the concurrency high-water mark), sleeps delayPerCall, then returns a
// deterministic "done-N" completion with finish reason "stop".
func (p *probeProvider) Chat(_ context.Context, _ []providers.Message, _ []providers.ToolDefinition, _ string, _ map[string]interface{}) (*providers.LLMResponse, error) {
	p.mu.Lock()
	p.inFlight++
	if p.inFlight > p.maxInFlight {
		p.maxInFlight = p.inFlight
	}
	p.responseCount++
	p.mu.Unlock()
	// Sleep outside the lock so concurrent calls genuinely overlap.
	time.Sleep(p.delayPerCall)
	p.mu.Lock()
	n := p.responseCount
	p.inFlight--
	p.mu.Unlock()
	resp := providers.LLMResponse{Content: "done-" + strconv.Itoa(n), FinishReason: "stop"}
	return &resp, nil
}
// GetDefaultModel satisfies the provider interface with a fixed test model name.
func (p *probeProvider) GetDefaultModel() string { return "test-model" }
// TestRunPlannedTasks_NonConflictingKeysCanRunInParallel asserts that two
// planned tasks declaring different resource_keys overlap in execution:
// the probe provider must observe at least two concurrent Chat calls.
func TestRunPlannedTasks_NonConflictingKeysCanRunInParallel(t *testing.T) {
	p := &probeProvider{delayPerCall: 100 * time.Millisecond}
	cfg := config.DefaultConfig()
	cfg.Agents.Defaults.Workspace = filepath.Join(t.TempDir(), "workspace")
	cfg.Agents.Defaults.MaxToolIterations = 2
	cfg.Agents.Defaults.ContextCompaction.Enabled = false
	loop := NewAgentLoop(cfg, bus.NewMessageBus(), p, nil)
	_, err := loop.processPlannedMessage(context.Background(), bus.InboundMessage{
		Channel:    "cli",
		SenderID:   "u",
		ChatID:     "direct",
		SessionKey: "sess-plan-parallel",
		Content:    "[resource_keys: file:pkg/a.go] 修复 a[resource_keys: file:pkg/b.go] 修复 b",
	})
	if err != nil {
		t.Fatalf("processPlannedMessage error: %v", err)
	}
	if p.maxInFlight < 2 {
		t.Fatalf("expected parallel execution for non-conflicting keys, got maxInFlight=%d", p.maxInFlight)
	}
}
// TestRunPlannedTasks_ConflictingKeysMutuallyExclusive asserts that two
// planned tasks declaring the SAME resource_key are serialized: the probe
// provider must never observe more than one concurrent Chat call.
func TestRunPlannedTasks_ConflictingKeysMutuallyExclusive(t *testing.T) {
	p := &probeProvider{delayPerCall: 100 * time.Millisecond}
	cfg := config.DefaultConfig()
	cfg.Agents.Defaults.Workspace = filepath.Join(t.TempDir(), "workspace")
	cfg.Agents.Defaults.MaxToolIterations = 2
	cfg.Agents.Defaults.ContextCompaction.Enabled = false
	loop := NewAgentLoop(cfg, bus.NewMessageBus(), p, nil)
	_, err := loop.processPlannedMessage(context.Background(), bus.InboundMessage{
		Channel:    "cli",
		SenderID:   "u",
		ChatID:     "direct",
		SessionKey: "sess-plan-locked",
		Content:    "[resource_keys: file:pkg/a.go] 修复 a[resource_keys: file:pkg/a.go] 补测试",
	})
	if err != nil {
		t.Fatalf("processPlannedMessage error: %v", err)
	}
	if p.maxInFlight != 1 {
		t.Fatalf("expected mutual exclusion for conflicting keys, got maxInFlight=%d", p.maxInFlight)
	}
}
// TestRunPlannedTasks_PublishesStepProgress asserts that a two-task plan emits
// two outbound progress messages ("进度 1/2" and "进度 2/2") on the message bus.
func TestRunPlannedTasks_PublishesStepProgress(t *testing.T) {
	rp := &recordingProvider{responses: []providers.LLMResponse{
		{Content: "done-a", FinishReason: "stop"},
		{Content: "done-b", FinishReason: "stop"},
	}}
	loop := setupLoop(t, rp)
	_, err := loop.processPlannedMessage(context.Background(), bus.InboundMessage{
		Channel:    "cli",
		SenderID:   "u",
		ChatID:     "direct",
		SessionKey: "sess-plan-progress",
		Content:    "修复 pkg/a.go补充 pkg/b.go 测试",
	})
	if err != nil {
		t.Fatalf("processPlannedMessage error: %v", err)
	}
	// Drain the two expected progress notifications with a deadline so a
	// missing publish fails fast instead of hanging.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	out1, ok := loop.bus.SubscribeOutbound(ctx)
	if !ok {
		t.Fatalf("expected first progress outbound")
	}
	out2, ok := loop.bus.SubscribeOutbound(ctx)
	if !ok {
		t.Fatalf("expected second progress outbound")
	}
	// Order of completion is not guaranteed; check both markers across both messages.
	all := out1.Content + "\n" + out2.Content
	if !strings.Contains(all, "进度 1/2") || !strings.Contains(all, "进度 2/2") {
		t.Fatalf("unexpected progress outputs:\n%s", all)
	}
}
func TestFindRecentRelatedErrorEvent(t *testing.T) {
ws := filepath.Join(t.TempDir(), "workspace")
_ = os.MkdirAll(filepath.Join(ws, "memory"), 0o755)

View File

@@ -95,7 +95,8 @@ func NewEngine(opts Options, msgBus *bus.MessageBus) *Engine {
if opts.MaxConsecutiveStalls <= 0 {
opts.MaxConsecutiveStalls = 3
}
if opts.MaxDispatchPerTick <= 0 {
// max_dispatch_per_tick <= 0 means "unlimited dispatch per tick".
if opts.MaxDispatchPerTick < 0 {
opts.MaxDispatchPerTick = 2
}
if opts.NotifyCooldownSec <= 0 {
@@ -296,7 +297,7 @@ func (e *Engine) tick() {
dispatched := 0
for _, st := range ordered {
if dispatched >= e.opts.MaxDispatchPerTick {
if e.opts.MaxDispatchPerTick > 0 && dispatched >= e.opts.MaxDispatchPerTick {
break
}
if st.Status == "completed" {
@@ -601,9 +602,6 @@ func (e *Engine) dispatchTask(st *taskState) {
func (e *Engine) sendCompletionNotification(st *taskState) {
e.writeReflectLog("complete", st, "task marked completed")
e.writeTriggerAudit("complete", st, "")
if !e.isHighValueCompletion(st) {
return
}
if !e.shouldNotify("done:"+st.ID, "") {
return
}

View File

@@ -948,26 +948,26 @@ func extractFeishuMessageContent(message *larkim.EventMessage) (string, []string
}
switch msgType {
case string(larkim.MsgTypeText):
case larkim.MsgTypeText:
var textPayload struct {
Text string `json:"text"`
}
if err := json.Unmarshal([]byte(raw), &textPayload); err == nil {
return textPayload.Text, nil
}
case string(larkim.MsgTypePost):
case larkim.MsgTypePost:
md, media := parseFeishuPostToMarkdown(raw)
if md != "" || len(media) > 0 {
return md, media
}
case string(larkim.MsgTypeImage):
case larkim.MsgTypeImage:
var img struct {
ImageKey string `json:"image_key"`
}
if err := json.Unmarshal([]byte(raw), &img); err == nil && img.ImageKey != "" {
return "[image]", []string{"feishu:image:" + img.ImageKey}
}
case string(larkim.MsgTypeFile):
case larkim.MsgTypeFile:
var f struct {
FileKey string `json:"file_key"`
FileName string `json:"file_name"`

View File

@@ -30,8 +30,8 @@ func Validate(cfg *Config) []error {
if rc.AutonomyIdleThresholdSec < 5 {
errs = append(errs, fmt.Errorf("agents.defaults.runtime_control.autonomy_idle_threshold_sec must be >= 5"))
}
if rc.AutonomyMaxRoundsWithoutUser <= 0 {
errs = append(errs, fmt.Errorf("agents.defaults.runtime_control.autonomy_max_rounds_without_user must be > 0"))
if rc.AutonomyMaxRoundsWithoutUser < 0 {
errs = append(errs, fmt.Errorf("agents.defaults.runtime_control.autonomy_max_rounds_without_user must be >= 0"))
}
if rc.AutonomyMaxPendingDurationSec < 10 {
errs = append(errs, fmt.Errorf("agents.defaults.runtime_control.autonomy_max_pending_duration_sec must be >= 10"))
@@ -96,8 +96,8 @@ func Validate(cfg *Config) []error {
if aut.MaxConsecutiveStalls <= 0 {
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.max_consecutive_stalls must be > 0 when enabled=true"))
}
if aut.MaxDispatchPerTick <= 0 {
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.max_dispatch_per_tick must be > 0 when enabled=true"))
if aut.MaxDispatchPerTick < 0 {
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.max_dispatch_per_tick must be >= 0 when enabled=true"))
}
if aut.NotifyCooldownSec <= 0 {
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.notify_cooldown_sec must be > 0 when enabled=true"))

View File

@@ -1929,7 +1929,7 @@ func (s *RegistryServer) handleWebUITaskAudit(w http.ResponseWriter, r *http.Req
http.Error(w, "unauthorized", http.StatusUnauthorized)
return
}
if r.Method != http.MethodGet {
if r.Method != http.MethodGet && r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
return
}
@@ -2052,11 +2052,10 @@ func (s *RegistryServer) handleWebUITaskQueue(w http.ResponseWriter, r *http.Req
path := filepath.Join(strings.TrimSpace(s.workspacePath), "memory", "task-audit.jsonl")
includeHeartbeat := r.URL.Query().Get("include_heartbeat") == "1"
b, err := os.ReadFile(path)
if err != nil {
_ = json.NewEncoder(w).Encode(map[string]interface{}{"ok": true, "running": []map[string]interface{}{}, "items": []map[string]interface{}{}})
return
lines := []string{}
if err == nil {
lines = strings.Split(string(b), "\n")
}
lines := strings.Split(string(b), "\n")
type agg struct {
Last map[string]interface{}
Logs []string
@@ -2144,6 +2143,104 @@ func (s *RegistryServer) handleWebUITaskQueue(w http.ResponseWriter, r *http.Req
}
}
// Merge command watchdog queue from memory/task_queue.json for visibility.
queuePath := filepath.Join(strings.TrimSpace(s.workspacePath), "memory", "task_queue.json")
if qb, qErr := os.ReadFile(queuePath); qErr == nil {
var q map[string]interface{}
if json.Unmarshal(qb, &q) == nil {
if arr, ok := q["running"].([]interface{}); ok {
for _, item := range arr {
row, ok := item.(map[string]interface{})
if !ok {
continue
}
id := fmt.Sprintf("%v", row["id"])
if strings.TrimSpace(id) == "" {
continue
}
label := fmt.Sprintf("%v", row["label"])
source := strings.TrimSpace(fmt.Sprintf("%v", row["source"]))
if source == "" {
source = "command_watchdog"
}
rec := map[string]interface{}{
"task_id": "cmd:" + id,
"time": fmt.Sprintf("%v", row["started_at"]),
"status": "running",
"source": "command_watchdog",
"channel": source,
"session": "watchdog:" + id,
"input_preview": label,
"duration_ms": 0,
"attempts": 1,
"retry_count": 0,
"logs": []string{
fmt.Sprintf("watchdog source=%s heavy=%v", source, row["heavy"]),
fmt.Sprintf("next_check_at=%v stalled_rounds=%v/%v", row["next_check_at"], row["stalled_rounds"], row["stall_round_limit"]),
},
"idle_run": true,
}
items = append(items, rec)
running = append(running, rec)
}
}
if arr, ok := q["waiting"].([]interface{}); ok {
for _, item := range arr {
row, ok := item.(map[string]interface{})
if !ok {
continue
}
id := fmt.Sprintf("%v", row["id"])
if strings.TrimSpace(id) == "" {
continue
}
label := fmt.Sprintf("%v", row["label"])
source := strings.TrimSpace(fmt.Sprintf("%v", row["source"]))
if source == "" {
source = "command_watchdog"
}
rec := map[string]interface{}{
"task_id": "cmd:" + id,
"time": fmt.Sprintf("%v", row["enqueued_at"]),
"status": "waiting",
"source": "command_watchdog",
"channel": source,
"session": "watchdog:" + id,
"input_preview": label,
"duration_ms": 0,
"attempts": 1,
"retry_count": 0,
"logs": []string{
fmt.Sprintf("watchdog source=%s heavy=%v", source, row["heavy"]),
fmt.Sprintf("enqueued_at=%v", row["enqueued_at"]),
},
"idle_run": true,
}
items = append(items, rec)
}
}
if wd, ok := q["watchdog"].(map[string]interface{}); ok {
items = append(items, map[string]interface{}{
"task_id": "cmd:watchdog",
"time": fmt.Sprintf("%v", q["time"]),
"status": "running",
"source": "command_watchdog",
"channel": "watchdog",
"session": "watchdog:stats",
"input_preview": "command watchdog capacity snapshot",
"duration_ms": 0,
"attempts": 1,
"retry_count": 0,
"logs": []string{
fmt.Sprintf("cpu_total=%v usage_ratio=%v reserve_pct=%v", wd["cpu_total"], wd["usage_ratio"], wd["reserve_pct"]),
fmt.Sprintf("active=%v/%v heavy=%v/%v waiting=%v running=%v", wd["active"], wd["max_active"], wd["active_heavy"], wd["max_heavy"], wd["waiting"], wd["running"]),
},
"idle_run": true,
})
}
}
}
sort.Slice(items, func(i, j int) bool { return fmt.Sprintf("%v", items[i]["time"]) > fmt.Sprintf("%v", items[j]["time"]) })
stats := map[string]int{"total": len(items), "running": len(running), "idle_round_budget": 0, "active_user": 0, "manual_pause": 0}
for _, it := range items {

797
pkg/tools/command_tick.go Normal file
View File

@@ -0,0 +1,797 @@
package tools
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
)
const (
	// Bounds for the adaptive per-command progress-check interval.
	minCommandTick = 1 * time.Second
	maxCommandTick = 45 * time.Second
	// Frequency of the global watchdog supervision loop.
	watchdogTick = 1 * time.Second
	// Bounds for the adaptive system-usage sampling cycle
	// (see computeWorldCycleLocked).
	minWorldCycle = 10 * time.Second
	maxWorldCycle = 60 * time.Second
)
// ErrCommandNoProgress is returned (wrapped) when a watched command is killed
// after too many tick rounds without observable progress.
var ErrCommandNoProgress = errors.New("command no progress across tick rounds")

// commandRuntimePolicy captures per-command supervision parameters derived
// from the command's estimated difficulty and the host hardware.
type commandRuntimePolicy struct {
	BaseTick        time.Duration // initial progress-check interval (clamped to tick bounds)
	StallRoundLimit int           // unchanged-progress rounds tolerated before kill
	MaxRestarts     int           // restart budget after a stall kill
	Difficulty      int           // 1..4 estimate (see commandDifficulty)
}
// buildCommandRuntimePolicy derives per-command watchdog settings from the
// command's estimated difficulty and the host CPU count: harder commands and
// weaker machines get more stall patience and a larger restart budget.
func buildCommandRuntimePolicy(command string, baseTick time.Duration) commandRuntimePolicy {
	diff := commandDifficulty(command)
	cpu := runtime.NumCPU()

	// Baseline of 5 unchanged-progress ticks, plus 2 per difficulty step above 1.
	stallLimit := 5 + (diff-1)*2
	// Weaker CPUs get extra patience before a command is declared stalled.
	switch {
	case cpu <= 4:
		stallLimit += 5
	case cpu <= 8:
		stallLimit += 3
	case cpu <= 16:
		stallLimit++
	}
	if stallLimit < 5 {
		stallLimit = 5
	} else if stallLimit > 24 {
		stallLimit = 24
	}

	// Restart budget: heavy tasks and small machines earn extra retries, capped at 3.
	restarts := 1
	if diff >= 3 {
		restarts++
	}
	if cpu <= 4 {
		restarts++
	}
	if restarts > 3 {
		restarts = 3
	}

	return commandRuntimePolicy{
		BaseTick:        normalizeCommandTick(baseTick),
		StallRoundLimit: stallLimit,
		MaxRestarts:     restarts,
		Difficulty:      diff,
	}
}
// commandWatchdog supervises externally executed commands process-wide: it
// enforces CPU-aware concurrency limits, queues commands when slots are
// exhausted, kills commands whose progress stalls, and persists a JSON queue
// snapshot for observability. All mutable state is guarded by mu.
type commandWatchdog struct {
	mu           sync.Mutex
	watches      map[uint64]*watchedCommand // running commands by id
	waiters      []*watchWaiter             // FIFO queue of slot requests
	nextID       uint64                     // id source for watches/waiters (use atomic)
	cpuTotal     int                        // runtime.NumCPU() at construction
	baseActive   int                        // static ceiling on active slots
	baseHeavy    int                        // static ceiling on heavy slots
	reservePct   float64                    // CPU fraction reserved for the rest of the system
	usageRatio   float64                    // last sampled load1/cpu, clamped to [0, 0.95]
	lastSample   time.Time                  // when usageRatio was last refreshed
	worldCycle   time.Duration              // adaptive interval between usage samples
	nextSampleAt time.Time                  // next usage-sample deadline
	active       int                        // currently granted slots
	activeHeavy  int                        // currently granted heavy slots
	queueLimit   int                        // initial waiter capacity hint (not enforced as a hard cap in visible code)
	queuePath    string                     // snapshot file path ("" disables persistence)
}

// watchedCommand tracks one running command's stall-detection schedule.
type watchedCommand struct {
	id              uint64
	cmd             *exec.Cmd
	startedAt       time.Time
	baseTick        time.Duration // normalized initial check interval
	stallRoundLimit int           // unchanged-progress rounds before kill (<=0 disables kill)
	nextCheckAt     time.Time     // next progress-check deadline
	lastProgress    int           // last observed progress value
	stalledRounds   int           // consecutive checks without progress
	progressFn      func() int    // caller-supplied progress probe (guarded by safeProgress)
	stallNotify     chan int      // receives stalledRounds when the command is killed
	heavy           bool          // counts against the heavy-slot limit
	source          string        // originating subsystem (for snapshots)
	label           string        // human-readable command label (for snapshots)
}

// stalledCommand carries a stall decision out of the locked section of tick
// so the kill and notification happen without holding mu.
type stalledCommand struct {
	cmd    *exec.Cmd
	rounds int
	notify chan int
}

// watchWaiter is a queued slot request created by acquireSlot.
type watchWaiter struct {
	id         uint64
	heavy      bool
	ready      chan struct{} // buffered(1); signaled when a slot is granted
	source     string
	label      string
	enqueuedAt time.Time
}
// globalCommandWatchdog is the process-wide supervisor shared by all commands.
var globalCommandWatchdog = newCommandWatchdog()

// reLoadAverage matches the first figure of `uptime`-style output
// ("load average:" on Linux, "load averages:" on BSD/macOS; comma or dot decimals).
var reLoadAverage = regexp.MustCompile(`load averages?:\s*([0-9]+(?:[.,][0-9]+)?)`)
// newCommandWatchdog builds a watchdog with CPU-derived concurrency limits and
// starts its background supervision loop. The goroutine runs for the lifetime
// of the process; there is no stop mechanism.
func newCommandWatchdog() *commandWatchdog {
	cpu := runtime.NumCPU()
	baseActive, baseHeavy, queueLimit := deriveWatchdogLimits(cpu)
	wd := &commandWatchdog{
		watches:    make(map[uint64]*watchedCommand),
		waiters:    make([]*watchWaiter, 0, queueLimit),
		cpuTotal:   cpu,
		baseActive: baseActive,
		baseHeavy:  baseHeavy,
		reservePct: 0.20, // keep ~20% CPU headroom for the rest of the system
		usageRatio: 0,
		worldCycle: 20 * time.Second, // initial sampling cadence; adapted later
		queueLimit: queueLimit,
	}
	go wd.loop()
	return wd
}
// deriveWatchdogLimits maps a CPU count onto static concurrency ceilings:
// total active command slots (2..12), heavy-command slots (1..4, roughly one
// per four cores), and a waiter-queue capacity hint (8x active, at least 16).
func deriveWatchdogLimits(cpu int) (maxActive, maxHeavy, queueLimit int) {
	clamp := func(v, lo, hi int) int {
		if v < lo {
			return lo
		}
		if v > hi {
			return hi
		}
		return v
	}
	if cpu <= 0 {
		cpu = 2 // defensive default for exotic runtimes
	}
	maxActive = clamp(cpu, 2, 12)
	maxHeavy = clamp(cpu/4+1, 1, 4)
	queueLimit = maxActive * 8
	if queueLimit < 16 {
		queueLimit = 16
	}
	return maxActive, maxHeavy, queueLimit
}
// loop drives the watchdog: every watchdogTick it refreshes the system-usage
// sample (internally rate-limited by the world cycle) and runs one
// supervision tick. Runs forever; the ticker is never externally stopped.
func (wd *commandWatchdog) loop() {
	ticker := time.NewTicker(watchdogTick)
	defer ticker.Stop()
	for now := range ticker.C {
		wd.refreshSystemUsage(now)
		wd.tick(now)
	}
}
// refreshSystemUsage re-samples the host load ratio at most once per "world
// cycle". The sample itself runs with the mutex released because reading load
// information may be slow and must not block other watchdog operations.
func (wd *commandWatchdog) refreshSystemUsage(now time.Time) {
	if wd == nil {
		return
	}
	wd.mu.Lock()
	if wd.nextSampleAt.IsZero() {
		// First call: sample immediately.
		wd.nextSampleAt = now
	}
	if now.Before(wd.nextSampleAt) {
		wd.mu.Unlock()
		return
	}
	wd.lastSample = now
	cpu := wd.cpuTotal
	// Recompute the adaptive cadence and schedule the next sample before
	// releasing the lock, so concurrent callers see a consistent deadline.
	cycle := wd.computeWorldCycleLocked()
	wd.worldCycle = cycle
	wd.nextSampleAt = now.Add(cycle)
	wd.mu.Unlock()
	usage := sampleSystemUsageRatio(cpu)
	wd.mu.Lock()
	wd.usageRatio = usage
	wd.mu.Unlock()
}
// computeWorldCycleLocked picks the next usage-sampling interval, clamped to
// [minWorldCycle, maxWorldCycle]. Caller must hold wd.mu.
func (wd *commandWatchdog) computeWorldCycleLocked() time.Duration {
	if wd == nil {
		return 20 * time.Second
	}
	// Game-world style cycle:
	// base=20s; busier world => shorter cycle; idle world => longer cycle.
	cycle := 20 * time.Second
	pending := len(wd.waiters)
	if pending > 0 {
		// Up to -8s when commands are queued waiting for slots.
		cycle -= time.Duration(minInt(pending, 8)) * time.Second
	}
	if wd.active > wd.baseActive/2 {
		cycle -= 3 * time.Second
	}
	if wd.active == 0 && pending == 0 {
		// Fully idle: sample less often.
		cycle += 10 * time.Second
	}
	if cycle < minWorldCycle {
		cycle = minWorldCycle
	}
	if cycle > maxWorldCycle {
		cycle = maxWorldCycle
	}
	return cycle
}
// register puts an already-started command under stall supervision and
// returns an idempotent unregister func. On each check the watchdog calls
// progressFn; after stallRoundLimit consecutive checks without an increase
// the command is killed and stallNotify receives the round count.
func (wd *commandWatchdog) register(cmd *exec.Cmd, baseTick time.Duration, stallRoundLimit int, progressFn func() int, stallNotify chan int, heavy bool, source, label string) func() {
	if wd == nil || cmd == nil {
		return func() {}
	}
	base := normalizeCommandTick(baseTick)
	id := atomic.AddUint64(&wd.nextID, 1)
	w := &watchedCommand{
		id:              id,
		cmd:             cmd,
		startedAt:       time.Now(),
		baseTick:        base,
		stallRoundLimit: stallRoundLimit,
		nextCheckAt:     time.Now().Add(base),
		lastProgress:    safeProgress(progressFn), // baseline progress at start
		progressFn:      progressFn,
		stallNotify:     stallNotify,
		heavy:           heavy,
		source:          strings.TrimSpace(source),
		label:           strings.TrimSpace(label),
	}
	wd.mu.Lock()
	wd.watches[id] = w
	snap := wd.buildQueueSnapshotLocked()
	wd.mu.Unlock()
	wd.writeQueueSnapshot(snap)
	// sync.Once makes the returned closure safe to call from multiple cleanup
	// paths (deferred unwatch plus explicit teardown).
	var once sync.Once
	return func() {
		once.Do(func() {
			wd.mu.Lock()
			delete(wd.watches, id)
			snap := wd.buildQueueSnapshotLocked()
			wd.mu.Unlock()
			wd.writeQueueSnapshot(snap)
		})
	}
}
// setQueuePath points the watchdog at the JSON file used for queue snapshots;
// an empty string disables persistence. A fresh snapshot is written
// immediately when the path actually changes.
func (wd *commandWatchdog) setQueuePath(path string) {
	if wd == nil {
		return
	}
	path = strings.TrimSpace(path)
	if path != "" {
		path = filepath.Clean(path)
	}
	wd.mu.Lock()
	changed := wd.queuePath != path
	wd.queuePath = path
	snap := wd.buildQueueSnapshotLocked()
	wd.mu.Unlock()
	if changed {
		wd.writeQueueSnapshot(snap)
	}
}
// acquireSlot reserves one execution slot (and, for heavy commands, one heavy
// slot) before a command may start. When all slots are busy the caller is
// queued as a waiter and blocks until scheduleWaitersLocked signals readiness
// or ctx is cancelled. On success it returns an idempotent release func; on
// cancellation it returns ctx.Err().
func (wd *commandWatchdog) acquireSlot(ctx context.Context, heavy bool, source, label string) (func(), error) {
	if wd == nil {
		return func() {}, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	wd.mu.Lock()
	if wd.canAcquireSlotLocked(heavy) {
		wd.grantSlotLocked(heavy)
		snap := wd.buildQueueSnapshotLocked()
		wd.mu.Unlock()
		wd.writeQueueSnapshot(snap)
		return wd.releaseSlotFunc(heavy), nil
	}
	// Queue when slots are full; wait until a slot is available or context cancels.
	waitID := atomic.AddUint64(&wd.nextID, 1)
	w := &watchWaiter{
		id:         waitID,
		heavy:      heavy,
		ready:      make(chan struct{}, 1),
		source:     strings.TrimSpace(source),
		label:      strings.TrimSpace(label),
		enqueuedAt: time.Now(),
	}
	wd.waiters = append(wd.waiters, w)
	snap := wd.buildQueueSnapshotLocked()
	wd.mu.Unlock()
	wd.writeQueueSnapshot(snap)
	select {
	case <-ctx.Done():
		wd.mu.Lock()
		wd.removeWaiterLocked(waitID)
		snap := wd.buildQueueSnapshotLocked()
		wd.mu.Unlock()
		wd.writeQueueSnapshot(snap)
		// BUG FIX: the scheduler may have granted us a slot concurrently with
		// cancellation — it removes the waiter and buffers a signal on ready
		// before we take the lock, so removeWaiterLocked is a no-op. Without
		// draining the grant here, that slot would leak permanently.
		select {
		case <-w.ready:
			wd.releaseSlotFunc(heavy)()
		default:
		}
		return nil, ctx.Err()
	case <-w.ready:
		return wd.releaseSlotFunc(heavy), nil
	}
}
// releaseSlotFunc builds the idempotent release closure handed to slot
// owners: releasing decrements the counters (never below zero), promotes any
// eligible waiters, and persists a fresh queue snapshot.
func (wd *commandWatchdog) releaseSlotFunc(heavy bool) func() {
	var once sync.Once
	return func() {
		once.Do(func() {
			wd.mu.Lock()
			if wd.active > 0 {
				wd.active--
			}
			if heavy && wd.activeHeavy > 0 {
				wd.activeHeavy--
			}
			// A slot just freed up — wake waiters while still holding the lock.
			wd.scheduleWaitersLocked()
			snap := wd.buildQueueSnapshotLocked()
			wd.mu.Unlock()
			wd.writeQueueSnapshot(snap)
		})
	}
}
// canAcquireSlotLocked reports whether a new command (heavy or not) fits
// under the current dynamic limits. Caller must hold wd.mu.
func (wd *commandWatchdog) canAcquireSlotLocked(heavy bool) bool {
	maxActive, maxHeavy := wd.dynamicLimitsLocked()
	switch {
	case wd.active >= maxActive:
		return false
	case heavy && wd.activeHeavy >= maxHeavy:
		return false
	default:
		return true
	}
}
// grantSlotLocked consumes one active slot (and a heavy slot when
// applicable). Caller must hold wd.mu and have checked canAcquireSlotLocked.
func (wd *commandWatchdog) grantSlotLocked(heavy bool) {
	wd.active++
	if heavy {
		wd.activeHeavy++
	}
}
// dynamicLimitsLocked computes the current slot ceilings from the CPU count,
// reserve percentage and the latest sampled usage ratio. Caller must hold
// wd.mu (the fields read are mutex-guarded).
func (wd *commandWatchdog) dynamicLimitsLocked() (maxActive, maxHeavy int) {
	if wd == nil {
		return 1, 1
	}
	maxActive = computeDynamicActiveSlots(wd.cpuTotal, wd.reservePct, wd.usageRatio, wd.baseActive)
	maxHeavy = computeDynamicHeavySlots(maxActive, wd.baseHeavy)
	return
}
// computeDynamicActiveSlots derives the current active-slot ceiling from CPU
// headroom: slots = cpu * (1 - reservePct - usageRatio), at least 1, and
// never above the static baseActive ceiling when one is configured.
// Out-of-range inputs are clamped (reservePct to (0, 0.90], usageRatio to
// [0, 0.95]).
func computeDynamicActiveSlots(cpu int, reservePct, usageRatio float64, baseActive int) int {
	if cpu <= 0 {
		cpu = 1
	}
	switch {
	case reservePct <= 0:
		reservePct = 0.20
	case reservePct > 0.90:
		reservePct = 0.90
	}
	switch {
	case usageRatio < 0:
		usageRatio = 0
	case usageRatio > 0.95:
		usageRatio = 0.95
	}
	free := 1.0 - reservePct - usageRatio
	if free < 0 {
		free = 0
	}
	slots := int(float64(cpu) * free)
	if slots < 1 {
		slots = 1
	}
	if baseActive > 0 && slots > baseActive {
		slots = baseActive
	}
	return slots
}
// computeDynamicHeavySlots derives the heavy-command ceiling: roughly half
// the active slots plus one, capped by the static baseHeavy ceiling (when
// positive) and never above maxActive itself.
func computeDynamicHeavySlots(maxActive, baseHeavy int) int {
	if maxActive <= 0 {
		return 1
	}
	// maxActive >= 1 here, so the candidate is always >= 1.
	heavy := maxActive/2 + 1
	if baseHeavy > 0 && heavy > baseHeavy {
		heavy = baseHeavy
	}
	if heavy > maxActive {
		heavy = maxActive
	}
	return heavy
}
// scheduleWaitersLocked grants freed slots to queued waiters. It scans the
// queue repeatedly until a full pass admits nobody; a non-heavy waiter behind
// a blocked heavy waiter can still be admitted because the scan skips over
// non-admissible entries rather than stopping. Caller must hold wd.mu.
func (wd *commandWatchdog) scheduleWaitersLocked() {
	if len(wd.waiters) == 0 {
		return
	}
	for {
		progress := false
		for i := 0; i < len(wd.waiters); {
			w := wd.waiters[i]
			if w == nil {
				// Defensive: drop nil entries in place.
				wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
				progress = true
				continue
			}
			if !wd.canAcquireSlotLocked(w.heavy) {
				i++
				continue
			}
			wd.grantSlotLocked(w.heavy)
			wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
			// Non-blocking send: ready is buffered(1), so the grant signal is
			// never lost even if the waiter has not reached its select yet.
			select {
			case w.ready <- struct{}{}:
			default:
			}
			progress = true
		}
		if !progress {
			break
		}
	}
}
// removeWaiterLocked drops the waiter with the given id from the queue, if it
// is still present (it may already have been promoted). Caller must hold wd.mu.
func (wd *commandWatchdog) removeWaiterLocked(id uint64) {
	if id == 0 || len(wd.waiters) == 0 {
		return
	}
	for i := range wd.waiters {
		w := wd.waiters[i]
		if w != nil && w.id == id {
			wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
			return
		}
	}
}
// tick runs one supervision pass: commands whose check deadline has arrived
// are probed for progress; a command stalled for stallRoundLimit consecutive
// rounds is removed and queued for termination. Kills and stall
// notifications happen after the mutex is released so a slow Process.Kill
// cannot block the whole watchdog.
func (wd *commandWatchdog) tick(now time.Time) {
	if wd == nil {
		return
	}
	toStall := make([]stalledCommand, 0, 4)
	changed := false
	wd.mu.Lock()
	for id, w := range wd.watches {
		if w == nil {
			// Deleting during range is safe for Go maps.
			delete(wd.watches, id)
			changed = true
			continue
		}
		if now.Before(w.nextCheckAt) {
			continue
		}
		cur := safeProgress(w.progressFn)
		if cur > w.lastProgress {
			w.lastProgress = cur
			w.stalledRounds = 0
		} else {
			w.stalledRounds++
			changed = true
			if w.stallRoundLimit > 0 && w.stalledRounds >= w.stallRoundLimit {
				delete(wd.watches, id)
				changed = true
				toStall = append(toStall, stalledCommand{
					cmd:    w.cmd,
					rounds: w.stalledRounds,
					notify: w.stallNotify,
				})
				continue
			}
		}
		// Back off the check interval as the command ages (see nextCommandTick).
		next := nextCommandTick(w.baseTick, now.Sub(w.startedAt))
		w.nextCheckAt = now.Add(next)
		changed = true
	}
	snap := wd.buildQueueSnapshotLocked()
	wd.mu.Unlock()
	if changed {
		wd.writeQueueSnapshot(snap)
	}
	for _, st := range toStall {
		if st.cmd != nil && st.cmd.Process != nil {
			// Best effort; the runner's Wait() surfaces the final exit state.
			_ = st.cmd.Process.Kill()
		}
		if st.notify != nil {
			// Non-blocking: the runner may have already returned.
			select {
			case st.notify <- st.rounds:
			default:
			}
		}
	}
}
// safeProgress invokes progressFn defensively: a nil callback, a panic inside
// the callback, or a negative result all collapse to 0, so a broken progress
// probe can never take down the watchdog loop.
func safeProgress(progressFn func() int) (progress int) {
	if progressFn == nil {
		return 0
	}
	defer func() {
		if recover() != nil {
			progress = 0
		}
	}()
	if v := progressFn(); v > 0 {
		progress = v
	}
	return progress
}
// runCommandWithDynamicTick starts cmd under full watchdog supervision: it
// first acquires a concurrency slot (queueing if the host is busy; difficulty
// >= 3 counts as heavy), then watches progress via progressFn at an adaptive
// interval. Returns a wrapped ErrCommandNoProgress when the command was
// killed for stalling, the command's own exit error otherwise, or ctx.Err()
// on cancellation.
func runCommandWithDynamicTick(ctx context.Context, cmd *exec.Cmd, source, label string, difficulty int, baseTick time.Duration, stallRoundLimit int, progressFn func() int) error {
	base := normalizeCommandTick(baseTick)
	heavy := difficulty >= 3 // scale defined by commandDifficulty
	releaseSlot, err := globalCommandWatchdog.acquireSlot(ctx, heavy, source, label)
	if err != nil {
		return err
	}
	defer releaseSlot()
	if err := cmd.Start(); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()
	stallNotify := make(chan int, 1)
	unwatch := globalCommandWatchdog.register(cmd, base, stallRoundLimit, progressFn, stallNotify, heavy, source, label)
	defer unwatch()
	for {
		select {
		case err := <-done:
			return err
		case stalledRounds := <-stallNotify:
			// The watchdog already killed the process; give Wait a short
			// grace period so the exit error can be folded into the message.
			select {
			case err := <-done:
				return fmt.Errorf("%w: %d ticks without progress (%v)", ErrCommandNoProgress, stalledRounds, err)
			case <-time.After(2 * time.Second):
				return fmt.Errorf("%w: %d ticks without progress", ErrCommandNoProgress, stalledRounds)
			}
		case <-ctx.Done():
			if cmd.Process != nil {
				_ = cmd.Process.Kill()
			}
			// Briefly wait for Wait() so the goroutine can finish; a non-nil
			// exit error takes precedence over ctx.Err().
			select {
			case err := <-done:
				if err != nil {
					return err
				}
			case <-time.After(2 * time.Second):
			}
			return ctx.Err()
		}
	}
}
// buildQueueSnapshotLocked assembles the JSON-ready observability snapshot:
// capacity stats plus one record per running and waiting command. Returned
// map is consumed by writeQueueSnapshot. Caller must hold wd.mu.
func (wd *commandWatchdog) buildQueueSnapshotLocked() map[string]interface{} {
	if wd == nil {
		return nil
	}
	maxActive, maxHeavy := wd.dynamicLimitsLocked()
	running := make([]map[string]interface{}, 0, len(wd.watches))
	for _, w := range wd.watches {
		if w == nil {
			continue
		}
		running = append(running, map[string]interface{}{
			"id":                w.id,
			"source":            queueNonEmpty(w.source, "exec"),
			"label":             w.label,
			"heavy":             w.heavy,
			"status":            "running",
			"started_at":        w.startedAt.UTC().Format(time.RFC3339),
			"next_check_at":     w.nextCheckAt.UTC().Format(time.RFC3339),
			"stalled_rounds":    w.stalledRounds,
			"stall_round_limit": w.stallRoundLimit,
			"last_progress":     w.lastProgress,
		})
	}
	waiting := make([]map[string]interface{}, 0, len(wd.waiters))
	for _, w := range wd.waiters {
		if w == nil {
			continue
		}
		waiting = append(waiting, map[string]interface{}{
			"id":          w.id,
			"source":      queueNonEmpty(w.source, "exec"),
			"label":       w.label,
			"heavy":       w.heavy,
			"status":      "waiting",
			"enqueued_at": w.enqueuedAt.UTC().Format(time.RFC3339),
		})
	}
	return map[string]interface{}{
		"time": time.Now().UTC().Format(time.RFC3339),
		"watchdog": map[string]interface{}{
			"cpu_total":       wd.cpuTotal,
			"reserve_pct":     wd.reservePct,
			"usage_ratio":     wd.usageRatio,
			"world_cycle_sec": int(wd.worldCycle.Seconds()),
			// Inline func keeps a zero nextSampleAt out of the snapshot.
			"next_sample_at": func() string {
				if wd.nextSampleAt.IsZero() {
					return ""
				}
				return wd.nextSampleAt.UTC().Format(time.RFC3339)
			}(),
			"max_active":   maxActive,
			"max_heavy":    maxHeavy,
			"active":       wd.active,
			"active_heavy": wd.activeHeavy,
			"waiting":      len(waiting),
			"running":      len(running),
		},
		"running": running,
		"waiting": waiting,
	}
}
// writeQueueSnapshot persists snap as pretty-printed JSON to the configured
// queue path. Best-effort: every I/O error is deliberately ignored so that
// snapshot persistence can never fail the caller.
func (wd *commandWatchdog) writeQueueSnapshot(snap map[string]interface{}) {
	if wd == nil || snap == nil {
		return
	}
	// Copy the path out under the lock; the write itself happens unlocked.
	wd.mu.Lock()
	target := strings.TrimSpace(wd.queuePath)
	wd.mu.Unlock()
	if target == "" {
		return
	}
	data, err := json.MarshalIndent(snap, "", " ")
	if err != nil {
		return
	}
	_ = os.MkdirAll(filepath.Dir(target), 0755)
	_ = os.WriteFile(target, data, 0644)
}
// queueNonEmpty returns the whitespace-trimmed value of v, or fallback when
// trimming leaves nothing.
func queueNonEmpty(v, fallback string) string {
	if trimmed := strings.TrimSpace(v); trimmed != "" {
		return trimmed
	}
	return fallback
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// nextCommandTick derives the next watchdog polling interval for a command:
// the normalized base tick grows by one eighth of the elapsed runtime, never
// shrinks below the base, and is capped at maxCommandTick.
func nextCommandTick(baseTick, elapsed time.Duration) time.Duration {
	base := normalizeCommandTick(baseTick)
	if elapsed < 0 {
		elapsed = 0
	}
	candidate := base + elapsed/8
	switch {
	case candidate > maxCommandTick:
		return maxCommandTick
	case candidate < base:
		return base
	default:
		return candidate
	}
}
// normalizeCommandTick clamps baseTick into [minCommandTick, maxCommandTick].
func normalizeCommandTick(baseTick time.Duration) time.Duration {
	switch {
	case baseTick < minCommandTick:
		return minCommandTick
	case baseTick > maxCommandTick:
		return maxCommandTick
	default:
		return baseTick
	}
}
// commandDifficulty scores a shell command from 1 (trivial) to 4 (very heavy)
// via case-insensitive substring matching, so the scheduler can budget CPU
// slots accordingly.
func commandDifficulty(command string) int {
	cmd := strings.ToLower(strings.TrimSpace(command))
	if cmd == "" {
		return 1
	}
	matchesAny := func(patterns ...string) bool {
		for _, pattern := range patterns {
			if strings.Contains(cmd, pattern) {
				return true
			}
		}
		return false
	}
	switch {
	// 4: very heavy build / container graph.
	case matchesAny("docker build", "docker compose build", "bazel build", "gradle build", "mvn package"):
		return 4
	// 3: compile/test/install heavy workloads.
	case matchesAny("go test", "go build", "cargo build", "npm install", "npm ci", "pnpm install", "yarn install", "npm run build", "pnpm build", "yarn build"):
		return 3
	// 2: medium multi-step shell chains.
	case strings.Contains(cmd, "&&") || strings.Contains(cmd, "|"):
		return 2
	default:
		return 1
	}
}
// sampleSystemUsageRatio estimates system pressure as load1/cpu, clamped to
// [0, 0.95]. Returns 0 when the load average cannot be read.
func sampleSystemUsageRatio(cpu int) float64 {
	if cpu <= 0 {
		cpu = 1
	}
	load1, ok := readLoadAverage1()
	if !ok {
		return 0
	}
	switch ratio := load1 / float64(cpu); {
	case ratio < 0:
		return 0
	case ratio > 0.95:
		// Cap so the scheduler always keeps a sliver of headroom.
		return 0.95
	default:
		return ratio
	}
}
// readLoadAverage1 returns the 1-minute load average, trying /proc/loadavg
// (Linux) first, then `sysctl -n vm.loadavg` (macOS/BSD), and finally parsing
// `uptime` output. The bool result reports whether a value was obtained.
func readLoadAverage1() (float64, bool) {
	// Normalizes locale decimal commas before parsing; rejects negatives.
	parseLoad := func(s string) (float64, bool) {
		v, err := strconv.ParseFloat(strings.ReplaceAll(s, ",", "."), 64)
		return v, err == nil && v >= 0
	}

	// Linux fast path.
	if raw, err := os.ReadFile("/proc/loadavg"); err == nil {
		if fields := strings.Fields(strings.TrimSpace(string(raw))); len(fields) > 0 {
			if v, err := strconv.ParseFloat(fields[0], 64); err == nil && v >= 0 {
				return v, true
			}
		}
	}

	// macOS/BSD fallback: output looks like "{ 1.23 2.34 3.45 }".
	if out, err := runCommandOutputWithTimeout(300*time.Millisecond, "sysctl", "-n", "vm.loadavg"); err == nil {
		if fields := strings.Fields(strings.Trim(strings.TrimSpace(string(out)), "{}")); len(fields) > 0 {
			if v, ok := parseLoad(fields[0]); ok {
				return v, true
			}
		}
	}

	// Last resort: scrape the load average out of `uptime`.
	if out, err := runCommandOutputWithTimeout(300*time.Millisecond, "uptime"); err == nil {
		if m := reLoadAverage.FindStringSubmatch(strings.ToLower(string(out))); len(m) >= 2 {
			if v, ok := parseLoad(m[1]); ok {
				return v, true
			}
		}
	}
	return 0, false
}
// runCommandOutputWithTimeout runs name with args and returns its stdout,
// killing the process once timeout elapses. Non-positive timeouts fall back
// to a 300ms default.
func runCommandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
	const defaultTimeout = 300 * time.Millisecond
	if timeout <= 0 {
		timeout = defaultTimeout
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	return exec.CommandContext(ctx, name, args...).Output()
}

View File

@@ -0,0 +1,24 @@
package tools
import "testing"
// TestComputeDynamicActiveSlots_ReservesTwentyPercent checks that a 20% CPU
// reserve on a 10-CPU host leaves 8 active slots when the system is idle
// (usage ratio 0).
func TestComputeDynamicActiveSlots_ReservesTwentyPercent(t *testing.T) {
	got := computeDynamicActiveSlots(10, 0.20, 0.0, 12)
	if got != 8 {
		t.Fatalf("expected 8 active slots with 20%% reserve on 10 CPU, got %d", got)
	}
}
// TestComputeDynamicActiveSlots_ReducesWithSystemUsage checks that observed
// system usage (50% here) shrinks the slot budget below the reserve-only
// value.
func TestComputeDynamicActiveSlots_ReducesWithSystemUsage(t *testing.T) {
	got := computeDynamicActiveSlots(10, 0.20, 0.5, 12)
	if got != 3 {
		t.Fatalf("expected 3 active slots when system usage is 50%%, got %d", got)
	}
}
// TestComputeDynamicActiveSlots_AlwaysKeepsOne checks the floor: even under
// extreme system usage (95%) at least one active slot remains available.
func TestComputeDynamicActiveSlots_AlwaysKeepsOne(t *testing.T) {
	got := computeDynamicActiveSlots(8, 0.20, 0.95, 12)
	if got != 1 {
		t.Fatalf("expected at least 1 active slot under high system usage, got %d", got)
	}
}

View File

@@ -12,6 +12,7 @@ import (
type SendCallback func(channel, chatID, action, content, media, messageID, emoji string, buttons [][]bus.Button) error
type MessageTool struct {
mu sync.RWMutex
sendCallback SendCallback
defaultChannel string
defaultChatID string
@@ -104,11 +105,15 @@ func (t *MessageTool) Parameters() map[string]interface{} {
}
// SetContext records the default destination (channel + chat ID) applied when
// a send request does not specify an explicit target. Guarded by t.mu so it
// is safe to call while Execute runs concurrently.
func (t *MessageTool) SetContext(channel, chatID string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.defaultChannel = channel
	t.defaultChatID = chatID
}
// SetSendCallback installs the function used to deliver outbound messages.
// Guarded by t.mu so the callback can be swapped while sends are in flight.
func (t *MessageTool) SetSendCallback(callback SendCallback) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.sendCallback = callback
}
@@ -168,18 +173,24 @@ func (t *MessageTool) Execute(ctx context.Context, args map[string]interface{})
chatID = to
}
t.mu.RLock()
defaultChannel := t.defaultChannel
defaultChatID := t.defaultChatID
sendCallback := t.sendCallback
t.mu.RUnlock()
if channel == "" {
channel = t.defaultChannel
channel = defaultChannel
}
if chatID == "" {
chatID = t.defaultChatID
chatID = defaultChatID
}
if channel == "" || chatID == "" {
return "Error: No target channel/chat specified", nil
}
if t.sendCallback == nil {
if sendCallback == nil {
return "Error: Message sending not configured", nil
}
@@ -207,7 +218,7 @@ func (t *MessageTool) Execute(ctx context.Context, args map[string]interface{})
}
}
if err := t.sendCallback(channel, chatID, action, content, media, messageID, emoji, buttons); err != nil {
if err := sendCallback(channel, chatID, action, content, media, messageID, emoji, buttons); err != nil {
return fmt.Sprintf("Error sending message: %v", err), nil
}

View File

@@ -3,12 +3,14 @@ package tools
import (
"context"
"fmt"
"sync"
"time"
"clawgo/pkg/cron"
)
type RemindTool struct {
mu sync.RWMutex
cs *cron.CronService
defaultChannel string
defaultChatID string
@@ -19,6 +21,8 @@ func NewRemindTool(cs *cron.CronService) *RemindTool {
}
// SetContext records the default channel/chat used for reminders whose
// arguments do not override the destination. Guarded by t.mu for concurrent
// use with Execute.
func (t *RemindTool) SetContext(channel, chatID string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.defaultChannel = channel
	t.defaultChatID = chatID
}
@@ -43,6 +47,14 @@ func (t *RemindTool) Parameters() map[string]interface{} {
"type": "string",
"description": "When to remind (e.g., '10m', '1h', '2026-02-12 10:00')",
},
"channel": map[string]interface{}{
"type": "string",
"description": "Optional destination channel override",
},
"chat_id": map[string]interface{}{
"type": "string",
"description": "Optional destination chat ID override",
},
},
"required": []string{"message", "time_expr"},
}
@@ -63,6 +75,21 @@ func (t *RemindTool) Execute(ctx context.Context, args map[string]interface{}) (
return "", fmt.Errorf("time_expr is required")
}
channel, _ := args["channel"].(string)
chatID, _ := args["chat_id"].(string)
if channel == "" || chatID == "" {
t.mu.RLock()
defaultChannel := t.defaultChannel
defaultChatID := t.defaultChatID
t.mu.RUnlock()
if channel == "" {
channel = defaultChannel
}
if chatID == "" {
chatID = defaultChatID
}
}
// Try duration first (e.g., "10m", "1h30m")
if d, err := time.ParseDuration(timeExpr); err == nil {
at := time.Now().Add(d).UnixMilli()
@@ -70,7 +97,7 @@ func (t *RemindTool) Execute(ctx context.Context, args map[string]interface{}) (
Kind: "at",
AtMS: &at,
}
job, err := t.cs.AddJob("Reminder", schedule, message, true, t.defaultChannel, t.defaultChatID)
job, err := t.cs.AddJob("Reminder", schedule, message, true, channel, chatID)
if err != nil {
return "", fmt.Errorf("failed to schedule reminder: %w", err)
}
@@ -120,7 +147,7 @@ func (t *RemindTool) Execute(ctx context.Context, args map[string]interface{}) (
AtMS: &at,
}
job, err := t.cs.AddJob("Reminder", schedule, message, true, t.defaultChannel, t.defaultChatID)
job, err := t.cs.AddJob("Reminder", schedule, message, true, channel, chatID)
if err != nil {
return "", fmt.Errorf("failed to schedule reminder: %w", err)
}

View File

@@ -1,8 +1,8 @@
package tools
import (
"bytes"
"context"
"errors"
"fmt"
"os"
"os/exec"
@@ -16,12 +16,12 @@ import (
)
type ExecTool struct {
workingDir string
timeout time.Duration
sandboxEnabled bool
sandboxImage string
workingDir string
timeout time.Duration
sandboxEnabled bool
sandboxImage string
autoInstallMissing bool
procManager *ProcessManager
procManager *ProcessManager
}
func NewExecTool(cfg config.ShellConfig, workspace string, pm *ProcessManager) *ExecTool {
@@ -81,6 +81,11 @@ func (t *ExecTool) Execute(ctx context.Context, args map[string]interface{}) (st
cwd = wd
}
}
queueBase := strings.TrimSpace(t.workingDir)
if queueBase == "" {
queueBase = cwd
}
globalCommandWatchdog.setQueuePath(resolveCommandQueuePath(queueBase))
if bg, _ := args["background"].(bool); bg {
if t.procManager == nil {
@@ -112,26 +117,38 @@ func (t *ExecTool) executeInSandbox(ctx context.Context, command, cwd string) (s
t.sandboxImage,
"sh", "-c", command,
}
cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
defer cancel()
cmd := exec.CommandContext(cmdCtx, "docker", dockerArgs...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err := cmd.Run()
output := stdout.String()
if stderr.Len() > 0 {
output += "\nSTDERR:\n" + stderr.String()
policy := buildCommandRuntimePolicy(command, t.commandTickBase(command))
var merged strings.Builder
for attempt := 0; attempt <= policy.MaxRestarts; attempt++ {
cmd := exec.CommandContext(ctx, "docker", dockerArgs...)
var stdout, stderr trackedOutput
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err := runCommandWithDynamicTick(ctx, cmd, "exec:sandbox", command, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
return stdout.Len() + stderr.Len()
})
out := stdout.String()
if stderr.Len() > 0 {
out += "\nSTDERR:\n" + stderr.String()
}
if strings.TrimSpace(out) != "" {
if merged.Len() > 0 {
merged.WriteString("\n")
}
merged.WriteString(out)
}
if err == nil {
return merged.String(), nil
}
if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < policy.MaxRestarts {
merged.WriteString(fmt.Sprintf("\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
policy.StallRoundLimit, attempt+1, policy.MaxRestarts))
continue
}
merged.WriteString(fmt.Sprintf("\nSandbox Exit code: %v", err))
return merged.String(), nil
}
if err != nil {
output += fmt.Sprintf("\nSandbox Exit code: %v", err)
}
return output, nil
return merged.String(), nil
}
func (t *ExecTool) SetTimeout(timeout time.Duration) {
@@ -139,19 +156,13 @@ func (t *ExecTool) SetTimeout(timeout time.Duration) {
}
func (t *ExecTool) executeCommand(ctx context.Context, command, cwd string) (string, error) {
output, err, timedOut := t.runShellCommand(ctx, command, cwd)
if timedOut {
return fmt.Sprintf("Error: Command timed out after %v", t.timeout), nil
}
output, err := t.runShellCommand(ctx, command, cwd)
if err != nil && t.autoInstallMissing {
if missingCmd := detectMissingCommandFromOutput(output); missingCmd != "" {
if installLog, installed := t.tryAutoInstallMissingCommand(ctx, missingCmd, cwd); installed {
output += "\n[AUTO-INSTALL]\n" + installLog
retryOutput, retryErr, retryTimedOut := t.runShellCommand(ctx, command, cwd)
if retryTimedOut {
return fmt.Sprintf("Error: Command timed out after %v", t.timeout), nil
}
retryOutput, retryErr := t.runShellCommand(ctx, command, cwd)
output += "\n[RETRY]\n" + retryOutput
err = retryErr
}
@@ -173,32 +184,44 @@ func (t *ExecTool) executeCommand(ctx context.Context, command, cwd string) (str
return output, nil
}
func (t *ExecTool) runShellCommand(ctx context.Context, command, cwd string) (string, error, bool) {
cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
defer cancel()
cmd := exec.CommandContext(cmdCtx, "sh", "-c", command)
cmd.Env = buildExecEnv()
if cwd != "" {
cmd.Dir = cwd
}
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err := cmd.Run()
output := stdout.String()
if stderr.Len() > 0 {
output += "\nSTDERR:\n" + stderr.String()
}
if err != nil {
if cmdCtx.Err() == context.DeadlineExceeded {
return output, err, true
func (t *ExecTool) runShellCommand(ctx context.Context, command, cwd string) (string, error) {
policy := buildCommandRuntimePolicy(command, t.commandTickBase(command))
var merged strings.Builder
for attempt := 0; attempt <= policy.MaxRestarts; attempt++ {
cmd := exec.CommandContext(ctx, "sh", "-c", command)
cmd.Env = buildExecEnv()
if cwd != "" {
cmd.Dir = cwd
}
var stdout, stderr trackedOutput
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err := runCommandWithDynamicTick(ctx, cmd, "exec", command, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
return stdout.Len() + stderr.Len()
})
out := stdout.String()
if stderr.Len() > 0 {
out += "\nSTDERR:\n" + stderr.String()
}
if strings.TrimSpace(out) != "" {
if merged.Len() > 0 {
merged.WriteString("\n")
}
merged.WriteString(out)
}
if err == nil {
return merged.String(), nil
}
if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < policy.MaxRestarts {
merged.WriteString(fmt.Sprintf("\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
policy.StallRoundLimit, attempt+1, policy.MaxRestarts))
continue
}
return merged.String(), err
}
return output, err, false
return merged.String(), nil
}
func buildExecEnv() []string {
@@ -212,6 +235,70 @@ func buildExecEnv() []string {
return append(env, "PATH="+current+":"+fallback)
}
// commandTickBase chooses the initial watchdog tick for a command: 2s for
// light commands, 4s for known-heavy ones. The configured timeout, when set,
// only stretches the tick (timeout/30) as a pacing hint — it is never used
// as a kill deadline here. The result is capped at 12s.
func (t *ExecTool) commandTickBase(command string) time.Duration {
	const (
		lightBase = 2 * time.Second
		heavyBase = 4 * time.Second
		maxBase   = 12 * time.Second
	)
	base := lightBase
	if isHeavyCommand(command) {
		base = heavyBase
	}
	// Reuse configured timeout as a pacing hint (not a kill deadline).
	if t.timeout > 0 {
		if derived := t.timeout / 30; derived > base {
			base = derived
		}
	}
	if base > maxBase {
		return maxBase
	}
	return base
}
// resolveCommandQueuePath maps a working directory to the task-queue snapshot
// file <abs(cwd)>/memory/task_queue.json. A blank cwd falls back to the
// process working directory; an empty string is returned when no directory
// can be resolved.
func resolveCommandQueuePath(cwd string) string {
	dir := strings.TrimSpace(cwd)
	if dir == "" {
		if wd, err := os.Getwd(); err == nil {
			dir = wd
		}
	}
	if dir == "" {
		return ""
	}
	abs, err := filepath.Abs(dir)
	if err != nil {
		return ""
	}
	return filepath.Join(abs, "memory", "task_queue.json")
}
// isHeavyCommand reports whether command matches a known expensive
// build/test/install invocation, using case-insensitive substring matching.
func isHeavyCommand(command string) bool {
	cmd := strings.ToLower(strings.TrimSpace(command))
	if cmd == "" {
		return false
	}
	for _, pattern := range []string{
		"docker build",
		"docker compose build",
		"go build",
		"go test",
		"npm install",
		"npm ci",
		"npm run build",
		"pnpm install",
		"pnpm build",
		"yarn install",
		"yarn build",
		"cargo build",
		"mvn package",
		"gradle build",
	} {
		if strings.Contains(cmd, pattern) {
			return true
		}
	}
	return false
}
func detectMissingCommandFromOutput(output string) string {
patterns := []*regexp.Regexp{
regexp.MustCompile(`(?m)(?:^|[:\s])([a-zA-Z0-9._+-]+): not found`),
@@ -278,19 +365,8 @@ func (t *ExecTool) tryAutoInstallMissingCommand(ctx context.Context, commandName
return fmt.Sprintf("No supported package manager found to install missing command: %s", name), false
}
timeout := 5 * time.Minute
if t.timeout > 0 && t.timeout < timeout {
timeout = t.timeout
}
for _, installCmd := range candidates {
installCtx, cancel := context.WithTimeout(ctx, timeout)
output, err, timedOut := t.runShellCommand(installCtx, installCmd, cwd)
cancel()
if timedOut {
continue
}
output, err := t.runShellCommand(ctx, installCmd, cwd)
if err == nil && commandExists(name) {
return fmt.Sprintf("Installed %s using: %s\n%s", name, installCmd, output), true
}

View File

@@ -0,0 +1,48 @@
package tools
import (
"testing"
"time"
)
// TestIsHeavyCommand exercises the heavy-command classifier over a table of
// representative build commands and one light command.
func TestIsHeavyCommand(t *testing.T) {
	tests := []struct {
		command string
		heavy   bool
	}{
		{command: "docker build -t app .", heavy: true},
		{command: "docker compose build api", heavy: true},
		{command: "go test ./...", heavy: true},
		{command: "npm run build", heavy: true},
		{command: "echo hello", heavy: false},
	}
	for _, tt := range tests {
		if got := isHeavyCommand(tt.command); got != tt.heavy {
			t.Fatalf("isHeavyCommand(%q)=%v want %v", tt.command, got, tt.heavy)
		}
	}
}
// TestCommandTickBase checks that a heavy command starts with a larger base
// tick than a light one on a zero-value ExecTool (no configured timeout).
func TestCommandTickBase(t *testing.T) {
	light := (&ExecTool{}).commandTickBase("echo hello")
	heavy := (&ExecTool{}).commandTickBase("docker build -t app .")
	if heavy <= light {
		t.Fatalf("expected heavy command base tick > light, got heavy=%v light=%v", heavy, light)
	}
}
// TestNextCommandTick checks the three properties of the tick schedule:
// it never shrinks below the base, it grows with elapsed runtime, and it is
// capped at an upper bound.
func TestNextCommandTick(t *testing.T) {
	base := 2 * time.Second
	t1 := nextCommandTick(base, 30*time.Second)
	t2 := nextCommandTick(base, 5*time.Minute)
	if t1 < base {
		t.Fatalf("tick should not shrink below base: %v", t1)
	}
	if t2 <= t1 {
		t.Fatalf("tick should grow with elapsed time: t1=%v t2=%v", t1, t2)
	}
	if t2 > 45*time.Second {
		t.Fatalf("tick should be capped, got %v", t2)
	}
}

View File

@@ -2,6 +2,7 @@ package tools
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
@@ -45,8 +46,8 @@ func (t *SkillExecTool) Parameters() map[string]interface{} {
},
"timeout_sec": map[string]interface{}{
"type": "integer",
"default": 60,
"description": "Execution timeout in seconds",
"default": 0,
"description": "Deprecated. No hard timeout is enforced.",
},
"reason": map[string]interface{}{
"type": "string",
@@ -70,10 +71,8 @@ func (t *SkillExecTool) Execute(ctx context.Context, args map[string]interface{}
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
}
timeoutSec := 60
if raw, ok := args["timeout_sec"].(float64); ok && raw > 0 {
timeoutSec = int(raw)
if strings.TrimSpace(t.workspace) != "" {
globalCommandWatchdog.setQueuePath(filepath.Join(strings.TrimSpace(t.workspace), "memory", "task_queue.json"))
}
skillDir, err := t.resolveSkillDir(skill)
@@ -115,22 +114,55 @@ func (t *SkillExecTool) Execute(ctx context.Context, args map[string]interface{}
}
}
runCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSec)*time.Second)
defer cancel()
cmd, err := buildSkillCommand(runCtx, scriptPath, cmdArgs)
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
commandLabel := relScript
if len(cmdArgs) > 0 {
commandLabel += " " + strings.Join(cmdArgs, " ")
}
cmd.Dir = skillDir
output, err := cmd.CombinedOutput()
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", fmt.Errorf("skill execution failed: %w\n%s", err, string(output))
policy := buildCommandRuntimePolicy(commandLabel, 2*time.Second)
var merged strings.Builder
var runErr error
for attempt := 0; attempt <= policy.MaxRestarts; attempt++ {
cmd, err := buildSkillCommand(ctx, scriptPath, cmdArgs)
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
}
cmd.Dir = skillDir
var stdout, stderr trackedOutput
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err = runCommandWithDynamicTick(ctx, cmd, "skill_exec", commandLabel, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
return stdout.Len() + stderr.Len()
})
out := stdout.String()
if stderr.Len() > 0 {
out += "\nSTDERR:\n" + stderr.String()
}
if strings.TrimSpace(out) != "" {
if merged.Len() > 0 {
merged.WriteString("\n")
}
merged.WriteString(out)
}
if err == nil {
runErr = nil
break
}
runErr = err
if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < policy.MaxRestarts {
merged.WriteString(fmt.Sprintf("\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
policy.StallRoundLimit, attempt+1, policy.MaxRestarts))
continue
}
break
}
output := merged.String()
if runErr != nil {
t.writeAudit(skill, script, reason, false, runErr.Error())
return "", fmt.Errorf("skill execution failed: %w\n%s", runErr, output)
}
out := strings.TrimSpace(string(output))
out := strings.TrimSpace(output)
if out == "" {
out = "(no output)"
}

View File

@@ -3,9 +3,11 @@ package tools
import (
"context"
"fmt"
"sync"
)
type SpawnTool struct {
mu sync.RWMutex
manager *SubagentManager
originChannel string
originChatID string
@@ -51,12 +53,22 @@ func (t *SpawnTool) Parameters() map[string]interface{} {
"type": "string",
"description": "Optional task ID under the pipeline",
},
"channel": map[string]interface{}{
"type": "string",
"description": "Optional origin channel override",
},
"chat_id": map[string]interface{}{
"type": "string",
"description": "Optional origin chat ID override",
},
},
"required": []string{"task"},
}
}
// SetContext records the origin channel/chat used as the fallback when a
// spawn request does not override them. Guarded by t.mu for concurrent use
// with Execute.
func (t *SpawnTool) SetContext(channel, chatID string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.originChannel = channel
	t.originChatID = chatID
}
@@ -79,7 +91,22 @@ func (t *SpawnTool) Execute(ctx context.Context, args map[string]interface{}) (s
return "Error: Subagent manager not configured", nil
}
result, err := t.manager.Spawn(ctx, task, label, t.originChannel, t.originChatID, pipelineID, taskID)
originChannel, _ := args["channel"].(string)
originChatID, _ := args["chat_id"].(string)
if originChannel == "" || originChatID == "" {
t.mu.RLock()
defaultChannel := t.originChannel
defaultChatID := t.originChatID
t.mu.RUnlock()
if originChannel == "" {
originChannel = defaultChannel
}
if originChatID == "" {
originChatID = defaultChatID
}
}
result, err := t.manager.Spawn(ctx, task, label, originChannel, originChatID, pipelineID, taskID)
if err != nil {
return "", fmt.Errorf("failed to spawn subagent: %w", err)
}

View File

@@ -0,0 +1,43 @@
package tools
import (
"bytes"
"sync"
"sync/atomic"
)
// trackedOutput is a goroutine-safe buffer whose size can be polled cheaply:
// writes are serialized under a mutex while the byte count is mirrored in an
// atomic counter, so progress checks can call Len without blocking writers.
// All methods tolerate a nil receiver.
type trackedOutput struct {
	mu   sync.Mutex   // guards buf
	buf  bytes.Buffer // accumulated output
	size atomic.Int64 // total bytes written; readable without taking mu
}

// Write appends p to the buffer and bumps the size counter. A nil receiver
// silently discards the data and reports success.
func (t *trackedOutput) Write(p []byte) (int, error) {
	if t == nil {
		return 0, nil
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	n, err := t.buf.Write(p)
	if n > 0 {
		t.size.Add(int64(n))
	}
	return n, err
}

// Len returns the total number of bytes written so far, without locking.
func (t *trackedOutput) Len() int {
	if t == nil {
		return 0
	}
	return int(t.size.Load())
}

// String returns a copy of everything written so far.
func (t *trackedOutput) String() string {
	if t == nil {
		return ""
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.buf.String()
}

View File

@@ -119,7 +119,7 @@ const TaskAudit: React.FC = () => {
if (!ok) return;
}
try {
const url = `/webui/api/task_queue${q}`;
const url = `/webui/api/task_audit${q}`;
const r = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ action, task_id: selected.task_id }) });
if (!r.ok) throw new Error(await r.text());
await fetchData();
@@ -129,6 +129,7 @@ const TaskAudit: React.FC = () => {
};
const selectedPretty = useMemo(() => selected ? JSON.stringify(selected, null, 2) : '', [selected]);
const selectedReadonly = selected?.source === 'command_watchdog';
return (
<div className="h-full p-4 md:p-6 flex flex-col gap-4">
@@ -140,6 +141,7 @@ const TaskAudit: React.FC = () => {
<option value="autonomy">{t('sourceAutonomy')}</option>
<option value="direct">{t('sourceDirect')}</option>
<option value="memory_todo">{t('sourceMemoryTodo')}</option>
<option value="command_watchdog">command_watchdog</option>
<option value="-">-</option>
</select>
<select value={statusFilter} onChange={(e)=>setStatusFilter(e.target.value)} className="bg-zinc-900 border border-zinc-700 rounded px-2 py-1 text-xs">
@@ -195,13 +197,16 @@ const TaskAudit: React.FC = () => {
<div className="border border-zinc-800 rounded-xl bg-zinc-900/40 overflow-hidden flex flex-col min-h-0">
<div className="px-3 py-2 border-b border-zinc-800 text-xs text-zinc-400 uppercase tracking-wider">{t('taskDetail')}</div>
<div className="p-4 overflow-y-auto min-h-0 space-y-3 text-sm">
{selected && (
{selected && !selectedReadonly && (
<div className="flex items-center gap-2 flex-wrap">
<button onClick={()=>taskAction('pause')} className="px-2 py-1 text-xs rounded bg-amber-700/70 hover:bg-amber-600">{t('pauseTask')}</button>
<button onClick={()=>taskAction('retry')} className="px-2 py-1 text-xs rounded bg-indigo-700/70 hover:bg-indigo-600">{t('retryTask')}</button>
<button onClick={()=>taskAction('complete')} className="px-2 py-1 text-xs rounded bg-emerald-700/70 hover:bg-emerald-600">{t('completeTask')}</button>
<button onClick={()=>taskAction('ignore')} className="px-2 py-1 text-xs rounded bg-zinc-700 hover:bg-zinc-600">{t('ignoreTask')}</button>
</div>
)}
{selectedReadonly && (
<div className="text-xs text-zinc-400">{t('source')}: command_watchdogreadonly</div>
)}
{!selected ? (
<div className="text-zinc-500">{t('selectTask')}</div>