Files
clawgo/pkg/tools/task_watchdog.go
2026-03-07 12:27:34 +08:00

887 lines
20 KiB
Go

package tools
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
)
const (
minCommandTick = 1 * time.Second
maxCommandTick = 45 * time.Second
watchdogTick = 1 * time.Second
minWorldCycle = 10 * time.Second
maxWorldCycle = 60 * time.Second
)
var ErrCommandNoProgress = errors.New("command no progress across tick rounds")
var ErrTaskWatchdogTimeout = errors.New("task watchdog timeout exceeded")
type commandRuntimePolicy struct {
BaseTick time.Duration
StallRoundLimit int
MaxRestarts int
Difficulty int
}
func buildCommandRuntimePolicy(command string, baseTick time.Duration) commandRuntimePolicy {
diff := commandDifficulty(command)
cpu := runtime.NumCPU()
// Baseline: kill/restart after 5 unchanged-progress ticks.
stallLimit := 5
// Difficulty adjustment (1..4) => +0..6 rounds.
stallLimit += (diff - 1) * 2
// Hardware adjustment: weaker CPU gets more patience.
switch {
case cpu <= 4:
stallLimit += 5
case cpu <= 8:
stallLimit += 3
case cpu <= 16:
stallLimit += 1
}
if stallLimit < 5 {
stallLimit = 5
}
if stallLimit > 24 {
stallLimit = 24
}
// Restart budget: heavier tasks and weaker CPUs allow extra retries.
restarts := 1
if diff >= 3 {
restarts++
}
if cpu <= 4 {
restarts++
}
if restarts > 3 {
restarts = 3
}
return commandRuntimePolicy{
BaseTick: normalizeCommandTick(baseTick),
StallRoundLimit: stallLimit,
MaxRestarts: restarts,
Difficulty: diff,
}
}
type commandWatchdog struct {
mu sync.Mutex
watches map[uint64]*watchedCommand
waiters []*watchWaiter
nextID uint64
cpuTotal int
baseActive int
baseHeavy int
reservePct float64
usageRatio float64
lastSample time.Time
worldCycle time.Duration
nextSampleAt time.Time
active int
activeHeavy int
queueLimit int
queuePath string
}
type watchedCommand struct {
id uint64
cmd *exec.Cmd
startedAt time.Time
baseTick time.Duration
stallRoundLimit int
nextCheckAt time.Time
lastProgress int
stalledRounds int
progressFn func() int
stallNotify chan int
heavy bool
source string
label string
}
type stalledCommand struct {
cmd *exec.Cmd
rounds int
notify chan int
}
type watchWaiter struct {
id uint64
heavy bool
ready chan struct{}
source string
label string
enqueuedAt time.Time
}
var globalCommandWatchdog = newCommandWatchdog()
var reLoadAverage = regexp.MustCompile(`load averages?:\s*([0-9]+(?:[.,][0-9]+)?)`)
func newCommandWatchdog() *commandWatchdog {
cpu := runtime.NumCPU()
baseActive, baseHeavy, queueLimit := deriveWatchdogLimits(cpu)
wd := &commandWatchdog{
watches: make(map[uint64]*watchedCommand),
waiters: make([]*watchWaiter, 0, queueLimit),
cpuTotal: cpu,
baseActive: baseActive,
baseHeavy: baseHeavy,
reservePct: 0.20,
usageRatio: 0,
worldCycle: 20 * time.Second,
queueLimit: queueLimit,
}
go wd.loop()
return wd
}
func deriveWatchdogLimits(cpu int) (maxActive, maxHeavy, queueLimit int) {
if cpu <= 0 {
cpu = 2
}
maxActive = cpu
if maxActive < 2 {
maxActive = 2
}
if maxActive > 12 {
maxActive = 12
}
maxHeavy = cpu/4 + 1
if maxHeavy < 1 {
maxHeavy = 1
}
if maxHeavy > 4 {
maxHeavy = 4
}
queueLimit = maxActive * 8
if queueLimit < 16 {
queueLimit = 16
}
return
}
func (wd *commandWatchdog) loop() {
ticker := time.NewTicker(watchdogTick)
defer ticker.Stop()
for now := range ticker.C {
wd.refreshSystemUsage(now)
wd.tick(now)
}
}
func (wd *commandWatchdog) refreshSystemUsage(now time.Time) {
if wd == nil {
return
}
wd.mu.Lock()
if wd.nextSampleAt.IsZero() {
wd.nextSampleAt = now
}
if now.Before(wd.nextSampleAt) {
wd.mu.Unlock()
return
}
wd.lastSample = now
cpu := wd.cpuTotal
cycle := wd.computeWorldCycleLocked()
wd.worldCycle = cycle
wd.nextSampleAt = now.Add(cycle)
wd.mu.Unlock()
usage := sampleSystemUsageRatio(cpu)
wd.mu.Lock()
wd.usageRatio = usage
wd.mu.Unlock()
}
func (wd *commandWatchdog) computeWorldCycleLocked() time.Duration {
if wd == nil {
return 20 * time.Second
}
// Game-world style cycle:
// base=20s; busier world => shorter cycle; idle world => longer cycle.
cycle := 20 * time.Second
pending := len(wd.waiters)
if pending > 0 {
cycle -= time.Duration(minInt(pending, 8)) * time.Second
}
if wd.active > wd.baseActive/2 {
cycle -= 3 * time.Second
}
if wd.active == 0 && pending == 0 {
cycle += 10 * time.Second
}
if cycle < minWorldCycle {
cycle = minWorldCycle
}
if cycle > maxWorldCycle {
cycle = maxWorldCycle
}
return cycle
}
func (wd *commandWatchdog) register(cmd *exec.Cmd, baseTick time.Duration, stallRoundLimit int, progressFn func() int, stallNotify chan int, heavy bool, source, label string) func() {
if wd == nil || cmd == nil {
return func() {}
}
base := normalizeCommandTick(baseTick)
id := atomic.AddUint64(&wd.nextID, 1)
w := &watchedCommand{
id: id,
cmd: cmd,
startedAt: time.Now(),
baseTick: base,
stallRoundLimit: stallRoundLimit,
nextCheckAt: time.Now().Add(base),
lastProgress: safeProgress(progressFn),
progressFn: progressFn,
stallNotify: stallNotify,
heavy: heavy,
source: strings.TrimSpace(source),
label: strings.TrimSpace(label),
}
wd.mu.Lock()
wd.watches[id] = w
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
var once sync.Once
return func() {
once.Do(func() {
wd.mu.Lock()
delete(wd.watches, id)
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
})
}
}
func (wd *commandWatchdog) setQueuePath(path string) {
if wd == nil {
return
}
path = strings.TrimSpace(path)
if path != "" {
path = filepath.Clean(path)
}
wd.mu.Lock()
changed := wd.queuePath != path
wd.queuePath = path
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
if changed {
wd.writeQueueSnapshot(snap)
}
}
func (wd *commandWatchdog) acquireSlot(ctx context.Context, heavy bool, source, label string) (func(), error) {
if wd == nil {
return func() {}, nil
}
if ctx == nil {
ctx = context.Background()
}
wd.mu.Lock()
if wd.canAcquireSlotLocked(heavy) {
wd.grantSlotLocked(heavy)
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
return wd.releaseSlotFunc(heavy), nil
}
// Queue when slots are full; wait until a slot is available or context cancels.
waitID := atomic.AddUint64(&wd.nextID, 1)
w := &watchWaiter{
id: waitID,
heavy: heavy,
ready: make(chan struct{}, 1),
source: strings.TrimSpace(source),
label: strings.TrimSpace(label),
enqueuedAt: time.Now(),
}
wd.waiters = append(wd.waiters, w)
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
for {
select {
case <-ctx.Done():
wd.mu.Lock()
wd.removeWaiterLocked(waitID)
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
return nil, ctx.Err()
case <-w.ready:
return wd.releaseSlotFunc(heavy), nil
}
}
}
func (wd *commandWatchdog) releaseSlotFunc(heavy bool) func() {
var once sync.Once
return func() {
once.Do(func() {
wd.mu.Lock()
if wd.active > 0 {
wd.active--
}
if heavy && wd.activeHeavy > 0 {
wd.activeHeavy--
}
wd.scheduleWaitersLocked()
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
wd.writeQueueSnapshot(snap)
})
}
}
func (wd *commandWatchdog) canAcquireSlotLocked(heavy bool) bool {
maxActive, maxHeavy := wd.dynamicLimitsLocked()
if wd.active >= maxActive {
return false
}
if heavy && wd.activeHeavy >= maxHeavy {
return false
}
return true
}
func (wd *commandWatchdog) grantSlotLocked(heavy bool) {
wd.active++
if heavy {
wd.activeHeavy++
}
}
func (wd *commandWatchdog) dynamicLimitsLocked() (maxActive, maxHeavy int) {
if wd == nil {
return 1, 1
}
maxActive = computeDynamicActiveSlots(wd.cpuTotal, wd.reservePct, wd.usageRatio, wd.baseActive)
maxHeavy = computeDynamicHeavySlots(maxActive, wd.baseHeavy)
return
}
func computeDynamicActiveSlots(cpu int, reservePct, usageRatio float64, baseActive int) int {
if cpu <= 0 {
cpu = 1
}
if reservePct <= 0 {
reservePct = 0.20
}
if reservePct > 0.90 {
reservePct = 0.90
}
if usageRatio < 0 {
usageRatio = 0
}
if usageRatio > 0.95 {
usageRatio = 0.95
}
headroom := 1.0 - reservePct - usageRatio
if headroom < 0 {
headroom = 0
}
maxActive := int(float64(cpu) * headroom)
if maxActive < 1 {
maxActive = 1
}
if baseActive > 0 && maxActive > baseActive {
maxActive = baseActive
}
return maxActive
}
func computeDynamicHeavySlots(maxActive, baseHeavy int) int {
if maxActive <= 0 {
return 1
}
maxHeavy := maxActive/2 + 1
if maxHeavy < 1 {
maxHeavy = 1
}
if baseHeavy > 0 && maxHeavy > baseHeavy {
maxHeavy = baseHeavy
}
if maxHeavy > maxActive {
maxHeavy = maxActive
}
return maxHeavy
}
func (wd *commandWatchdog) scheduleWaitersLocked() {
if len(wd.waiters) == 0 {
return
}
for {
progress := false
for i := 0; i < len(wd.waiters); {
w := wd.waiters[i]
if w == nil {
wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
progress = true
continue
}
if !wd.canAcquireSlotLocked(w.heavy) {
i++
continue
}
wd.grantSlotLocked(w.heavy)
wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
select {
case w.ready <- struct{}{}:
default:
}
progress = true
}
if !progress {
break
}
}
}
func (wd *commandWatchdog) removeWaiterLocked(id uint64) {
if id == 0 || len(wd.waiters) == 0 {
return
}
for i, w := range wd.waiters {
if w == nil || w.id != id {
continue
}
wd.waiters = append(wd.waiters[:i], wd.waiters[i+1:]...)
return
}
}
func (wd *commandWatchdog) tick(now time.Time) {
if wd == nil {
return
}
toStall := make([]stalledCommand, 0, 4)
changed := false
wd.mu.Lock()
for id, w := range wd.watches {
if w == nil {
delete(wd.watches, id)
changed = true
continue
}
if now.Before(w.nextCheckAt) {
continue
}
cur := safeProgress(w.progressFn)
if cur > w.lastProgress {
w.lastProgress = cur
w.stalledRounds = 0
} else {
w.stalledRounds++
changed = true
if w.stallRoundLimit > 0 && w.stalledRounds >= w.stallRoundLimit {
delete(wd.watches, id)
changed = true
toStall = append(toStall, stalledCommand{
cmd: w.cmd,
rounds: w.stalledRounds,
notify: w.stallNotify,
})
continue
}
}
next := nextCommandTick(w.baseTick, now.Sub(w.startedAt))
w.nextCheckAt = now.Add(next)
changed = true
}
snap := wd.buildQueueSnapshotLocked()
wd.mu.Unlock()
if changed {
wd.writeQueueSnapshot(snap)
}
for _, st := range toStall {
if st.cmd != nil && st.cmd.Process != nil {
_ = st.cmd.Process.Kill()
}
if st.notify != nil {
select {
case st.notify <- st.rounds:
default:
}
}
}
}
func safeProgress(progressFn func() int) (progress int) {
if progressFn == nil {
return 0
}
defer func() {
if recover() != nil {
progress = 0
}
}()
progress = progressFn()
if progress < 0 {
return 0
}
return progress
}
func runCommandWithDynamicTick(ctx context.Context, cmd *exec.Cmd, source, label string, difficulty int, baseTick time.Duration, stallRoundLimit int, progressFn func() int) error {
base := normalizeCommandTick(baseTick)
heavy := difficulty >= 3
releaseSlot, err := globalCommandWatchdog.acquireSlot(ctx, heavy, source, label)
if err != nil {
return err
}
defer releaseSlot()
if err := cmd.Start(); err != nil {
return err
}
done := make(chan error, 1)
go func() { done <- cmd.Wait() }()
stallNotify := make(chan int, 1)
unwatch := globalCommandWatchdog.register(cmd, base, stallRoundLimit, progressFn, stallNotify, heavy, source, label)
defer unwatch()
for {
select {
case err := <-done:
return err
case stalledRounds := <-stallNotify:
select {
case err := <-done:
return fmt.Errorf("%w: %d ticks without progress (%v)", ErrCommandNoProgress, stalledRounds, err)
case <-time.After(2 * time.Second):
return fmt.Errorf("%w: %d ticks without progress", ErrCommandNoProgress, stalledRounds)
}
case <-ctx.Done():
if cmd.Process != nil {
_ = cmd.Process.Kill()
}
select {
case err := <-done:
if err != nil {
return err
}
case <-time.After(2 * time.Second):
}
return ctx.Err()
}
}
}
type stringTaskResult struct {
output string
err error
}
type stringTaskWatchdogOptions struct {
ProgressFn func() int
CanExtend func() bool
}
// runStringTaskWithTaskWatchdog executes a string-returning task with the same
// tick pacing as the command watchdog, but only times out after a full timeout
// window without observable progress or an allowed extension signal.
func runStringTaskWithTaskWatchdog(
ctx context.Context,
timeoutSec int,
baseTick time.Duration,
opts stringTaskWatchdogOptions,
run func(context.Context) (string, error),
) (string, error) {
if run == nil {
return "", fmt.Errorf("run function is nil")
}
if timeoutSec <= 0 {
return run(ctx)
}
if ctx == nil {
ctx = context.Background()
}
timeout := time.Duration(timeoutSec) * time.Second
lastProgressAt := time.Now()
lastProgress := safeProgress(opts.ProgressFn)
tick := normalizeCommandTick(baseTick)
if tick <= 0 {
tick = 2 * time.Second
}
runCtx, cancel := context.WithCancel(ctx)
defer cancel()
done := make(chan stringTaskResult, 1)
go func() {
out, err := run(runCtx)
done <- stringTaskResult{output: out, err: err}
}()
timer := time.NewTimer(tick)
defer timer.Stop()
for {
select {
case <-ctx.Done():
cancel()
return "", ctx.Err()
case res := <-done:
return res.output, res.err
case <-timer.C:
if cur := safeProgress(opts.ProgressFn); cur > lastProgress {
lastProgress = cur
lastProgressAt = time.Now()
}
stalledFor := time.Since(lastProgressAt)
if stalledFor >= timeout {
if opts.CanExtend != nil && opts.CanExtend() {
lastProgressAt = time.Now()
stalledFor = 0
} else {
cancel()
select {
case res := <-done:
if res.err != nil {
return "", fmt.Errorf("%w: %v", ErrTaskWatchdogTimeout, res.err)
}
case <-time.After(2 * time.Second):
}
return "", fmt.Errorf("%w: %ds", ErrTaskWatchdogTimeout, timeoutSec)
}
}
next := nextCommandTick(tick, stalledFor)
if next <= 0 {
next = tick
}
timer.Reset(next)
}
}
}
func (wd *commandWatchdog) buildQueueSnapshotLocked() map[string]interface{} {
if wd == nil {
return nil
}
maxActive, maxHeavy := wd.dynamicLimitsLocked()
running := make([]map[string]interface{}, 0, len(wd.watches))
for _, w := range wd.watches {
if w == nil {
continue
}
running = append(running, map[string]interface{}{
"id": w.id,
"source": queueNonEmpty(w.source, "exec"),
"label": w.label,
"heavy": w.heavy,
"status": "running",
"started_at": w.startedAt.UTC().Format(time.RFC3339),
"next_check_at": w.nextCheckAt.UTC().Format(time.RFC3339),
"stalled_rounds": w.stalledRounds,
"stall_round_limit": w.stallRoundLimit,
"last_progress": w.lastProgress,
})
}
waiting := make([]map[string]interface{}, 0, len(wd.waiters))
for _, w := range wd.waiters {
if w == nil {
continue
}
waiting = append(waiting, map[string]interface{}{
"id": w.id,
"source": queueNonEmpty(w.source, "exec"),
"label": w.label,
"heavy": w.heavy,
"status": "waiting",
"enqueued_at": w.enqueuedAt.UTC().Format(time.RFC3339),
})
}
return map[string]interface{}{
"time": time.Now().UTC().Format(time.RFC3339),
"watchdog": map[string]interface{}{
"cpu_total": wd.cpuTotal,
"reserve_pct": wd.reservePct,
"usage_ratio": wd.usageRatio,
"world_cycle_sec": int(wd.worldCycle.Seconds()),
"next_sample_at": func() string {
if wd.nextSampleAt.IsZero() {
return ""
}
return wd.nextSampleAt.UTC().Format(time.RFC3339)
}(),
"max_active": maxActive,
"max_heavy": maxHeavy,
"active": wd.active,
"active_heavy": wd.activeHeavy,
"waiting": len(waiting),
"running": len(running),
},
"running": running,
"waiting": waiting,
}
}
func (wd *commandWatchdog) writeQueueSnapshot(snap map[string]interface{}) {
if wd == nil || snap == nil {
return
}
wd.mu.Lock()
path := strings.TrimSpace(wd.queuePath)
wd.mu.Unlock()
if path == "" {
return
}
raw, err := json.MarshalIndent(snap, "", " ")
if err != nil {
return
}
_ = os.MkdirAll(filepath.Dir(path), 0755)
_ = os.WriteFile(path, raw, 0644)
}
func queueNonEmpty(v, fallback string) string {
v = strings.TrimSpace(v)
if v == "" {
return fallback
}
return v
}
func minInt(a, b int) int {
if a < b {
return a
}
return b
}
func nextCommandTick(baseTick, elapsed time.Duration) time.Duration {
base := normalizeCommandTick(baseTick)
if elapsed < 0 {
elapsed = 0
}
next := base + elapsed/8
if next > maxCommandTick {
return maxCommandTick
}
if next < base {
return base
}
return next
}
func normalizeCommandTick(baseTick time.Duration) time.Duration {
if baseTick < minCommandTick {
return minCommandTick
}
if baseTick > maxCommandTick {
return maxCommandTick
}
return baseTick
}
func commandDifficulty(command string) int {
cmd := strings.ToLower(strings.TrimSpace(command))
if cmd == "" {
return 1
}
// 4: very heavy build / container graph.
for _, p := range []string{"docker build", "docker compose build", "bazel build", "gradle build", "mvn package"} {
if strings.Contains(cmd, p) {
return 4
}
}
// 3: compile/test/install heavy workloads.
for _, p := range []string{"go test", "go build", "cargo build", "npm install", "npm ci", "pnpm install", "yarn install", "npm run build", "pnpm build", "yarn build"} {
if strings.Contains(cmd, p) {
return 3
}
}
// 2: medium multi-step shell chains.
if strings.Contains(cmd, "&&") || strings.Contains(cmd, "|") {
return 2
}
return 1
}
func sampleSystemUsageRatio(cpu int) float64 {
if cpu <= 0 {
cpu = 1
}
load1, ok := readLoadAverage1()
if !ok {
return 0
}
ratio := load1 / float64(cpu)
if ratio < 0 {
return 0
}
if ratio > 0.95 {
return 0.95
}
return ratio
}
func readLoadAverage1() (float64, bool) {
// Linux fast path.
if b, err := os.ReadFile("/proc/loadavg"); err == nil {
fields := strings.Fields(strings.TrimSpace(string(b)))
if len(fields) > 0 {
if v, err := strconv.ParseFloat(fields[0], 64); err == nil && v >= 0 {
return v, true
}
}
}
// macOS/BSD fallback.
if out, err := runCommandOutputWithTimeout(300*time.Millisecond, "sysctl", "-n", "vm.loadavg"); err == nil {
fields := strings.Fields(strings.Trim(strings.TrimSpace(string(out)), "{}"))
if len(fields) > 0 {
if v, err := strconv.ParseFloat(strings.ReplaceAll(fields[0], ",", "."), 64); err == nil && v >= 0 {
return v, true
}
}
}
if out, err := runCommandOutputWithTimeout(300*time.Millisecond, "uptime"); err == nil {
m := reLoadAverage.FindStringSubmatch(strings.ToLower(string(out)))
if len(m) >= 2 {
if v, err := strconv.ParseFloat(strings.ReplaceAll(m[1], ",", "."), 64); err == nil && v >= 0 {
return v, true
}
}
}
return 0, false
}
func runCommandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
if timeout <= 0 {
timeout = 300 * time.Millisecond
}
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
return exec.CommandContext(ctx, name, args...).Output()
}