feat: harden concurrency scheduling and task watchdog

2026-05-29 04:27:28 +08:00 · 2026-03-05 11:32:06 +08:00
parent 0f3196f305
commit 2fbb98bccd
20 changed files with 1526 additions and 159 deletions
--- a/pkg/tools/shell.go
+++ b/pkg/tools/shell.go
@@ -1,8 +1,8 @@
 package tools

 import (
-	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -16,12 +16,12 @@ import (
 )

+// ExecTool runs shell commands, either directly through `sh -c` or inside a
+// docker sandbox image, and carries the settings those runs share.
 type ExecTool struct {
-	workingDir     string
-	timeout        time.Duration
-	sandboxEnabled    bool
-	sandboxImage      string
+	workingDir         string
+	timeout            time.Duration
+	sandboxEnabled     bool
+	sandboxImage       string
 	autoInstallMissing bool
-	procManager       *ProcessManager
+	procManager        *ProcessManager
 }

 func NewExecTool(cfg config.ShellConfig, workspace string, pm *ProcessManager) *ExecTool {
@@ -81,6 +81,11 @@ func (t *ExecTool) Execute(ctx context.Context, args map[string]interface{}) (st
 			cwd = wd
 		}
 	}
+	queueBase := strings.TrimSpace(t.workingDir)
+	if queueBase == "" {
+		queueBase = cwd
+	}
+	globalCommandWatchdog.setQueuePath(resolveCommandQueuePath(queueBase))

 	if bg, _ := args["background"].(bool); bg {
 		if t.procManager == nil {
@@ -112,26 +117,38 @@ func (t *ExecTool) executeInSandbox(ctx context.Context, command, cwd string) (s
 		t.sandboxImage,
 		"sh", "-c", command,
 	}
-
-	cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
-	defer cancel()
-
-	cmd := exec.CommandContext(cmdCtx, "docker", dockerArgs...)
-	var stdout, stderr bytes.Buffer
-	cmd.Stdout = &stdout
-	cmd.Stderr = &stderr
-
-	err := cmd.Run()
-	output := stdout.String()
-	if stderr.Len() > 0 {
-		output += "\nSTDERR:\n" + stderr.String()
+	policy := buildCommandRuntimePolicy(command, t.commandTickBase(command))
+	var merged strings.Builder
+	for attempt := 0; attempt <= policy.MaxRestarts; attempt++ {
+		cmd := exec.CommandContext(ctx, "docker", dockerArgs...)
+		var stdout, stderr trackedOutput
+		cmd.Stdout = &stdout
+		cmd.Stderr = &stderr
+		err := runCommandWithDynamicTick(ctx, cmd, "exec:sandbox", command, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
+			return stdout.Len() + stderr.Len()
+		})
+		out := stdout.String()
+		if stderr.Len() > 0 {
+			out += "\nSTDERR:\n" + stderr.String()
+		}
+		if strings.TrimSpace(out) != "" {
+			if merged.Len() > 0 {
+				merged.WriteString("\n")
+			}
+			merged.WriteString(out)
+		}
+		if err == nil {
+			return merged.String(), nil
+		}
+		if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < policy.MaxRestarts {
+			merged.WriteString(fmt.Sprintf("\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
+				policy.StallRoundLimit, attempt+1, policy.MaxRestarts))
+			continue
+		}
+		merged.WriteString(fmt.Sprintf("\nSandbox Exit code: %v", err))
+		return merged.String(), nil
 	}
-
-	if err != nil {
-		output += fmt.Sprintf("\nSandbox Exit code: %v", err)
-	}
-
-	return output, nil
+	return merged.String(), nil
 }

 func (t *ExecTool) SetTimeout(timeout time.Duration) {
@@ -139,19 +156,13 @@ func (t *ExecTool) SetTimeout(timeout time.Duration) {
 }

 func (t *ExecTool) executeCommand(ctx context.Context, command, cwd string) (string, error) {
-	output, err, timedOut := t.runShellCommand(ctx, command, cwd)
-	if timedOut {
-		return fmt.Sprintf("Error: Command timed out after %v", t.timeout), nil
-	}
+	output, err := t.runShellCommand(ctx, command, cwd)

 	if err != nil && t.autoInstallMissing {
 		if missingCmd := detectMissingCommandFromOutput(output); missingCmd != "" {
 			if installLog, installed := t.tryAutoInstallMissingCommand(ctx, missingCmd, cwd); installed {
 				output += "\n[AUTO-INSTALL]\n" + installLog
-				retryOutput, retryErr, retryTimedOut := t.runShellCommand(ctx, command, cwd)
-				if retryTimedOut {
-					return fmt.Sprintf("Error: Command timed out after %v", t.timeout), nil
-				}
+				retryOutput, retryErr := t.runShellCommand(ctx, command, cwd)
 				output += "\n[RETRY]\n" + retryOutput
 				err = retryErr
 			}
@@ -173,32 +184,44 @@ func (t *ExecTool) executeCommand(ctx context.Context, command, cwd string) (str
 	return output, nil
 }

-func (t *ExecTool) runShellCommand(ctx context.Context, command, cwd string) (string, error, bool) {
-	cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
-	defer cancel()
-
-	cmd := exec.CommandContext(cmdCtx, "sh", "-c", command)
-	cmd.Env = buildExecEnv()
-	if cwd != "" {
-		cmd.Dir = cwd
-	}
-
-	var stdout, stderr bytes.Buffer
-	cmd.Stdout = &stdout
-	cmd.Stderr = &stderr
-
-	err := cmd.Run()
-	output := stdout.String()
-	if stderr.Len() > 0 {
-		output += "\nSTDERR:\n" + stderr.String()
-	}
-
-	if err != nil {
-		if cmdCtx.Err() == context.DeadlineExceeded {
-			return output, err, true
+// runShellCommand runs command through `sh -c` (in cwd when it is non-empty)
+// under the tick-based watchdog and returns the accumulated output.
+//
+// Output from every attempt is merged into one string; stderr is appended
+// under a "STDERR:" header. When an attempt fails with ErrCommandNoProgress
+// and ctx is still live, the command is started again, up to
+// policy.MaxRestarts extra attempts, with a "[RESTART]" marker written between
+// attempts. Any other failure (or an exhausted restart budget) returns the
+// merged output together with that error.
+func (t *ExecTool) runShellCommand(ctx context.Context, command, cwd string) (string, error) {
+	policy := buildCommandRuntimePolicy(command, t.commandTickBase(command))
+	// Clamp the restart budget so the command always runs at least once; a
+	// negative budget would otherwise skip the loop and report success.
+	maxRestarts := policy.MaxRestarts
+	if maxRestarts < 0 {
+		maxRestarts = 0
+	}
+	var merged strings.Builder
+	for attempt := 0; attempt <= maxRestarts; attempt++ {
+		cmd := exec.CommandContext(ctx, "sh", "-c", command)
+		cmd.Env = buildExecEnv()
+		if cwd != "" {
+			cmd.Dir = cwd
 		}
+
+		var stdout, stderr trackedOutput
+		cmd.Stdout = &stdout
+		cmd.Stderr = &stderr
+
+		// The callback reports the total bytes captured so far on both streams.
+		err := runCommandWithDynamicTick(ctx, cmd, "exec", command, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
+			return stdout.Len() + stderr.Len()
+		})
+		out := stdout.String()
+		if stderr.Len() > 0 {
+			out += "\nSTDERR:\n" + stderr.String()
+		}
+		// Skip whitespace-only output so restarts do not pile up blank lines.
+		if strings.TrimSpace(out) != "" {
+			if merged.Len() > 0 {
+				merged.WriteString("\n")
+			}
+			merged.WriteString(out)
+		}
+		if err == nil {
+			return merged.String(), nil
+		}
+		// Only a stalled command is retried, and never after ctx was cancelled.
+		if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < maxRestarts {
+			fmt.Fprintf(&merged, "\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
+				policy.StallRoundLimit, attempt+1, maxRestarts)
+			continue
+		}
+		return merged.String(), err
+	}
-	return output, err, false
+	return merged.String(), nil
 }

 func buildExecEnv() []string {
@@ -212,6 +235,70 @@ func buildExecEnv() []string {
 	return append(env, "PATH="+current+":"+fallback)
 }

+// commandTickBase returns the base watchdog tick interval for command.
+// It starts at 2s (4s when isHeavyCommand matches), is raised to timeout/30
+// when that is larger, and is capped at 12s.
+func (t *ExecTool) commandTickBase(command string) time.Duration {
+	const maxTick = 12 * time.Second
+
+	tick := 2 * time.Second
+	if isHeavyCommand(command) {
+		tick = 4 * time.Second
+	}
+	// The configured timeout only paces the ticks here; it is not a kill deadline.
+	if hint := t.timeout / 30; t.timeout > 0 && hint > tick {
+		tick = hint
+	}
+	if tick > maxTick {
+		return maxTick
+	}
+	return tick
+}
+
+// resolveCommandQueuePath returns the absolute path of memory/task_queue.json
+// under cwd. A blank cwd falls back to the process working directory; the
+// result is "" when no directory can be determined or made absolute.
+func resolveCommandQueuePath(cwd string) string {
+	dir := strings.TrimSpace(cwd)
+	if dir == "" {
+		wd, err := os.Getwd()
+		if err != nil || wd == "" {
+			return ""
+		}
+		dir = wd
+	}
+	absDir, err := filepath.Abs(dir)
+	if err != nil {
+		return ""
+	}
+	return filepath.Join(absDir, "memory", "task_queue.json")
+}
+
+// isHeavyCommand reports whether command contains, case-insensitively, one of
+// a fixed list of build/install invocations (docker, go, npm, pnpm, yarn,
+// cargo, mvn, gradle). Matching is by substring, so the invocation may appear
+// anywhere in the command line.
+func isHeavyCommand(command string) bool {
+	needle := strings.ToLower(strings.TrimSpace(command))
+	if len(needle) == 0 {
+		return false
+	}
+	heavy := [...]string{
+		"docker build",
+		"docker compose build",
+		"go build",
+		"go test",
+		"npm install",
+		"npm ci",
+		"npm run build",
+		"pnpm install",
+		"pnpm build",
+		"yarn install",
+		"yarn build",
+		"cargo build",
+		"mvn package",
+		"gradle build",
+	}
+	for i := range heavy {
+		if strings.Contains(needle, heavy[i]) {
+			return true
+		}
+	}
+	return false
+}
+
 func detectMissingCommandFromOutput(output string) string {
 	patterns := []*regexp.Regexp{
 		regexp.MustCompile(`(?m)(?:^|[:\s])([a-zA-Z0-9._+-]+): not found`),
@@ -278,19 +365,8 @@ func (t *ExecTool) tryAutoInstallMissingCommand(ctx context.Context, commandName
 		return fmt.Sprintf("No supported package manager found to install missing command: %s", name), false
 	}

-	timeout := 5 * time.Minute
-	if t.timeout > 0 && t.timeout < timeout {
-		timeout = t.timeout
-	}
-
 	for _, installCmd := range candidates {
-		installCtx, cancel := context.WithTimeout(ctx, timeout)
-		output, err, timedOut := t.runShellCommand(installCtx, installCmd, cwd)
-		cancel()
-
-		if timedOut {
-			continue
-		}
+		output, err := t.runShellCommand(ctx, installCmd, cwd)
 		if err == nil && commandExists(name) {
 			return fmt.Sprintf("Installed %s using: %s\n%s", name, installCmd, output), true
 		}