feat: harden concurrency scheduling and task watchdog

This commit is contained in:
lpf
2026-03-05 11:32:06 +08:00
parent 0f3196f305
commit 2fbb98bccd
20 changed files with 1526 additions and 159 deletions

View File

@@ -2,6 +2,7 @@ package tools
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
@@ -45,8 +46,8 @@ func (t *SkillExecTool) Parameters() map[string]interface{} {
},
"timeout_sec": map[string]interface{}{
"type": "integer",
"default": 60,
"description": "Execution timeout in seconds",
"default": 0,
"description": "Deprecated. No hard timeout is enforced.",
},
"reason": map[string]interface{}{
"type": "string",
@@ -70,10 +71,8 @@ func (t *SkillExecTool) Execute(ctx context.Context, args map[string]interface{}
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
}
timeoutSec := 60
if raw, ok := args["timeout_sec"].(float64); ok && raw > 0 {
timeoutSec = int(raw)
if strings.TrimSpace(t.workspace) != "" {
globalCommandWatchdog.setQueuePath(filepath.Join(strings.TrimSpace(t.workspace), "memory", "task_queue.json"))
}
skillDir, err := t.resolveSkillDir(skill)
@@ -115,22 +114,55 @@ func (t *SkillExecTool) Execute(ctx context.Context, args map[string]interface{}
}
}
runCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSec)*time.Second)
defer cancel()
cmd, err := buildSkillCommand(runCtx, scriptPath, cmdArgs)
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
commandLabel := relScript
if len(cmdArgs) > 0 {
commandLabel += " " + strings.Join(cmdArgs, " ")
}
cmd.Dir = skillDir
output, err := cmd.CombinedOutput()
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", fmt.Errorf("skill execution failed: %w\n%s", err, string(output))
policy := buildCommandRuntimePolicy(commandLabel, 2*time.Second)
var merged strings.Builder
var runErr error
for attempt := 0; attempt <= policy.MaxRestarts; attempt++ {
cmd, err := buildSkillCommand(ctx, scriptPath, cmdArgs)
if err != nil {
t.writeAudit(skill, script, reason, false, err.Error())
return "", err
}
cmd.Dir = skillDir
var stdout, stderr trackedOutput
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err = runCommandWithDynamicTick(ctx, cmd, "skill_exec", commandLabel, policy.Difficulty, policy.BaseTick, policy.StallRoundLimit, func() int {
return stdout.Len() + stderr.Len()
})
out := stdout.String()
if stderr.Len() > 0 {
out += "\nSTDERR:\n" + stderr.String()
}
if strings.TrimSpace(out) != "" {
if merged.Len() > 0 {
merged.WriteString("\n")
}
merged.WriteString(out)
}
if err == nil {
runErr = nil
break
}
runErr = err
if errors.Is(err, ErrCommandNoProgress) && ctx.Err() == nil && attempt < policy.MaxRestarts {
merged.WriteString(fmt.Sprintf("\n[RESTART] no progress for %d ticks, restarting (%d/%d)\n",
policy.StallRoundLimit, attempt+1, policy.MaxRestarts))
continue
}
break
}
output := merged.String()
if runErr != nil {
t.writeAudit(skill, script, reason, false, runErr.Error())
return "", fmt.Errorf("skill execution failed: %w\n%s", runErr, output)
}
out := strings.TrimSpace(string(output))
out := strings.TrimSpace(output)
if out == "" {
out = "(no output)"
}