mirror of
https://github.com/YspCoder/clawgo.git
synced 2026-04-14 23:17:30 +08:00
ekg p0: configurable threshold, auto-repair task on repeated errsig, and provider/model audit visibility
This commit is contained in:
@@ -62,6 +62,8 @@ type AgentLoop struct {
|
||||
providerNames []string
|
||||
providerPool map[string]providers.LLMProvider
|
||||
ekg *ekg.Engine
|
||||
providerMu sync.RWMutex
|
||||
sessionProvider map[string]string
|
||||
}
|
||||
|
||||
// StartupCompactionReport provides startup memory/session maintenance stats.
|
||||
@@ -239,6 +241,7 @@ func NewAgentLoop(cfg *config.Config, msgBus *bus.MessageBus, provider providers
|
||||
intentHints: map[string]string{},
|
||||
sessionRunLocks: map[string]*sync.Mutex{},
|
||||
ekg: ekg.New(workspace),
|
||||
sessionProvider: map[string]string{},
|
||||
}
|
||||
|
||||
// Initialize provider fallback chain (primary + proxy_fallbacks).
|
||||
@@ -343,9 +346,9 @@ func (al *AgentLoop) lockSessionRun(sessionKey string) func() {
|
||||
return func() { mu.Unlock() }
|
||||
}
|
||||
|
||||
func (al *AgentLoop) tryFallbackProviders(ctx context.Context, msg bus.InboundMessage, messages []providers.Message, toolDefs []providers.ToolDefinition, options map[string]interface{}, primaryErr error) (*providers.LLMResponse, error) {
|
||||
func (al *AgentLoop) tryFallbackProviders(ctx context.Context, msg bus.InboundMessage, messages []providers.Message, toolDefs []providers.ToolDefinition, options map[string]interface{}, primaryErr error) (*providers.LLMResponse, string, error) {
|
||||
if len(al.providerNames) <= 1 {
|
||||
return nil, primaryErr
|
||||
return nil, "", primaryErr
|
||||
}
|
||||
lastErr := primaryErr
|
||||
candidates := append([]string(nil), al.providerNames[1:]...)
|
||||
@@ -375,11 +378,30 @@ func (al *AgentLoop) tryFallbackProviders(ctx context.Context, msg bus.InboundMe
|
||||
}
|
||||
if err == nil {
|
||||
logger.WarnCF("agent", "LLM fallback provider switched", map[string]interface{}{"provider": name})
|
||||
return resp, nil
|
||||
return resp, name, nil
|
||||
}
|
||||
lastErr = err
|
||||
}
|
||||
return nil, lastErr
|
||||
return nil, "", lastErr
|
||||
}
|
||||
|
||||
func (al *AgentLoop) setSessionProvider(sessionKey, provider string) {
|
||||
key := strings.TrimSpace(sessionKey)
|
||||
if key == "" { return }
|
||||
provider = strings.TrimSpace(provider)
|
||||
if provider == "" { return }
|
||||
al.providerMu.Lock()
|
||||
al.sessionProvider[key] = provider
|
||||
al.providerMu.Unlock()
|
||||
}
|
||||
|
||||
func (al *AgentLoop) getSessionProvider(sessionKey string) string {
|
||||
key := strings.TrimSpace(sessionKey)
|
||||
if key == "" { return "" }
|
||||
al.providerMu.RLock()
|
||||
v := al.sessionProvider[key]
|
||||
al.providerMu.RUnlock()
|
||||
return v
|
||||
}
|
||||
|
||||
func (al *AgentLoop) processInbound(ctx context.Context, msg bus.InboundMessage) {
|
||||
@@ -455,6 +477,8 @@ func (al *AgentLoop) appendTaskAuditEvent(taskID string, msg bus.InboundMessage,
|
||||
"input_preview": truncate(strings.ReplaceAll(msg.Content, "\n", " "), 180),
|
||||
"media_count": len(msg.MediaItems),
|
||||
"media_items": msg.MediaItems,
|
||||
"provider": al.getSessionProvider(msg.SessionKey),
|
||||
"model": al.model,
|
||||
}
|
||||
if al.ekg != nil {
|
||||
al.ekg.Record(ekg.Event{
|
||||
@@ -591,6 +615,9 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
|
||||
}
|
||||
unlock := al.lockSessionRun(msg.SessionKey)
|
||||
defer unlock()
|
||||
if len(al.providerNames) > 0 {
|
||||
al.setSessionProvider(msg.SessionKey, al.providerNames[0])
|
||||
}
|
||||
// Add message preview to log
|
||||
preview := truncate(msg.Content, 80)
|
||||
logger.InfoCF("agent", fmt.Sprintf("Processing message from %s:%s: %s", msg.Channel, msg.SenderID, preview),
|
||||
@@ -771,9 +798,12 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if fb, ferr := al.tryFallbackProviders(ctx, msg, messages, providerToolDefs, options, err); ferr == nil && fb != nil {
|
||||
if fb, fbProvider, ferr := al.tryFallbackProviders(ctx, msg, messages, providerToolDefs, options, err); ferr == nil && fb != nil {
|
||||
response = fb
|
||||
err = nil
|
||||
if fbProvider != "" {
|
||||
al.setSessionProvider(msg.SessionKey, fbProvider)
|
||||
}
|
||||
} else {
|
||||
err = ferr
|
||||
}
|
||||
@@ -1114,9 +1144,12 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe
|
||||
response, err := al.provider.Chat(ctx, messages, providerToolDefs, al.model, options)
|
||||
|
||||
if err != nil {
|
||||
if fb, ferr := al.tryFallbackProviders(ctx, msg, messages, providerToolDefs, options, err); ferr == nil && fb != nil {
|
||||
if fb, fbProvider, ferr := al.tryFallbackProviders(ctx, msg, messages, providerToolDefs, options, err); ferr == nil && fb != nil {
|
||||
response = fb
|
||||
err = nil
|
||||
if fbProvider != "" {
|
||||
al.setSessionProvider(msg.SessionKey, fbProvider)
|
||||
}
|
||||
} else {
|
||||
err = ferr
|
||||
}
|
||||
|
||||
@@ -339,6 +339,7 @@ func (e *Engine) tick() {
|
||||
if advice.ShouldEscalate {
|
||||
st.BlockReason = "repeated_error_signature"
|
||||
st.RetryAfter = now.Add(5 * time.Minute)
|
||||
e.enqueueAutoRepairTaskLocked(st, errSig)
|
||||
e.sendFailureNotification(st, "repeated error signature detected; escalate")
|
||||
continue
|
||||
}
|
||||
@@ -725,6 +726,33 @@ func (e *Engine) enqueueInferredNextTasksLocked(st *taskState) {
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) enqueueAutoRepairTaskLocked(st *taskState, errSig string) {
|
||||
if st == nil {
|
||||
return
|
||||
}
|
||||
errSig = strings.TrimSpace(errSig)
|
||||
if errSig == "" {
|
||||
errSig = "unknown_error_signature"
|
||||
}
|
||||
content := fmt.Sprintf("[auto-repair] 排查任务 %s 的重复错误签名并给出修复步骤(errsig=%s)", shortTask(st.Content), shortTask(errSig))
|
||||
existing := map[string]bool{}
|
||||
for _, cur := range e.state {
|
||||
existing[strings.TrimSpace(cur.Content)] = true
|
||||
}
|
||||
items, _ := e.taskStore.Load()
|
||||
for _, it := range items {
|
||||
existing[strings.TrimSpace(it.Content)] = true
|
||||
}
|
||||
if existing[content] {
|
||||
return
|
||||
}
|
||||
id := hashID(content)
|
||||
e.state[id] = &taskState{ID: id, Content: content, Priority: "high", Status: "idle"}
|
||||
items = append(items, TaskItem{ID: id, Content: content, Priority: "high", Status: "todo", Source: "autonomy_repair", UpdatedAt: nowRFC3339()})
|
||||
_ = e.taskStore.Save(items)
|
||||
e.writeReflectLog("infer", st, "generated auto-repair task due to repeated error signature")
|
||||
}
|
||||
|
||||
func (e *Engine) sendFailureNotification(st *taskState, reason string) {
|
||||
e.writeReflectLog("blocked", st, reason)
|
||||
e.writeTriggerAudit("blocked", st, reason)
|
||||
|
||||
@@ -59,6 +59,7 @@ type AutonomyConfig struct {
|
||||
TaskHistoryRetentionDays int `json:"task_history_retention_days" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_TASK_HISTORY_RETENTION_DAYS"`
|
||||
WaitingResumeDebounceSec int `json:"waiting_resume_debounce_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_WAITING_RESUME_DEBOUNCE_SEC"`
|
||||
AllowedTaskKeywords []string `json:"allowed_task_keywords" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_ALLOWED_TASK_KEYWORDS"`
|
||||
EKGConsecutiveErrorThreshold int `json:"ekg_consecutive_error_threshold" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_EKG_CONSECUTIVE_ERROR_THRESHOLD"`
|
||||
// Deprecated: kept for backward compatibility with existing config files.
|
||||
NotifyChannel string `json:"notify_channel,omitempty"`
|
||||
// Deprecated: kept for backward compatibility with existing config files.
|
||||
@@ -332,6 +333,7 @@ func DefaultConfig() *Config {
|
||||
TaskHistoryRetentionDays: 3,
|
||||
WaitingResumeDebounceSec: 5,
|
||||
AllowedTaskKeywords: []string{},
|
||||
EKGConsecutiveErrorThreshold: 3,
|
||||
},
|
||||
Texts: AgentTextConfig{
|
||||
NoResponseFallback: "I've completed processing but have no response to give.",
|
||||
|
||||
@@ -120,6 +120,9 @@ func Validate(cfg *Config) []error {
|
||||
if aut.TaskHistoryRetentionDays <= 0 {
|
||||
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.task_history_retention_days must be > 0 when enabled=true"))
|
||||
}
|
||||
if aut.EKGConsecutiveErrorThreshold <= 0 {
|
||||
errs = append(errs, fmt.Errorf("agents.defaults.autonomy.ekg_consecutive_error_threshold must be > 0 when enabled=true"))
|
||||
}
|
||||
}
|
||||
texts := cfg.Agents.Defaults.Texts
|
||||
if strings.TrimSpace(texts.NoResponseFallback) == "" {
|
||||
|
||||
Reference in New Issue
Block a user