feat: implement channel healthcheck and sentinel auto-healing

This commit is contained in:
root
2026-02-14 16:20:14 +00:00
parent f56005246d
commit 1e4bf34fac
5 changed files with 73 additions and 4 deletions

BIN
clawgo_new Executable file

Binary file not shown.

View File

@@ -17,6 +17,7 @@ type Channel interface {
Send(ctx context.Context, msg bus.OutboundMessage) error
IsRunning() bool
IsAllowed(senderID string) bool
HealthCheck(ctx context.Context) error
}
type BaseChannel struct {

View File

@@ -217,6 +217,31 @@ func (m *Manager) StopAll(ctx context.Context) error {
return nil
}
func (m *Manager) CheckHealth(ctx context.Context) map[string]error {
m.mu.RLock()
defer m.mu.RUnlock()
results := make(map[string]error)
for name, channel := range m.channels {
results[name] = channel.HealthCheck(ctx)
}
return results
}
func (m *Manager) RestartChannel(ctx context.Context, name string) error {
m.mu.Lock()
defer m.mu.Unlock()
channel, ok := m.channels[name]
if !ok {
return fmt.Errorf("channel %s not found", name)
}
logger.InfoCF("channels", "Restarting channel", map[string]interface{}{"channel": name})
_ = channel.Stop(ctx)
return channel.Start(ctx)
}
func (m *Manager) dispatchOutbound(ctx context.Context) {
logger.InfoC("channels", "Outbound dispatcher started")
@@ -271,10 +296,7 @@ func (m *Manager) GetStatus() map[string]interface{} {
status := make(map[string]interface{})
for name, channel := range m.channels {
status[name] = map[string]interface{}{
"enabled": true,
"running": channel.IsRunning(),
}
status[name] = map[string]interface{}{}
}
return status
}

View File

@@ -74,6 +74,16 @@ func (c *TelegramChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
c.transcriber = transcriber
}
func (c *TelegramChannel) HealthCheck(ctx context.Context) error {
if !c.IsRunning() {
return fmt.Errorf("telegram bot not running")
}
hCtx, cancel := withTelegramAPITimeout(ctx)
defer cancel()
_, err := c.bot.GetMe(hCtx)
return err
}
func (c *TelegramChannel) Start(ctx context.Context) error {
if c.IsRunning() {
return nil

View File

@@ -1,12 +1,14 @@
package sentinel
import (
"context"
"fmt"
"os"
"path/filepath"
"sync"
"time"
"clawgo/pkg/channels"
"clawgo/pkg/config"
"clawgo/pkg/lifecycle"
"clawgo/pkg/logger"
@@ -23,6 +25,7 @@ type Service struct {
runner *lifecycle.LoopRunner
mu sync.RWMutex
lastAlerts map[string]time.Time
mgr *channels.Manager
}
func NewService(cfgPath, workspace string, intervalSec int, autoHeal bool, onAlert AlertFunc) *Service {
@@ -40,6 +43,10 @@ func NewService(cfgPath, workspace string, intervalSec int, autoHeal bool, onAle
}
}
func (s *Service) SetManager(mgr *channels.Manager) {
s.mgr = mgr
}
func (s *Service) Start() {
if !s.runner.Start(s.loop) {
return
@@ -76,6 +83,7 @@ func (s *Service) runChecks() {
issues := s.checkConfig()
issues = append(issues, s.checkMemory()...)
issues = append(issues, s.checkLogs()...)
issues = append(issues, s.checkChannels()...)
if len(issues) == 0 {
return
@@ -86,6 +94,34 @@ func (s *Service) runChecks() {
}
}
func (s *Service) checkChannels() []string {
if s.mgr == nil {
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
health := s.mgr.CheckHealth(ctx)
var issues []string
for name, err := range health {
if err != nil {
msg := fmt.Sprintf("sentinel: channel %s health check failed: %v", name, err)
issues = append(issues, msg)
if s.autoHeal {
go func(n string) {
logger.InfoCF("sentinel", "Attempting auto-heal for channel", map[string]interface{}{"channel": n})
if rErr := s.mgr.RestartChannel(context.Background(), n); rErr != nil {
logger.ErrorCF("sentinel", "Auto-heal restart failed", map[string]interface{}{"channel": n, "error": rErr.Error()})
} else {
logger.InfoCF("sentinel", "Auto-heal successful", map[string]interface{}{"channel": n})
}
}(name)
}
}
}
return issues
}
func (s *Service) checkConfig() []string {
_, err := os.Stat(s.cfgPath)
if err != nil {