mirror of
https://github.com/YspCoder/clawgo.git
synced 2026-04-12 23:27:30 +08:00
feat: implement channel healthcheck and sentinel auto-healing
This commit is contained in:
BIN
clawgo_new
Executable file
BIN
clawgo_new
Executable file
Binary file not shown.
@@ -17,6 +17,7 @@ type Channel interface {
|
||||
Send(ctx context.Context, msg bus.OutboundMessage) error
|
||||
IsRunning() bool
|
||||
IsAllowed(senderID string) bool
|
||||
HealthCheck(ctx context.Context) error
|
||||
}
|
||||
|
||||
type BaseChannel struct {
|
||||
|
||||
@@ -217,6 +217,31 @@ func (m *Manager) StopAll(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) CheckHealth(ctx context.Context) map[string]error {
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
|
||||
results := make(map[string]error)
|
||||
for name, channel := range m.channels {
|
||||
results[name] = channel.HealthCheck(ctx)
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
func (m *Manager) RestartChannel(ctx context.Context, name string) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
channel, ok := m.channels[name]
|
||||
if !ok {
|
||||
return fmt.Errorf("channel %s not found", name)
|
||||
}
|
||||
|
||||
logger.InfoCF("channels", "Restarting channel", map[string]interface{}{"channel": name})
|
||||
_ = channel.Stop(ctx)
|
||||
return channel.Start(ctx)
|
||||
}
|
||||
|
||||
func (m *Manager) dispatchOutbound(ctx context.Context) {
|
||||
logger.InfoC("channels", "Outbound dispatcher started")
|
||||
|
||||
@@ -271,10 +296,7 @@ func (m *Manager) GetStatus() map[string]interface{} {
|
||||
|
||||
status := make(map[string]interface{})
|
||||
for name, channel := range m.channels {
|
||||
status[name] = map[string]interface{}{
|
||||
"enabled": true,
|
||||
"running": channel.IsRunning(),
|
||||
}
|
||||
status[name] = map[string]interface{}{}
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
@@ -74,6 +74,16 @@ func (c *TelegramChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
|
||||
c.transcriber = transcriber
|
||||
}
|
||||
|
||||
func (c *TelegramChannel) HealthCheck(ctx context.Context) error {
|
||||
if !c.IsRunning() {
|
||||
return fmt.Errorf("telegram bot not running")
|
||||
}
|
||||
hCtx, cancel := withTelegramAPITimeout(ctx)
|
||||
defer cancel()
|
||||
_, err := c.bot.GetMe(hCtx)
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *TelegramChannel) Start(ctx context.Context) error {
|
||||
if c.IsRunning() {
|
||||
return nil
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
package sentinel
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"clawgo/pkg/channels"
|
||||
"clawgo/pkg/config"
|
||||
"clawgo/pkg/lifecycle"
|
||||
"clawgo/pkg/logger"
|
||||
@@ -23,6 +25,7 @@ type Service struct {
|
||||
runner *lifecycle.LoopRunner
|
||||
mu sync.RWMutex
|
||||
lastAlerts map[string]time.Time
|
||||
mgr *channels.Manager
|
||||
}
|
||||
|
||||
func NewService(cfgPath, workspace string, intervalSec int, autoHeal bool, onAlert AlertFunc) *Service {
|
||||
@@ -40,6 +43,10 @@ func NewService(cfgPath, workspace string, intervalSec int, autoHeal bool, onAle
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) SetManager(mgr *channels.Manager) {
|
||||
s.mgr = mgr
|
||||
}
|
||||
|
||||
func (s *Service) Start() {
|
||||
if !s.runner.Start(s.loop) {
|
||||
return
|
||||
@@ -76,6 +83,7 @@ func (s *Service) runChecks() {
|
||||
issues := s.checkConfig()
|
||||
issues = append(issues, s.checkMemory()...)
|
||||
issues = append(issues, s.checkLogs()...)
|
||||
issues = append(issues, s.checkChannels()...)
|
||||
|
||||
if len(issues) == 0 {
|
||||
return
|
||||
@@ -86,6 +94,34 @@ func (s *Service) runChecks() {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) checkChannels() []string {
|
||||
if s.mgr == nil {
|
||||
return nil
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
health := s.mgr.CheckHealth(ctx)
|
||||
var issues []string
|
||||
for name, err := range health {
|
||||
if err != nil {
|
||||
msg := fmt.Sprintf("sentinel: channel %s health check failed: %v", name, err)
|
||||
issues = append(issues, msg)
|
||||
if s.autoHeal {
|
||||
go func(n string) {
|
||||
logger.InfoCF("sentinel", "Attempting auto-heal for channel", map[string]interface{}{"channel": n})
|
||||
if rErr := s.mgr.RestartChannel(context.Background(), n); rErr != nil {
|
||||
logger.ErrorCF("sentinel", "Auto-heal restart failed", map[string]interface{}{"channel": n, "error": rErr.Error()})
|
||||
} else {
|
||||
logger.InfoCF("sentinel", "Auto-heal successful", map[string]interface{}{"channel": n})
|
||||
}
|
||||
}(name)
|
||||
}
|
||||
}
|
||||
}
|
||||
return issues
|
||||
}
|
||||
|
||||
func (s *Service) checkConfig() []string {
|
||||
_, err := os.Stat(s.cfgPath)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user