diff --git a/config.example.json b/config.example.json index 8620035..77da77a 100644 --- a/config.example.json +++ b/config.example.json @@ -88,6 +88,7 @@ "telegram": { "enabled": false, "token": "YOUR_TELEGRAM_BOT_TOKEN", + "streaming": false, "allow_from": ["YOUR_USER_ID"], "allow_chats": [], "enable_groups": true, @@ -132,6 +133,14 @@ "api_key": "YOUR_CLIPROXYAPI_KEY", "api_base": "http://localhost:8080/v1", "models": ["glm-4.7", "gpt-4o-mini"], + "responses": { + "web_search_enabled": false, + "web_search_context_size": "", + "file_search_vector_store_ids": [], + "file_search_max_num_results": 0, + "include": [], + "stream_include_usage": false + }, "supports_responses_compact": false, "auth": "bearer", "timeout_sec": 90 @@ -141,6 +150,14 @@ "api_key": "YOUR_BACKUP_PROXY_KEY", "api_base": "http://localhost:8081/v1", "models": ["gpt-4o-mini", "deepseek-chat"], + "responses": { + "web_search_enabled": false, + "web_search_context_size": "", + "file_search_vector_store_ids": [], + "file_search_max_num_results": 0, + "include": [], + "stream_include_usage": false + }, "supports_responses_compact": true, "auth": "bearer", "timeout_sec": 90 diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index a4027d3..6551e10 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -61,6 +61,8 @@ type AgentLoop struct { sessionRunLocks map[string]*sync.Mutex providerNames []string providerPool map[string]providers.LLMProvider + providerResponses map[string]config.ProviderResponsesConfig + telegramStreaming bool ekg *ekg.Engine providerMu sync.RWMutex sessionProvider map[string]string @@ -242,6 +244,8 @@ func NewAgentLoop(cfg *config.Config, msgBus *bus.MessageBus, provider providers sessionRunLocks: map[string]*sync.Mutex{}, ekg: ekg.New(workspace), sessionProvider: map[string]string{}, + providerResponses: map[string]config.ProviderResponsesConfig{}, + telegramStreaming: cfg.Channels.Telegram.Streaming, } // Initialize provider fallback chain (primary + proxy_fallbacks). @@ -253,6 +257,11 @@ func NewAgentLoop(cfg *config.Config, msgBus *bus.MessageBus, provider providers } loop.providerPool[primaryName] = provider loop.providerNames = append(loop.providerNames, primaryName) + if strings.TrimSpace(primaryName) == "proxy" { + loop.providerResponses[primaryName] = cfg.Providers.Proxy.Responses + } else if pc, ok := cfg.Providers.Proxies[primaryName]; ok { + loop.providerResponses[primaryName] = pc.Responses + } for _, name := range cfg.Agents.Defaults.ProxyFallbacks { if name == "" { continue @@ -270,6 +279,9 @@ func NewAgentLoop(cfg *config.Config, msgBus *bus.MessageBus, provider providers if p2, err := providers.CreateProviderByName(cfg, name); err == nil { loop.providerPool[name] = p2 loop.providerNames = append(loop.providerNames, name) + if pc, ok := cfg.Providers.Proxies[name]; ok { + loop.providerResponses[name] = pc.Responses + } } } @@ -800,10 +812,10 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) }) messages = injectResponsesMediaParts(messages, msg.Media, msg.MediaItems) - options := buildResponsesOptions(8192, 0.7) + options := al.buildResponsesOptions(msg.SessionKey, 8192, 0.7) var response *providers.LLMResponse var err error - if msg.Channel == "telegram" && strings.TrimSpace(os.Getenv("CLAWGO_TELEGRAM_STREAMING")) == "1" { + if msg.Channel == "telegram" && al.telegramStreaming { if sp, ok := al.provider.(providers.StreamingLLMProvider); ok { streamText := "" lastPush := time.Now().Add(-time.Second) @@ -1158,7 +1170,7 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe "tools_json": formatToolsForLog(providerToolDefs), }) - options := buildResponsesOptions(8192, 0.7) + options := al.buildResponsesOptions(sessionKey, 8192, 0.7) response, err := al.provider.Chat(ctx, messages, providerToolDefs, al.model, options) if err != nil { @@ -1292,46 +1304,59 @@ func (al *AgentLoop) buildProviderToolDefs(toolDefs []map[string]interface{}) [] return providerToolDefs } -func buildResponsesOptions(maxTokens int64, temperature float64) map[string]interface{} { +func (al *AgentLoop) buildResponsesOptions(sessionKey string, maxTokens int64, temperature float64) map[string]interface{} { options := map[string]interface{}{ "max_tokens": maxTokens, "temperature": temperature, } + responsesCfg := al.responsesConfigForSession(sessionKey) responseTools := make([]map[string]interface{}, 0, 2) - if strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH")) == "1" { + if responsesCfg.WebSearchEnabled { webTool := map[string]interface{}{"type": "web_search"} - if contextSize := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH_CONTEXT_SIZE")); contextSize != "" { + if contextSize := strings.TrimSpace(responsesCfg.WebSearchContextSize); contextSize != "" { webTool["search_context_size"] = contextSize } responseTools = append(responseTools, webTool) } - if idsRaw := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_FILE_SEARCH_VECTOR_STORE_IDS")); idsRaw != "" { - ids := splitCommaList(idsRaw) - if len(ids) > 0 { - fileSearch := map[string]interface{}{ - "type": "file_search", - "vector_store_ids": ids, - } - if maxNumRaw := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_FILE_SEARCH_MAX_NUM_RESULTS")); maxNumRaw != "" { - if n, err := strconv.Atoi(maxNumRaw); err == nil && n > 0 { - fileSearch["max_num_results"] = n - } - } - responseTools = append(responseTools, fileSearch) + if len(responsesCfg.FileSearchVectorStoreIDs) > 0 { + fileSearch := map[string]interface{}{ + "type": "file_search", + "vector_store_ids": responsesCfg.FileSearchVectorStoreIDs, } + if responsesCfg.FileSearchMaxNumResults > 0 { + fileSearch["max_num_results"] = responsesCfg.FileSearchMaxNumResults + } + responseTools = append(responseTools, fileSearch) } if len(responseTools) > 0 { options["responses_tools"] = responseTools } - if include := splitCommaList(strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_INCLUDE"))); len(include) > 0 { - options["responses_include"] = include + if len(responsesCfg.Include) > 0 { + options["responses_include"] = responsesCfg.Include } - if strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_STREAM_INCLUDE_USAGE")) == "1" { + if responsesCfg.StreamIncludeUsage { options["responses_stream_options"] = map[string]interface{}{"include_usage": true} } return options } +func (al *AgentLoop) responsesConfigForSession(sessionKey string) config.ProviderResponsesConfig { + if al == nil { + return config.ProviderResponsesConfig{} + } + name := strings.TrimSpace(al.getSessionProvider(sessionKey)) + if name == "" && len(al.providerNames) > 0 { + name = al.providerNames[0] + } + if name == "" { + return config.ProviderResponsesConfig{} + } + if cfg, ok := al.providerResponses[name]; ok { + return cfg + } + return config.ProviderResponsesConfig{} +} + func injectResponsesMediaParts(messages []providers.Message, media []string, mediaItems []bus.MediaItem) []providers.Message { if len(messages) == 0 || (len(media) == 0 && len(mediaItems) == 0) { return messages @@ -1349,15 +1374,28 @@ func injectResponsesMediaParts(messages []providers.Message, media []string, med }) } - for _, ref := range media { - ref = strings.TrimSpace(ref) - if ref == "" { - continue + // Fallback-only handling for raw media refs. Prefer structured media_items when present. + if len(mediaItems) == 0 { + for _, ref := range media { + ref = strings.TrimSpace(ref) + if ref == "" { + continue + } + if isResponsesFileID(ref) { + parts = append(parts, providers.MessageContentPart{ + Type: "input_image", + FileID: ref, + }) + continue + } + if !isRemoteReference(ref) { + continue + } + parts = append(parts, providers.MessageContentPart{ + Type: "input_image", + ImageURL: ref, + }) } - parts = append(parts, providers.MessageContentPart{ - Type: "input_image", - ImageURL: ref, - }) } for _, item := range mediaItems { @@ -1371,9 +1409,9 @@ func injectResponsesMediaParts(messages []providers.Message, media []string, med switch { case strings.Contains(typ, "image"): part := providers.MessageContentPart{Type: "input_image"} - if strings.HasPrefix(src, "file_") { + if isResponsesFileID(src) { part.FileID = src - } else { + } else if isRemoteReference(src) { part.ImageURL = src } if part.FileID != "" || part.ImageURL != "" { @@ -1381,9 +1419,9 @@ func injectResponsesMediaParts(messages []providers.Message, media []string, med } case strings.Contains(typ, "file"), strings.Contains(typ, "document"), strings.Contains(typ, "audio"), strings.Contains(typ, "video"): part := providers.MessageContentPart{Type: "input_file"} - if strings.HasPrefix(src, "file_") { + if isResponsesFileID(src) { part.FileID = src - } else { + } else if isRemoteReference(src) { part.FileURL = src } if part.FileID != "" || part.FileURL != "" { @@ -1399,18 +1437,15 @@ func injectResponsesMediaParts(messages []providers.Message, media []string, med return messages } -func splitCommaList(raw string) []string { - if strings.TrimSpace(raw) == "" { - return nil - } - parts := strings.Split(raw, ",") - out := make([]string, 0, len(parts)) - for _, p := range parts { - if s := strings.TrimSpace(p); s != "" { - out = append(out, s) - } - } - return out +func isResponsesFileID(ref string) bool { + return strings.HasPrefix(strings.TrimSpace(ref), "file_") +} + +func isRemoteReference(ref string) bool { + trimmed := strings.ToLower(strings.TrimSpace(ref)) + return strings.HasPrefix(trimmed, "http://") || + strings.HasPrefix(trimmed, "https://") || + strings.HasPrefix(trimmed, "data:") } // GetStartupInfo returns information about loaded tools and skills for logging. diff --git a/pkg/agent/loop_responses_options_test.go b/pkg/agent/loop_responses_options_test.go index e56749a..b206f80 100644 --- a/pkg/agent/loop_responses_options_test.go +++ b/pkg/agent/loop_responses_options_test.go @@ -1,10 +1,10 @@ package agent import ( - "os" "testing" "clawgo/pkg/bus" + "clawgo/pkg/config" "clawgo/pkg/providers" ) @@ -24,32 +24,36 @@ func TestInjectResponsesMediaParts(t *testing.T) { t.Fatalf("unexpected messages length: %#v", got) } parts := got[1].ContentParts - if len(parts) != 4 { - t.Fatalf("expected 4 content parts, got %#v", parts) + if len(parts) != 3 { + t.Fatalf("expected 3 content parts, got %#v", parts) } if parts[0].Type != "input_text" || parts[0].Text != "look" { t.Fatalf("expected first part to preserve input text, got %#v", parts[0]) } - if parts[1].Type != "input_image" || parts[1].ImageURL == "" { - t.Fatalf("expected media URL as input_image, got %#v", parts[1]) + if parts[1].Type != "input_image" || parts[1].FileID != "file_img_1" { + t.Fatalf("expected image media item mapped to file_id, got %#v", parts[1]) } - if parts[2].Type != "input_image" || parts[2].FileID != "file_img_1" { - t.Fatalf("expected image media item mapped to file_id, got %#v", parts[2]) - } - if parts[3].Type != "input_file" || parts[3].FileID != "file_doc_1" { - t.Fatalf("expected document media item mapped to input_file file_id, got %#v", parts[3]) + if parts[2].Type != "input_file" || parts[2].FileID != "file_doc_1" { + t.Fatalf("expected document media item mapped to input_file file_id, got %#v", parts[2]) } } -func TestBuildResponsesOptionsFromEnv(t *testing.T) { - t.Setenv("CLAWGO_RESPONSES_WEB_SEARCH", "1") - t.Setenv("CLAWGO_RESPONSES_WEB_SEARCH_CONTEXT_SIZE", "high") - t.Setenv("CLAWGO_RESPONSES_FILE_SEARCH_VECTOR_STORE_IDS", "vs_1,vs_2") - t.Setenv("CLAWGO_RESPONSES_FILE_SEARCH_MAX_NUM_RESULTS", "8") - t.Setenv("CLAWGO_RESPONSES_INCLUDE", "output_text,tool_calls") - t.Setenv("CLAWGO_RESPONSES_STREAM_INCLUDE_USAGE", "1") +func TestBuildResponsesOptionsFromConfig(t *testing.T) { + al := &AgentLoop{ + providerNames: []string{"proxy"}, + providerResponses: map[string]config.ProviderResponsesConfig{ + "proxy": { + WebSearchEnabled: true, + WebSearchContextSize: "high", + FileSearchVectorStoreIDs: []string{"vs_1", "vs_2"}, + FileSearchMaxNumResults: 8, + Include: []string{"output_text", "tool_calls"}, + StreamIncludeUsage: true, + }, + }, + } - opts := buildResponsesOptions(1024, 0.2) + opts := al.buildResponsesOptions("session-a", 1024, 0.2) if opts["max_tokens"] != int64(1024) { t.Fatalf("max_tokens mismatch: %#v", opts["max_tokens"]) } @@ -72,7 +76,21 @@ func TestBuildResponsesOptionsFromEnv(t *testing.T) { if _, ok := opts["responses_stream_options"]; !ok { t.Fatalf("expected responses_stream_options in options") } - - // keep linter happy for unused os import when build tags differ - _ = os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH") +} + +func TestInjectResponsesMediaParts_SkipsLocalPathsForResponsesContentParts(t *testing.T) { + msgs := []providers.Message{ + {Role: "user", Content: "check local media"}, + } + items := []bus.MediaItem{ + {Type: "image", Path: "/tmp/a.png"}, + {Type: "document", Path: "/tmp/a.pdf"}, + } + got := injectResponsesMediaParts(msgs, nil, items) + if len(got[0].ContentParts) != 1 { + t.Fatalf("expected only input_text for local files, got %#v", got[0].ContentParts) + } + if got[0].ContentParts[0].Type != "input_text" { + t.Fatalf("expected input_text only, got %#v", got[0].ContentParts[0]) + } } diff --git a/pkg/config/config.go b/pkg/config/config.go index d7c4fde..cd42e4e 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -149,6 +149,7 @@ type WhatsAppConfig struct { type TelegramConfig struct { Enabled bool `json:"enabled" env:"CLAWGO_CHANNELS_TELEGRAM_ENABLED"` Token string `json:"token" env:"CLAWGO_CHANNELS_TELEGRAM_TOKEN"` + Streaming bool `json:"streaming"` AllowFrom []string `json:"allow_from" env:"CLAWGO_CHANNELS_TELEGRAM_ALLOW_FROM"` AllowChats []string `json:"allow_chats" env:"CLAWGO_CHANNELS_TELEGRAM_ALLOW_CHATS"` EnableGroups bool `json:"enable_groups" env:"CLAWGO_CHANNELS_TELEGRAM_ENABLE_GROUPS"` @@ -243,12 +244,22 @@ func (p *ProvidersConfig) UnmarshalJSON(data []byte) error { } type ProviderConfig struct { - APIKey string `json:"api_key" env:"CLAWGO_PROVIDERS_{{.Name}}_API_KEY"` - APIBase string `json:"api_base" env:"CLAWGO_PROVIDERS_{{.Name}}_API_BASE"` - Models []string `json:"models" env:"CLAWGO_PROVIDERS_{{.Name}}_MODELS"` - SupportsResponsesCompact bool `json:"supports_responses_compact" env:"CLAWGO_PROVIDERS_{{.Name}}_SUPPORTS_RESPONSES_COMPACT"` - Auth string `json:"auth" env:"CLAWGO_PROVIDERS_{{.Name}}_AUTH"` - TimeoutSec int `json:"timeout_sec" env:"CLAWGO_PROVIDERS_PROXY_TIMEOUT_SEC"` + APIKey string `json:"api_key" env:"CLAWGO_PROVIDERS_{{.Name}}_API_KEY"` + APIBase string `json:"api_base" env:"CLAWGO_PROVIDERS_{{.Name}}_API_BASE"` + Models []string `json:"models" env:"CLAWGO_PROVIDERS_{{.Name}}_MODELS"` + SupportsResponsesCompact bool `json:"supports_responses_compact" env:"CLAWGO_PROVIDERS_{{.Name}}_SUPPORTS_RESPONSES_COMPACT"` + Auth string `json:"auth" env:"CLAWGO_PROVIDERS_{{.Name}}_AUTH"` + TimeoutSec int `json:"timeout_sec" env:"CLAWGO_PROVIDERS_PROXY_TIMEOUT_SEC"` + Responses ProviderResponsesConfig `json:"responses"` +} + +type ProviderResponsesConfig struct { + WebSearchEnabled bool `json:"web_search_enabled"` + WebSearchContextSize string `json:"web_search_context_size"` + FileSearchVectorStoreIDs []string `json:"file_search_vector_store_ids"` + FileSearchMaxNumResults int `json:"file_search_max_num_results"` + Include []string `json:"include"` + StreamIncludeUsage bool `json:"stream_include_usage"` } type GatewayConfig struct { @@ -444,6 +455,7 @@ func DefaultConfig() *Config { Telegram: TelegramConfig{ Enabled: false, Token: "", + Streaming: false, AllowFrom: []string{}, AllowChats: []string{}, EnableGroups: true, diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 8971b58..0825089 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -320,6 +320,18 @@ func validateProviderConfig(path string, p ProviderConfig) []error { if len(p.Models) == 0 { errs = append(errs, fmt.Errorf("%s.models must contain at least one model", path)) } + if p.Responses.WebSearchContextSize != "" { + switch p.Responses.WebSearchContextSize { + case "low", "medium", "high": + default: + errs = append(errs, fmt.Errorf("%s.responses.web_search_context_size must be one of: low, medium, high", path)) + } + } + if p.Responses.FileSearchMaxNumResults < 0 { + errs = append(errs, fmt.Errorf("%s.responses.file_search_max_num_results must be >= 0", path)) + } + errs = append(errs, validateNonEmptyStringList(path+".responses.file_search_vector_store_ids", p.Responses.FileSearchVectorStoreIDs)...) + errs = append(errs, validateNonEmptyStringList(path+".responses.include", p.Responses.Include)...) return errs } diff --git a/webui/src/i18n/index.ts b/webui/src/i18n/index.ts index 52bd753..cdc59d3 100644 --- a/webui/src/i18n/index.ts +++ b/webui/src/i18n/index.ts @@ -279,6 +279,14 @@ const resources = { api_base: 'API Base', protocol: 'Protocol', models: 'Models', + responses: 'Responses', + streaming: 'Streaming', + web_search_enabled: 'Web Search Enabled', + web_search_context_size: 'Web Search Context Size', + file_search_vector_store_ids: 'File Search Vector Store IDs', + file_search_max_num_results: 'File Search Max Results', + include: 'Include', + stream_include_usage: 'Stream Include Usage', organization: 'Organization', project: 'Project', region: 'Region', @@ -693,6 +701,14 @@ const resources = { api_base: 'API 基础地址', protocol: '协议', models: '模型列表', + responses: 'Responses 配置', + streaming: '流式输出', + web_search_enabled: '启用网页搜索', + web_search_context_size: '网页搜索上下文大小', + file_search_vector_store_ids: '文件搜索向量库 ID 列表', + file_search_max_num_results: '文件搜索最大结果数', + include: '包含字段', + stream_include_usage: '流式包含用量', organization: '组织', project: '项目', region: '区域', diff --git a/webui/src/pages/Config.tsx b/webui/src/pages/Config.tsx index dc228ca..5809edd 100644 --- a/webui/src/pages/Config.tsx +++ b/webui/src/pages/Config.tsx @@ -118,6 +118,14 @@ const Config: React.FC = () => { api_key: '', api_base: '', models: [], + responses: { + web_search_enabled: false, + web_search_context_size: '', + file_search_vector_store_ids: [], + file_search_max_num_results: 0, + include: [], + stream_include_usage: false, + }, supports_responses_compact: false, auth: 'bearer', timeout_sec: 120,