diff --git a/README.md b/README.md index 46c3c02..2d78596 100644 --- a/README.md +++ b/README.md @@ -238,12 +238,8 @@ clawgo uninstall [--purge] [--remove-bin] - 支持 `clawgo config set/get/check/reload` - 严格 JSON 解析(未知字段会报错) - 配置热更新失败自动回滚备份 +- Provider 接口已统一为 `responses` -Provider(`providers.proxy` / `providers.proxies.`)新增开关: - -- `cross_session_call_id`(bool,默认 `false`) - - `true`:沿用 `call_id` 传 `function_call_output` - - `false`:不传 `call_id`,改为传递 tool 结果内容(适合跨会话/聚合路由场景) --- diff --git a/README_EN.md b/README_EN.md index 1fe3e12..85f7752 100644 --- a/README_EN.md +++ b/README_EN.md @@ -238,12 +238,8 @@ clawgo uninstall [--purge] [--remove-bin] - Supports `clawgo config set/get/check/reload` - Strict JSON parsing (unknown fields fail fast) - Auto rollback on failed hot reload +- Provider interface is now `responses` only -New provider switch (`providers.proxy` / `providers.proxies.`): - -- `cross_session_call_id` (bool, default `false`) - - `true`: keep using `call_id` with `function_call_output` - - `false`: do not send `call_id`; pass tool results as plain content (recommended for cross-session/aggregator routing) --- diff --git a/cmd/clawgo/cmd_onboard.go b/cmd/clawgo/cmd_onboard.go index c34c8e3..881e215 100644 --- a/cmd/clawgo/cmd_onboard.go +++ b/cmd/clawgo/cmd_onboard.go @@ -36,7 +36,7 @@ func onboard() { fmt.Println("\nNext steps:") fmt.Println(" 1. Configure CLIProxyAPI at", configPath) fmt.Println(" Ensure CLIProxyAPI is running: https://github.com/router-for-me/CLIProxyAPI") - fmt.Println(" Set providers..protocol/models; use supports_responses_compact=true only with protocol=responses") + fmt.Println(" Set providers..models for Responses API") fmt.Println(" 2. Chat: clawgo agent -m \"Hello!\"") } diff --git a/config.example.json b/config.example.json index c4c6fc0..8620035 100644 --- a/config.example.json +++ b/config.example.json @@ -131,8 +131,6 @@ "proxy": { "api_key": "YOUR_CLIPROXYAPI_KEY", "api_base": "http://localhost:8080/v1", - "protocol": "chat_completions", - "cross_session_call_id": false, "models": ["glm-4.7", "gpt-4o-mini"], "supports_responses_compact": false, "auth": "bearer", @@ -142,8 +140,6 @@ "backup": { "api_key": "YOUR_BACKUP_PROXY_KEY", "api_base": "http://localhost:8081/v1", - "protocol": "responses", - "cross_session_call_id": false, "models": ["gpt-4o-mini", "deepseek-chat"], "supports_responses_compact": true, "auth": "bearer", diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 6bd6fe4..a4027d3 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -777,17 +777,7 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) }) toolDefs := al.tools.GetDefinitions() - providerToolDefs := make([]providers.ToolDefinition, 0, len(toolDefs)) - for _, td := range toolDefs { - providerToolDefs = append(providerToolDefs, providers.ToolDefinition{ - Type: td["type"].(string), - Function: providers.ToolFunctionDefinition{ - Name: td["function"].(map[string]interface{})["name"].(string), - Description: td["function"].(map[string]interface{})["description"].(string), - Parameters: td["function"].(map[string]interface{})["parameters"].(map[string]interface{}), - }, - }) - } + providerToolDefs := al.buildProviderToolDefs(toolDefs) // Log LLM request details logger.DebugCF("agent", logger.C0152, @@ -809,7 +799,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) "tools_json": formatToolsForLog(providerToolDefs), }) - options := map[string]interface{}{"max_tokens": 8192, "temperature": 0.7} + messages = injectResponsesMediaParts(messages, msg.Media, msg.MediaItems) + options := buildResponsesOptions(8192, 0.7) var response *providers.LLMResponse var err error if msg.Channel == "telegram" && strings.TrimSpace(os.Getenv("CLAWGO_TELEGRAM_STREAMING")) == "1" { @@ -850,11 +841,6 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) } } if err != nil { - errText := strings.ToLower(err.Error()) - if strings.Contains(errText, "no tool call found for function call output") { - removed := al.sessions.PurgeOrphanToolOutputs(msg.SessionKey) - logger.WarnCF("agent", logger.C0154, map[string]interface{}{"session_key": msg.SessionKey, "removed": removed}) - } logger.ErrorCF("agent", logger.C0155, map[string]interface{}{ "iteration": iteration, @@ -1150,17 +1136,7 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe iteration++ toolDefs := al.tools.GetDefinitions() - providerToolDefs := make([]providers.ToolDefinition, 0, len(toolDefs)) - for _, td := range toolDefs { - providerToolDefs = append(providerToolDefs, providers.ToolDefinition{ - Type: td["type"].(string), - Function: providers.ToolFunctionDefinition{ - Name: td["function"].(map[string]interface{})["name"].(string), - Description: td["function"].(map[string]interface{})["description"].(string), - Parameters: td["function"].(map[string]interface{})["parameters"].(map[string]interface{}), - }, - }) - } + providerToolDefs := al.buildProviderToolDefs(toolDefs) // Log LLM request details logger.DebugCF("agent", logger.C0152, @@ -1182,10 +1158,7 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe "tools_json": formatToolsForLog(providerToolDefs), }) - options := map[string]interface{}{ - "max_tokens": 8192, - "temperature": 0.7, - } + options := buildResponsesOptions(8192, 0.7) response, err := al.provider.Chat(ctx, messages, providerToolDefs, al.model, options) if err != nil { @@ -1200,40 +1173,6 @@ func (al *AgentLoop) processSystemMessage(ctx context.Context, msg bus.InboundMe } } if err != nil { - errText := strings.ToLower(err.Error()) - if strings.Contains(errText, "no tool call found for function call output") { - removed := al.sessions.PurgeOrphanToolOutputs(sessionKey) - logger.WarnCF("agent", logger.C0160, map[string]interface{}{"session_key": sessionKey, "removed": removed}) - if removed > 0 { - // Rebuild context from cleaned history and retry current iteration. - history = al.sessions.GetHistory(sessionKey) - summary = al.sessions.GetSummary(sessionKey) - messages = al.contextBuilder.BuildMessages( - history, - summary, - msg.Content, - nil, - originChannel, - originChatID, - responseLang, - ) - continue - } - if strings.ToLower(strings.TrimSpace(msg.Metadata["trigger"])) == "heartbeat" { - al.sessions.ResetSession(sessionKey) - messages = al.contextBuilder.BuildMessages( - []providers.Message{}, - "", - msg.Content, - nil, - originChannel, - originChatID, - responseLang, - ) - logger.WarnCF("agent", logger.C0161, map[string]interface{}{"session_key": sessionKey}) - continue - } - } logger.ErrorCF("agent", logger.C0162, map[string]interface{}{ "iteration": iteration, @@ -1325,6 +1264,155 @@ func truncate(s string, maxLen int) string { return s[:maxLen-3] + "..." } +func (al *AgentLoop) buildProviderToolDefs(toolDefs []map[string]interface{}) []providers.ToolDefinition { + providerToolDefs := make([]providers.ToolDefinition, 0, len(toolDefs)) + for _, td := range toolDefs { + fnRaw, ok := td["function"].(map[string]interface{}) + if !ok { + continue + } + name, _ := fnRaw["name"].(string) + description, _ := fnRaw["description"].(string) + params, _ := fnRaw["parameters"].(map[string]interface{}) + if strings.TrimSpace(name) == "" { + continue + } + if params == nil { + params = map[string]interface{}{} + } + providerToolDefs = append(providerToolDefs, providers.ToolDefinition{ + Type: "function", + Function: providers.ToolFunctionDefinition{ + Name: name, + Description: description, + Parameters: params, + }, + }) + } + return providerToolDefs +} + +func buildResponsesOptions(maxTokens int64, temperature float64) map[string]interface{} { + options := map[string]interface{}{ + "max_tokens": maxTokens, + "temperature": temperature, + } + responseTools := make([]map[string]interface{}, 0, 2) + if strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH")) == "1" { + webTool := map[string]interface{}{"type": "web_search"} + if contextSize := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH_CONTEXT_SIZE")); contextSize != "" { + webTool["search_context_size"] = contextSize + } + responseTools = append(responseTools, webTool) + } + if idsRaw := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_FILE_SEARCH_VECTOR_STORE_IDS")); idsRaw != "" { + ids := splitCommaList(idsRaw) + if len(ids) > 0 { + fileSearch := map[string]interface{}{ + "type": "file_search", + "vector_store_ids": ids, + } + if maxNumRaw := strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_FILE_SEARCH_MAX_NUM_RESULTS")); maxNumRaw != "" { + if n, err := strconv.Atoi(maxNumRaw); err == nil && n > 0 { + fileSearch["max_num_results"] = n + } + } + responseTools = append(responseTools, fileSearch) + } + } + if len(responseTools) > 0 { + options["responses_tools"] = responseTools + } + if include := splitCommaList(strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_INCLUDE"))); len(include) > 0 { + options["responses_include"] = include + } + if strings.TrimSpace(os.Getenv("CLAWGO_RESPONSES_STREAM_INCLUDE_USAGE")) == "1" { + options["responses_stream_options"] = map[string]interface{}{"include_usage": true} + } + return options +} + +func injectResponsesMediaParts(messages []providers.Message, media []string, mediaItems []bus.MediaItem) []providers.Message { + if len(messages) == 0 || (len(media) == 0 && len(mediaItems) == 0) { + return messages + } + last := len(messages) - 1 + if strings.ToLower(strings.TrimSpace(messages[last].Role)) != "user" { + return messages + } + + parts := make([]providers.MessageContentPart, 0, 1+len(media)+len(mediaItems)) + if strings.TrimSpace(messages[last].Content) != "" { + parts = append(parts, providers.MessageContentPart{ + Type: "input_text", + Text: messages[last].Content, + }) + } + + for _, ref := range media { + ref = strings.TrimSpace(ref) + if ref == "" { + continue + } + parts = append(parts, providers.MessageContentPart{ + Type: "input_image", + ImageURL: ref, + }) + } + + for _, item := range mediaItems { + typ := strings.ToLower(strings.TrimSpace(item.Type)) + ref := strings.TrimSpace(item.Ref) + path := strings.TrimSpace(item.Path) + src := ref + if src == "" { + src = path + } + switch { + case strings.Contains(typ, "image"): + part := providers.MessageContentPart{Type: "input_image"} + if strings.HasPrefix(src, "file_") { + part.FileID = src + } else { + part.ImageURL = src + } + if part.FileID != "" || part.ImageURL != "" { + parts = append(parts, part) + } + case strings.Contains(typ, "file"), strings.Contains(typ, "document"), strings.Contains(typ, "audio"), strings.Contains(typ, "video"): + part := providers.MessageContentPart{Type: "input_file"} + if strings.HasPrefix(src, "file_") { + part.FileID = src + } else { + part.FileURL = src + } + if part.FileID != "" || part.FileURL != "" { + parts = append(parts, part) + } + } + } + + if len(parts) == 0 { + return messages + } + messages[last].ContentParts = parts + return messages +} + +func splitCommaList(raw string) []string { + if strings.TrimSpace(raw) == "" { + return nil + } + parts := strings.Split(raw, ",") + out := make([]string, 0, len(parts)) + for _, p := range parts { + if s := strings.TrimSpace(p); s != "" { + out = append(out, s) + } + } + return out +} + // GetStartupInfo returns information about loaded tools and skills for logging. func (al *AgentLoop) compactSessionIfNeeded(sessionKey string) { if !al.compactionEnabled { diff --git a/pkg/agent/loop_responses_options_test.go b/pkg/agent/loop_responses_options_test.go new file mode 100644 index 0000000..e56749a --- /dev/null +++ b/pkg/agent/loop_responses_options_test.go @@ -0,0 +1,78 @@ +package agent + +import ( + "os" + "testing" + + "clawgo/pkg/bus" + "clawgo/pkg/providers" +) + +func TestInjectResponsesMediaParts(t *testing.T) { + msgs := []providers.Message{ + {Role: "system", Content: "sys"}, + {Role: "user", Content: "look"}, + } + media := []string{"https://example.com/a.png"} + items := []bus.MediaItem{ + {Type: "image", Ref: "file_img_1"}, + {Type: "document", Ref: "file_doc_1"}, + } + + got := injectResponsesMediaParts(msgs, media, items) + if len(got) != 2 { + t.Fatalf("unexpected messages length: %#v", got) + } + parts := got[1].ContentParts + if len(parts) != 4 { + t.Fatalf("expected 4 content parts, got %#v", parts) + } + if parts[0].Type != "input_text" || parts[0].Text != "look" { + t.Fatalf("expected first part to preserve input text, got %#v", parts[0]) + } + if parts[1].Type != "input_image" || parts[1].ImageURL == "" { + t.Fatalf("expected media URL as input_image, got %#v", parts[1]) + } + if parts[2].Type != "input_image" || parts[2].FileID != "file_img_1" { + t.Fatalf("expected image media item mapped to file_id, got %#v", parts[2]) + } + if parts[3].Type != "input_file" || parts[3].FileID != "file_doc_1" { + t.Fatalf("expected document media item mapped to input_file file_id, got %#v", parts[3]) + } +} + +func TestBuildResponsesOptionsFromEnv(t *testing.T) { + t.Setenv("CLAWGO_RESPONSES_WEB_SEARCH", "1") + t.Setenv("CLAWGO_RESPONSES_WEB_SEARCH_CONTEXT_SIZE", "high") + t.Setenv("CLAWGO_RESPONSES_FILE_SEARCH_VECTOR_STORE_IDS", "vs_1,vs_2") + t.Setenv("CLAWGO_RESPONSES_FILE_SEARCH_MAX_NUM_RESULTS", "8") + t.Setenv("CLAWGO_RESPONSES_INCLUDE", "output_text,tool_calls") + t.Setenv("CLAWGO_RESPONSES_STREAM_INCLUDE_USAGE", "1") + + opts := buildResponsesOptions(1024, 0.2) + if opts["max_tokens"] != int64(1024) { + t.Fatalf("max_tokens mismatch: %#v", opts["max_tokens"]) + } + if opts["temperature"] != 0.2 { + t.Fatalf("temperature mismatch: %#v", opts["temperature"]) + } + toolsRaw, ok := opts["responses_tools"].([]map[string]interface{}) + if !ok || len(toolsRaw) != 2 { + t.Fatalf("expected two built-in response tools, got %#v", opts["responses_tools"]) + } + if toolsRaw[0]["type"] != "web_search" { + t.Fatalf("expected web_search tool first, got %#v", toolsRaw[0]) + } + if toolsRaw[1]["type"] != "file_search" { + t.Fatalf("expected file_search tool second, got %#v", toolsRaw[1]) + } + if _, ok := opts["responses_include"]; !ok { + t.Fatalf("expected responses_include in options") + } + if _, ok := opts["responses_stream_options"]; !ok { + t.Fatalf("expected responses_stream_options in options") + } + + // keep linter happy for unused os import when build tags differ + _ = os.Getenv("CLAWGO_RESPONSES_WEB_SEARCH") +} diff --git a/pkg/channels/telegram.go b/pkg/channels/telegram.go index 6266632..fa8c33a 100644 --- a/pkg/channels/telegram.go +++ b/pkg/channels/telegram.go @@ -29,6 +29,8 @@ const ( telegramAPICallTimeout = 15 * time.Second telegramMaxConcurrentHandlers = 32 telegramStopWaitHandlersPeriod = 5 * time.Second + telegramSafeHTMLMaxRunes = 3500 + telegramStreamPreviewMaxRunes = 3200 ) type TelegramChannel struct { @@ -42,11 +44,18 @@ type TelegramChannel struct { handleSem chan struct{} handleWG sync.WaitGroup botUsername string + streamMu sync.Mutex + streamState map[string]telegramStreamState +} + +type telegramStreamState struct { + MessageID int + LastContent string } func (c *TelegramChannel) SupportsAction(action string) bool { switch strings.ToLower(strings.TrimSpace(action)) { - case "", "send", "edit", "delete", "react": + case "", "send", "edit", "delete", "react", "stream": return true default: return false @@ -67,6 +76,7 @@ func NewTelegramChannel(cfg config.TelegramConfig, bus *bus.MessageBus) (*Telegr config: cfg, chatIDs: make(map[string]int64), handleSem: make(chan struct{}, telegramMaxConcurrentHandlers), + streamState: make(map[string]telegramStreamState), }, nil } @@ -252,6 +262,7 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err if action != "send" { return c.handleAction(ctx, chatIDInt, action, msg) } + streamKey := telegramStreamKey(chatIDInt, msg.ReplyToID) htmlContent := sanitizeTelegramHTML(markdownToTelegramHTML(msg.Content)) @@ -292,6 +303,9 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err return err } } + c.streamMu.Lock() + delete(c.streamState, streamKey) + c.streamMu.Unlock() return nil } @@ -325,8 +339,14 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err return err } } + c.streamMu.Lock() + delete(c.streamState, streamKey) + c.streamMu.Unlock() return nil } + c.streamMu.Lock() + delete(c.streamState, streamKey) + c.streamMu.Unlock() return nil } @@ -730,21 +750,132 @@ func parseChatID(chatIDStr string) (int64, error) { return id, err } +func telegramStreamKey(chatID int64, replyToID string) string { + return fmt.Sprintf("%d:%s", chatID, strings.TrimSpace(replyToID)) +} + +func tailRunes(s string, maxRunes int) string { + if maxRunes <= 0 { + return s + } + r := []rune(s) + if len(r) <= maxRunes { + return s + } + return "…\n" + string(r[len(r)-maxRunes:]) +} + +func clampTelegramHTML(markdown string, maxRunes int) string { + if maxRunes <= 0 { + maxRunes = telegramSafeHTMLMaxRunes + } + htmlContent := sanitizeTelegramHTML(markdownToTelegramHTML(markdown)) + if len([]rune(htmlContent)) <= maxRunes { + return htmlContent + } + chunks := splitTelegramMarkdown(markdown, maxRunes-400) + if len(chunks) == 0 { + return "" + } + return sanitizeTelegramHTML(markdownToTelegramHTML(chunks[0])) +} + +func (c *TelegramChannel) handleStreamAction(ctx context.Context, chatID int64, msg bus.OutboundMessage) error { + streamKey := telegramStreamKey(chatID, msg.ReplyToID) + content := tailRunes(msg.Content, telegramStreamPreviewMaxRunes) + htmlContent := clampTelegramHTML(content, telegramSafeHTMLMaxRunes) + if strings.TrimSpace(htmlContent) == "" { + return nil + } + + c.streamMu.Lock() + state := c.streamState[streamKey] + if state.LastContent == htmlContent { + c.streamMu.Unlock() + return nil + } + c.streamMu.Unlock() + + if state.MessageID == 0 { + sendParams := telegoutil.Message(telegoutil.ID(chatID), htmlContent).WithParseMode(telego.ModeHTML) + if replyID, ok := parseTelegramMessageID(msg.ReplyToID); ok { + sendParams.ReplyParameters = &telego.ReplyParameters{MessageID: replyID} + } + sendCtx, cancel := withTelegramAPITimeout(ctx) + sent, err := c.bot.SendMessage(sendCtx, sendParams) + cancel() + if err != nil { + plain := tailRunes(plainTextFromTelegramHTML(htmlContent), telegramSafeHTMLMaxRunes) + if strings.TrimSpace(plain) == "" { + return err + } + sendPlainCtx, cancelPlain := withTelegramAPITimeout(ctx) + sent, err = c.bot.SendMessage(sendPlainCtx, telegoutil.Message(telegoutil.ID(chatID), plain)) + cancelPlain() + if err != nil { + return err + } + } + c.streamMu.Lock() + c.streamState[streamKey] = telegramStreamState{MessageID: sent.MessageID, LastContent: htmlContent} + c.streamMu.Unlock() + return nil + } + + editCtx, cancel := withTelegramAPITimeout(ctx) + _, err := c.bot.EditMessageText(editCtx, &telego.EditMessageTextParams{ + ChatID: telegoutil.ID(chatID), + MessageID: state.MessageID, + Text: htmlContent, + ParseMode: telego.ModeHTML, + }) + cancel() + if err != nil { + errText := strings.ToLower(err.Error()) + if strings.Contains(errText, "message is not modified") { + return nil + } + if strings.Contains(errText, "message is too long") || strings.Contains(errText, "can't parse entities") { + plain := tailRunes(plainTextFromTelegramHTML(htmlContent), telegramSafeHTMLMaxRunes) + if strings.TrimSpace(plain) == "" { + return err + } + editPlainCtx, cancelPlain := withTelegramAPITimeout(ctx) + _, err = c.bot.EditMessageText(editPlainCtx, &telego.EditMessageTextParams{ + ChatID: telegoutil.ID(chatID), + MessageID: state.MessageID, + Text: plain, + }) + cancelPlain() + if err != nil && !strings.Contains(strings.ToLower(err.Error()), "message is not modified") { + return err + } + } else { + return err + } + } + + c.streamMu.Lock() + state.LastContent = htmlContent + c.streamState[streamKey] = state + c.streamMu.Unlock() + return nil +} + func (c *TelegramChannel) handleAction(ctx context.Context, chatID int64, action string, msg bus.OutboundMessage) error { messageID, ok := parseTelegramMessageID(msg.MessageID) - if !ok && action != "send" { + if !ok && action != "send" && action != "stream" { return fmt.Errorf("message_id required for action=%s", action) } switch action { case "edit": - htmlContent := sanitizeTelegramHTML(markdownToTelegramHTML(msg.Content)) - if len([]rune(htmlContent)) > 3500 { - htmlContent = sanitizeTelegramHTML(markdownToTelegramHTML(splitTelegramMarkdown(msg.Content, 3000)[0])) - } + htmlContent := clampTelegramHTML(msg.Content, telegramSafeHTMLMaxRunes) editCtx, cancel := withTelegramAPITimeout(ctx) defer cancel() _, err := c.bot.EditMessageText(editCtx, &telego.EditMessageTextParams{ChatID: telegoutil.ID(chatID), MessageID: messageID, Text: htmlContent, ParseMode: telego.ModeHTML}) return err + case "stream": + return c.handleStreamAction(ctx, chatID, msg) case "delete": delCtx, cancel := withTelegramAPITimeout(ctx) defer cancel() diff --git a/pkg/config/config.go b/pkg/config/config.go index b4ceaf6..d7c4fde 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -45,26 +45,26 @@ type AgentDefaults struct { } type AutonomyConfig struct { - Enabled bool `json:"enabled" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_ENABLED"` - TickIntervalSec int `json:"tick_interval_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_TICK_INTERVAL_SEC"` - MinRunIntervalSec int `json:"min_run_interval_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MIN_RUN_INTERVAL_SEC"` - MaxPendingDurationSec int `json:"max_pending_duration_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_PENDING_DURATION_SEC"` - MaxConsecutiveStalls int `json:"max_consecutive_stalls" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_CONSECUTIVE_STALLS"` - MaxDispatchPerTick int `json:"max_dispatch_per_tick" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_DISPATCH_PER_TICK"` - NotifyCooldownSec int `json:"notify_cooldown_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_NOTIFY_COOLDOWN_SEC"` - NotifySameReasonCooldownSec int `json:"notify_same_reason_cooldown_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_NOTIFY_SAME_REASON_COOLDOWN_SEC"` - QuietHours string `json:"quiet_hours" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_QUIET_HOURS"` - UserIdleResumeSec int `json:"user_idle_resume_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_USER_IDLE_RESUME_SEC"` - MaxRoundsWithoutUser int `json:"max_rounds_without_user" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_ROUNDS_WITHOUT_USER"` - TaskHistoryRetentionDays int `json:"task_history_retention_days" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_TASK_HISTORY_RETENTION_DAYS"` - WaitingResumeDebounceSec int `json:"waiting_resume_debounce_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_WAITING_RESUME_DEBOUNCE_SEC"` - IdleRoundBudgetReleaseSec int `json:"idle_round_budget_release_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_IDLE_ROUND_BUDGET_RELEASE_SEC"` - AllowedTaskKeywords []string `json:"allowed_task_keywords" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_ALLOWED_TASK_KEYWORDS"` - EKGConsecutiveErrorThreshold int `json:"ekg_consecutive_error_threshold" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_EKG_CONSECUTIVE_ERROR_THRESHOLD"` + Enabled bool `json:"enabled" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_ENABLED"` + TickIntervalSec int `json:"tick_interval_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_TICK_INTERVAL_SEC"` + MinRunIntervalSec int `json:"min_run_interval_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MIN_RUN_INTERVAL_SEC"` + MaxPendingDurationSec int `json:"max_pending_duration_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_PENDING_DURATION_SEC"` + MaxConsecutiveStalls int `json:"max_consecutive_stalls" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_CONSECUTIVE_STALLS"` + MaxDispatchPerTick int `json:"max_dispatch_per_tick" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_DISPATCH_PER_TICK"` + NotifyCooldownSec int `json:"notify_cooldown_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_NOTIFY_COOLDOWN_SEC"` + NotifySameReasonCooldownSec int `json:"notify_same_reason_cooldown_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_NOTIFY_SAME_REASON_COOLDOWN_SEC"` + QuietHours string `json:"quiet_hours" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_QUIET_HOURS"` + UserIdleResumeSec int `json:"user_idle_resume_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_USER_IDLE_RESUME_SEC"` + MaxRoundsWithoutUser int `json:"max_rounds_without_user" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_MAX_ROUNDS_WITHOUT_USER"` + TaskHistoryRetentionDays int `json:"task_history_retention_days" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_TASK_HISTORY_RETENTION_DAYS"` + WaitingResumeDebounceSec int `json:"waiting_resume_debounce_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_WAITING_RESUME_DEBOUNCE_SEC"` + IdleRoundBudgetReleaseSec int `json:"idle_round_budget_release_sec" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_IDLE_ROUND_BUDGET_RELEASE_SEC"` + AllowedTaskKeywords []string `json:"allowed_task_keywords" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_ALLOWED_TASK_KEYWORDS"` + EKGConsecutiveErrorThreshold int `json:"ekg_consecutive_error_threshold" env:"CLAWGO_AGENTS_DEFAULTS_AUTONOMY_EKG_CONSECUTIVE_ERROR_THRESHOLD"` // Deprecated: kept for backward compatibility with existing config files. - NotifyChannel string `json:"notify_channel,omitempty"` + NotifyChannel string `json:"notify_channel,omitempty"` // Deprecated: kept for backward compatibility with existing config files. - NotifyChatID string `json:"notify_chat_id,omitempty"` + NotifyChatID string `json:"notify_chat_id,omitempty"` } type AgentTextConfig struct { @@ -128,9 +128,9 @@ type ContextCompactionConfig struct { } type ChannelsConfig struct { - InboundMessageIDDedupeTTLSeconds int `json:"inbound_message_id_dedupe_ttl_seconds" env:"CLAWGO_CHANNELS_INBOUND_MESSAGE_ID_DEDUPE_TTL_SECONDS"` - InboundContentDedupeWindowSeconds int `json:"inbound_content_dedupe_window_seconds" env:"CLAWGO_CHANNELS_INBOUND_CONTENT_DEDUPE_WINDOW_SECONDS"` - OutboundDedupeWindowSeconds int `json:"outbound_dedupe_window_seconds" env:"CLAWGO_CHANNELS_OUTBOUND_DEDUPE_WINDOW_SECONDS"` + InboundMessageIDDedupeTTLSeconds int `json:"inbound_message_id_dedupe_ttl_seconds" env:"CLAWGO_CHANNELS_INBOUND_MESSAGE_ID_DEDUPE_TTL_SECONDS"` + InboundContentDedupeWindowSeconds int `json:"inbound_content_dedupe_window_seconds" env:"CLAWGO_CHANNELS_INBOUND_CONTENT_DEDUPE_WINDOW_SECONDS"` + OutboundDedupeWindowSeconds int `json:"outbound_dedupe_window_seconds" env:"CLAWGO_CHANNELS_OUTBOUND_DEDUPE_WINDOW_SECONDS"` WhatsApp WhatsAppConfig `json:"whatsapp"` Telegram TelegramConfig `json:"telegram"` Feishu FeishuConfig `json:"feishu"` @@ -156,15 +156,15 @@ type TelegramConfig struct { } type FeishuConfig struct { - Enabled bool `json:"enabled" env:"CLAWGO_CHANNELS_FEISHU_ENABLED"` - AppID string `json:"app_id" env:"CLAWGO_CHANNELS_FEISHU_APP_ID"` - AppSecret string `json:"app_secret" env:"CLAWGO_CHANNELS_FEISHU_APP_SECRET"` - EncryptKey string `json:"encrypt_key" env:"CLAWGO_CHANNELS_FEISHU_ENCRYPT_KEY"` - VerificationToken string `json:"verification_token" env:"CLAWGO_CHANNELS_FEISHU_VERIFICATION_TOKEN"` - AllowFrom []string `json:"allow_from" env:"CLAWGO_CHANNELS_FEISHU_ALLOW_FROM"` - AllowChats []string `json:"allow_chats" env:"CLAWGO_CHANNELS_FEISHU_ALLOW_CHATS"` - EnableGroups bool `json:"enable_groups" env:"CLAWGO_CHANNELS_FEISHU_ENABLE_GROUPS"` - RequireMentionInGroups bool `json:"require_mention_in_groups" env:"CLAWGO_CHANNELS_FEISHU_REQUIRE_MENTION_IN_GROUPS"` + Enabled bool `json:"enabled" env:"CLAWGO_CHANNELS_FEISHU_ENABLED"` + AppID string `json:"app_id" env:"CLAWGO_CHANNELS_FEISHU_APP_ID"` + AppSecret string `json:"app_secret" env:"CLAWGO_CHANNELS_FEISHU_APP_SECRET"` + EncryptKey string `json:"encrypt_key" env:"CLAWGO_CHANNELS_FEISHU_ENCRYPT_KEY"` + VerificationToken string `json:"verification_token" env:"CLAWGO_CHANNELS_FEISHU_VERIFICATION_TOKEN"` + AllowFrom []string `json:"allow_from" env:"CLAWGO_CHANNELS_FEISHU_ALLOW_FROM"` + AllowChats []string `json:"allow_chats" env:"CLAWGO_CHANNELS_FEISHU_ALLOW_CHATS"` + EnableGroups bool `json:"enable_groups" env:"CLAWGO_CHANNELS_FEISHU_ENABLE_GROUPS"` + RequireMentionInGroups bool `json:"require_mention_in_groups" env:"CLAWGO_CHANNELS_FEISHU_REQUIRE_MENTION_IN_GROUPS"` } type DiscordConfig struct { @@ -245,8 +245,6 @@ func (p *ProvidersConfig) UnmarshalJSON(data []byte) error { type ProviderConfig struct { APIKey string `json:"api_key" env:"CLAWGO_PROVIDERS_{{.Name}}_API_KEY"` APIBase string `json:"api_base" env:"CLAWGO_PROVIDERS_{{.Name}}_API_BASE"` - Protocol string `json:"protocol" env:"CLAWGO_PROVIDERS_{{.Name}}_PROTOCOL"` - CrossSessionCallID bool `json:"cross_session_call_id" env:"CLAWGO_PROVIDERS_{{.Name}}_CROSS_SESSION_CALL_ID"` Models []string `json:"models" env:"CLAWGO_PROVIDERS_{{.Name}}_MODELS"` SupportsResponsesCompact bool `json:"supports_responses_compact" env:"CLAWGO_PROVIDERS_{{.Name}}_SUPPORTS_RESPONSES_COMPACT"` Auth string `json:"auth" env:"CLAWGO_PROVIDERS_{{.Name}}_AUTH"` @@ -367,21 +365,21 @@ func DefaultConfig() *Config { PromptTemplate: "Read HEARTBEAT.md if it exists (workspace context). Follow it strictly. Do not infer or repeat old tasks from prior chats. If nothing needs attention, reply HEARTBEAT_OK.", }, Autonomy: AutonomyConfig{ - Enabled: false, - TickIntervalSec: 30, - MinRunIntervalSec: 20, - MaxPendingDurationSec: 180, - MaxConsecutiveStalls: 3, - MaxDispatchPerTick: 2, - NotifyCooldownSec: 300, - NotifySameReasonCooldownSec: 900, - QuietHours: "23:00-08:00", - UserIdleResumeSec: 20, - MaxRoundsWithoutUser: 12, - TaskHistoryRetentionDays: 3, - WaitingResumeDebounceSec: 5, - IdleRoundBudgetReleaseSec: 1800, - AllowedTaskKeywords: []string{}, + Enabled: false, + TickIntervalSec: 30, + MinRunIntervalSec: 20, + MaxPendingDurationSec: 180, + MaxConsecutiveStalls: 3, + MaxDispatchPerTick: 2, + NotifyCooldownSec: 300, + NotifySameReasonCooldownSec: 900, + QuietHours: "23:00-08:00", + UserIdleResumeSec: 20, + MaxRoundsWithoutUser: 12, + TaskHistoryRetentionDays: 3, + WaitingResumeDebounceSec: 5, + IdleRoundBudgetReleaseSec: 1800, + AllowedTaskKeywords: []string{}, EKGConsecutiveErrorThreshold: 3, }, Texts: AgentTextConfig{ @@ -435,7 +433,7 @@ func DefaultConfig() *Config { }, }, Channels: ChannelsConfig{ - InboundMessageIDDedupeTTLSeconds: 600, + InboundMessageIDDedupeTTLSeconds: 600, InboundContentDedupeWindowSeconds: 12, OutboundDedupeWindowSeconds: 12, WhatsApp: WhatsAppConfig{ @@ -489,7 +487,6 @@ func DefaultConfig() *Config { Providers: ProvidersConfig{ Proxy: ProviderConfig{ APIBase: "http://localhost:8080/v1", - Protocol: "chat_completions", Models: []string{"glm-4.7"}, TimeoutSec: 90, }, diff --git a/pkg/config/validate.go b/pkg/config/validate.go index ec0ff9d..8971b58 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -208,8 +208,8 @@ func Validate(cfg *Config) []error { if active == "" { active = "proxy" } - if pc, ok := providerConfigByName(cfg, active); !ok || !pc.SupportsResponsesCompact || pc.Protocol != "responses" { - errs = append(errs, fmt.Errorf("context_compaction.mode=responses_compact requires active proxy %q with protocol=responses and supports_responses_compact=true", active)) + if pc, ok := providerConfigByName(cfg, active); !ok || !pc.SupportsResponsesCompact { + errs = append(errs, fmt.Errorf("context_compaction.mode=responses_compact requires active proxy %q with supports_responses_compact=true", active)) } } @@ -314,16 +314,6 @@ func validateProviderConfig(path string, p ProviderConfig) []error { if p.APIBase == "" { errs = append(errs, fmt.Errorf("%s.api_base is required", path)) } - if p.Protocol != "" { - switch p.Protocol { - case "chat_completions", "responses": - default: - errs = append(errs, fmt.Errorf("%s.protocol must be one of: chat_completions, responses", path)) - } - } - if p.SupportsResponsesCompact && p.Protocol != "responses" { - errs = append(errs, fmt.Errorf("%s.supports_responses_compact=true requires protocol=responses", path)) - } if p.TimeoutSec <= 0 { errs = append(errs, fmt.Errorf("%s.timeout_sec must be > 0", path)) } diff --git a/pkg/providers/http_provider.go b/pkg/providers/http_provider.go index 79c0ec8..bdc31ab 100644 --- a/pkg/providers/http_provider.go +++ b/pkg/providers/http_provider.go @@ -16,31 +16,22 @@ import ( "time" ) -const ( - ProtocolChatCompletions = "chat_completions" - ProtocolResponses = "responses" -) - type HTTPProvider struct { apiKey string apiBase string - protocol string defaultModel string - crossSessionCallID bool supportsResponsesCompact bool authMode string timeout time.Duration httpClient *http.Client } -func NewHTTPProvider(apiKey, apiBase, protocol, defaultModel string, crossSessionCallID bool, supportsResponsesCompact bool, authMode string, timeout time.Duration) *HTTPProvider { +func NewHTTPProvider(apiKey, apiBase, defaultModel string, supportsResponsesCompact bool, authMode string, timeout time.Duration) *HTTPProvider { normalizedBase := normalizeAPIBase(apiBase) return &HTTPProvider{ apiKey: apiKey, apiBase: normalizedBase, - protocol: normalizeProtocol(protocol), defaultModel: strings.TrimSpace(defaultModel), - crossSessionCallID: crossSessionCallID, supportsResponsesCompact: supportsResponsesCompact, authMode: authMode, timeout: timeout, @@ -55,39 +46,24 @@ func (p *HTTPProvider) Chat(ctx context.Context, messages []Message, tools []Too logger.DebugCF("provider", logger.C0133, map[string]interface{}{ "api_base": p.apiBase, - "protocol": p.protocol, "model": model, "messages_count": len(messages), "tools_count": len(tools), "timeout": p.timeout.String(), }) - if p.protocol == ProtocolResponses { - body, statusCode, contentType, err := p.callResponses(ctx, messages, tools, model, options) - if err != nil { - return nil, err - } - if statusCode != http.StatusOK { - preview := previewResponseBody(body) - return nil, fmt.Errorf("API error (status %d, content-type %q): %s", statusCode, contentType, preview) - } - if !json.Valid(body) { - return nil, fmt.Errorf("API error (status %d, content-type %q): non-JSON response: %s", statusCode, contentType, previewResponseBody(body)) - } - return parseResponsesAPIResponse(body) - } - - body, statusCode, contentType, err := p.callChatCompletions(ctx, messages, tools, model, options) + body, statusCode, contentType, err := p.callResponses(ctx, messages, tools, model, options) if err != nil { return nil, err } if statusCode != http.StatusOK { - return nil, fmt.Errorf("API error (status %d, content-type %q): %s", statusCode, contentType, previewResponseBody(body)) + preview := previewResponseBody(body) + return nil, fmt.Errorf("API error (status %d, content-type %q): %s", statusCode, contentType, preview) } if !json.Valid(body) { return nil, fmt.Errorf("API error (status %d, content-type %q): non-JSON response: %s", statusCode, contentType, previewResponseBody(body)) } - return parseChatCompletionsResponse(body) + return parseResponsesAPIResponse(body) } func (p *HTTPProvider) ChatStream(ctx context.Context, messages []Message, tools []ToolDefinition, model string, options map[string]interface{}, onDelta func(string)) (*LLMResponse, error) { @@ -97,20 +73,7 @@ func (p *HTTPProvider) ChatStream(ctx context.Context, messages []Message, tools if p.apiBase == "" { return nil, fmt.Errorf("API base not configured") } - if p.protocol == ProtocolResponses { - body, status, ctype, err := p.callResponsesStream(ctx, messages, tools, model, options, onDelta) - if err != nil { - return nil, err - } - if status != http.StatusOK { - return nil, fmt.Errorf("API error (status %d, content-type %q): %s", status, ctype, previewResponseBody(body)) - } - if !json.Valid(body) { - return nil, fmt.Errorf("API error (status %d, content-type %q): non-JSON response: %s", status, ctype, previewResponseBody(body)) - } - return parseResponsesAPIResponse(body) - } - body, status, ctype, err := p.callChatCompletionsStream(ctx, messages, tools, model, options, onDelta) + body, status, ctype, err := p.callResponsesStream(ctx, messages, tools, model, options, onDelta) if err != nil { return nil, err } @@ -120,52 +83,29 @@ func (p *HTTPProvider) ChatStream(ctx context.Context, messages []Message, tools if !json.Valid(body) { return nil, fmt.Errorf("API error (status %d, content-type %q): non-JSON response: %s", status, ctype, previewResponseBody(body)) } - return parseChatCompletionsResponse(body) -} - -func (p *HTTPProvider) callChatCompletions(ctx context.Context, messages []Message, tools []ToolDefinition, model string, options map[string]interface{}) ([]byte, int, string, error) { - requestBody := map[string]interface{}{ - "model": model, - "messages": toChatCompletionsMessages(messages), - } - if len(tools) > 0 { - requestBody["tools"] = tools - requestBody["tool_choice"] = "auto" - } - if maxTokens, ok := int64FromOption(options, "max_tokens"); ok { - requestBody["max_tokens"] = maxTokens - } - if temperature, ok := float64FromOption(options, "temperature"); ok { - requestBody["temperature"] = temperature - } - return p.postJSON(ctx, endpointFor(p.apiBase, "/chat/completions"), requestBody) + return parseResponsesAPIResponse(body) } func (p *HTTPProvider) callResponses(ctx context.Context, messages []Message, tools []ToolDefinition, model string, options map[string]interface{}) ([]byte, int, string, error) { input := make([]map[string]interface{}, 0, len(messages)) pendingCalls := map[string]struct{}{} for _, msg := range messages { - input = append(input, toResponsesInputItemsWithState(msg, pendingCalls, p.crossSessionCallID)...) + input = append(input, toResponsesInputItemsWithState(msg, pendingCalls)...) } requestBody := map[string]interface{}{ "model": model, "input": input, } - if len(tools) > 0 { - responseTools := make([]map[string]interface{}, 0, len(tools)) - for _, t := range tools { - entry := map[string]interface{}{ - "type": "function", - "name": t.Function.Name, - "parameters": t.Function.Parameters, - } - if strings.TrimSpace(t.Function.Description) != "" { - entry["description"] = t.Function.Description - } - responseTools = append(responseTools, entry) - } + responseTools := buildResponsesTools(tools, options) + if len(responseTools) > 0 { requestBody["tools"] = responseTools requestBody["tool_choice"] = "auto" + if tc, ok := rawOption(options, "tool_choice"); ok { + requestBody["tool_choice"] = tc + } + if tc, ok := rawOption(options, "responses_tool_choice"); ok { + requestBody["tool_choice"] = tc + } } if maxTokens, ok := int64FromOption(options, "max_tokens"); ok { requestBody["max_output_tokens"] = maxTokens @@ -173,82 +113,23 @@ func (p *HTTPProvider) callResponses(ctx context.Context, messages []Message, to if temperature, ok := float64FromOption(options, "temperature"); ok { requestBody["temperature"] = temperature } + if include, ok := stringSliceOption(options, "responses_include"); ok && len(include) > 0 { + requestBody["include"] = include + } + if metadata, ok := mapOption(options, "responses_metadata"); ok && len(metadata) > 0 { + requestBody["metadata"] = metadata + } + if prevID, ok := stringOption(options, "responses_previous_response_id"); ok && prevID != "" { + requestBody["previous_response_id"] = prevID + } return p.postJSON(ctx, endpointFor(p.apiBase, "/responses"), requestBody) } -func toChatCompletionsMessages(messages []Message) []map[string]interface{} { - out := make([]map[string]interface{}, 0, len(messages)) - for _, msg := range messages { - entry := map[string]interface{}{ - "role": msg.Role, - } - content := toChatCompletionsContent(msg) - if len(content) > 0 { - entry["content"] = content - } else { - entry["content"] = msg.Content - } - - if len(msg.ToolCalls) > 0 { - entry["tool_calls"] = msg.ToolCalls - } - if strings.TrimSpace(msg.ToolCallID) != "" { - entry["tool_call_id"] = msg.ToolCallID - } - - out = append(out, entry) - } - return out -} - -func toChatCompletionsContent(msg Message) []map[string]interface{} { - if len(msg.ContentParts) == 0 { - return nil - } - content := make([]map[string]interface{}, 0, len(msg.ContentParts)) - for _, part := range msg.ContentParts { - switch strings.ToLower(strings.TrimSpace(part.Type)) { - case "input_text": - if part.Text == "" { - continue - } - content = append(content, map[string]interface{}{ - "type": "text", - "text": part.Text, - }) - case "input_image": - if part.ImageURL == "" { - continue - } - content = append(content, map[string]interface{}{ - "type": "image_url", - "image_url": map[string]interface{}{ - "url": part.ImageURL, - }, - }) - case "input_file": - fileLabel := part.Filename - if fileLabel == "" { - fileLabel = "attached file" - } - mimeType := part.MIMEType - if mimeType == "" { - mimeType = "application/octet-stream" - } - content = append(content, map[string]interface{}{ - "type": "text", - "text": fmt.Sprintf("[file attachment: %s, mime=%s]", fileLabel, mimeType), - }) - } - } - return content -} - func toResponsesInputItems(msg Message) []map[string]interface{} { - return toResponsesInputItemsWithState(msg, nil, true) + return toResponsesInputItemsWithState(msg, nil) } -func toResponsesInputItemsWithState(msg Message, pendingCalls map[string]struct{}, crossSessionCallID bool) []map[string]interface{} { +func toResponsesInputItemsWithState(msg Message, pendingCalls map[string]struct{}) []map[string]interface{} { role := strings.ToLower(strings.TrimSpace(msg.Role)) switch role { case "system", "developer", "user": @@ -304,12 +185,6 @@ func toResponsesInputItemsWithState(msg Message, pendingCalls map[string]struct{ } return items case "tool": - if !crossSessionCallID { - if strings.TrimSpace(msg.Content) == "" { - return nil - } - return []map[string]interface{}{responsesMessageItem("user", msg.Content, "input_text")} - } callID := msg.ToolCallID if callID == "" { return nil @@ -335,7 +210,7 @@ func responsesMessageContent(msg Message) []map[string]interface{} { content := make([]map[string]interface{}, 0, len(msg.ContentParts)) for _, part := range msg.ContentParts { switch strings.ToLower(strings.TrimSpace(part.Type)) { - case "input_text": + case "input_text", "text": if part.Text == "" { continue } @@ -343,31 +218,200 @@ func responsesMessageContent(msg Message) []map[string]interface{} { "type": "input_text", "text": part.Text, }) - case "input_image": - if part.ImageURL == "" { - continue - } - content = append(content, map[string]interface{}{ - "type": "input_image", - "image_url": part.ImageURL, - }) - case "input_file": - if part.FileData == "" { - continue - } + case "input_image", "image": entry := map[string]interface{}{ - "type": "input_file", - "file_data": part.FileData, + "type": "input_image", + } + if part.ImageURL != "" { + entry["image_url"] = part.ImageURL + } + if part.FileID != "" { + entry["file_id"] = part.FileID + } + if detail := strings.TrimSpace(part.Detail); detail != "" { + entry["detail"] = detail + } + if _, ok := entry["image_url"]; !ok { + if _, ok := entry["file_id"]; !ok { + continue + } + } + content = append(content, entry) + case "input_file", "file": + entry := map[string]interface{}{ + "type": "input_file", + } + if part.FileData != "" { + entry["file_data"] = part.FileData + } + if part.FileID != "" { + entry["file_id"] = part.FileID + } + if part.FileURL != "" { + entry["file_url"] = part.FileURL } if part.Filename != "" { entry["filename"] = part.Filename } + if _, ok := entry["file_data"]; !ok { + if _, ok := entry["file_id"]; !ok { + if _, ok := entry["file_url"]; !ok { + continue + } + } + } content = append(content, entry) } } return content } +func buildResponsesTools(tools []ToolDefinition, options map[string]interface{}) []map[string]interface{} { + responseTools := make([]map[string]interface{}, 0, len(tools)+2) + for _, t := range tools { + typ := strings.ToLower(strings.TrimSpace(t.Type)) + if typ == "" { + typ = "function" + } + if typ == "function" { + name := strings.TrimSpace(t.Function.Name) + if name == "" { + name = strings.TrimSpace(t.Name) + } + if name == "" { + continue + } + entry := map[string]interface{}{ + "type": "function", + "name": name, + "parameters": map[string]interface{}{}, + } + if t.Function.Parameters != nil { + entry["parameters"] = t.Function.Parameters + } else if t.Parameters != nil { + entry["parameters"] = t.Parameters + } + desc := strings.TrimSpace(t.Function.Description) + if desc == "" { + desc = strings.TrimSpace(t.Description) + } + if desc != "" { + entry["description"] = desc + } + if t.Function.Strict != nil { + entry["strict"] = *t.Function.Strict + } else if t.Strict != nil { + entry["strict"] = *t.Strict + } + responseTools = append(responseTools, entry) + continue + } + + // Built-in tool types (web_search, file_search, code_interpreter, etc.). + entry := map[string]interface{}{ + "type": typ, + } + if name := strings.TrimSpace(t.Name); name != "" { + entry["name"] = name + } + if desc := strings.TrimSpace(t.Description); desc != "" { + entry["description"] = desc + } + if t.Strict != nil { + entry["strict"] = *t.Strict + } + for k, v := range t.Parameters { + entry[k] = v + } + responseTools = append(responseTools, entry) + } + + if extraTools, ok := mapSliceOption(options, "responses_tools"); ok { + responseTools = append(responseTools, extraTools...) + } + return responseTools +} + +func rawOption(options map[string]interface{}, key string) (interface{}, bool) { + if options == nil { + return nil, false + } + v, ok := options[key] + if !ok || v == nil { + return nil, false + } + return v, true +} + +func stringOption(options map[string]interface{}, key string) (string, bool) { + v, ok := rawOption(options, key) + if !ok { + return "", false + } + s, ok := v.(string) + if !ok { + return "", false + } + return strings.TrimSpace(s), true +} + +func mapOption(options map[string]interface{}, key string) (map[string]interface{}, bool) { + v, ok := rawOption(options, key) + if !ok { + return nil, false + } + m, ok := v.(map[string]interface{}) + return m, ok +} + +func stringSliceOption(options map[string]interface{}, key string) ([]string, bool) { + v, ok := rawOption(options, key) + if !ok { + return nil, false + } + switch t := v.(type) { + case []string: + out := make([]string, 0, len(t)) + for _, item := range t { + if s := strings.TrimSpace(item); s != "" { + out = append(out, s) + } + } + return out, true + case []interface{}: + out := make([]string, 0, len(t)) + for _, item := range t { + s := strings.TrimSpace(fmt.Sprintf("%v", item)) + if s != "" { + out = append(out, s) + } + } + return out, true + } + return nil, false +} + +func mapSliceOption(options map[string]interface{}, key string) ([]map[string]interface{}, bool) { + v, ok := rawOption(options, key) + if !ok { + return nil, false + } + switch t := v.(type) { + case []map[string]interface{}: + return t, true + case []interface{}: + out := make([]map[string]interface{}, 0, len(t)) + for _, item := range t { + m, ok := item.(map[string]interface{}) + if ok { + out = append(out, m) + } + } + return out, true + } + return nil, false +} + func responsesMessageItem(role, text, contentType string) map[string]interface{} { ct := contentType if ct == "" { @@ -385,79 +429,27 @@ func responsesMessageItem(role, text, contentType string) map[string]interface{} } } -func (p *HTTPProvider) callChatCompletionsStream(ctx context.Context, messages []Message, tools []ToolDefinition, model string, options map[string]interface{}, onDelta func(string)) ([]byte, int, string, error) { - requestBody := map[string]interface{}{ - "model": model, - "messages": toChatCompletionsMessages(messages), - "stream": true, - } - if len(tools) > 0 { - requestBody["tools"] = tools - requestBody["tool_choice"] = "auto" - } - if maxTokens, ok := int64FromOption(options, "max_tokens"); ok { - requestBody["max_tokens"] = maxTokens - } - if temperature, ok := float64FromOption(options, "temperature"); ok { - requestBody["temperature"] = temperature - } - var fullText strings.Builder - rawBody, status, ctype, err := p.postJSONStream(ctx, endpointFor(p.apiBase, "/chat/completions"), requestBody, func(event string) { - var chunk struct { - Choices []struct { - Delta struct { - Content string `json:"content"` - } `json:"delta"` - } `json:"choices"` - } - if err := json.Unmarshal([]byte(event), &chunk); err != nil { - return - } - if len(chunk.Choices) > 0 { - d := chunk.Choices[0].Delta.Content - if d != "" { - fullText.WriteString(d) - onDelta(d) - } - } - }) - if err != nil { - return nil, status, ctype, err - } - if status != http.StatusOK || !strings.Contains(strings.ToLower(ctype), "text/event-stream") { - return rawBody, status, ctype, nil - } - body, _ := json.Marshal(map[string]interface{}{ - "choices": []map[string]interface{}{{ - "message": map[string]interface{}{"content": fullText.String()}, - "finish_reason": "stop", - }}, - }) - return body, status, "application/json", nil -} - func (p *HTTPProvider) callResponsesStream(ctx context.Context, messages []Message, tools []ToolDefinition, model string, options map[string]interface{}, onDelta func(string)) ([]byte, int, string, error) { input := make([]map[string]interface{}, 0, len(messages)) pendingCalls := map[string]struct{}{} for _, msg := range messages { - input = append(input, toResponsesInputItemsWithState(msg, pendingCalls, p.crossSessionCallID)...) + input = append(input, toResponsesInputItemsWithState(msg, pendingCalls)...) } requestBody := map[string]interface{}{ "model": model, "input": input, "stream": true, } - if len(tools) > 0 { - responseTools := make([]map[string]interface{}, 0, len(tools)) - for _, t := range tools { - entry := map[string]interface{}{"type": "function", "name": t.Function.Name, "parameters": t.Function.Parameters} - if strings.TrimSpace(t.Function.Description) != "" { - entry["description"] = t.Function.Description - } - responseTools = append(responseTools, entry) - } + responseTools := buildResponsesTools(tools, options) + if len(responseTools) > 0 { requestBody["tools"] = responseTools requestBody["tool_choice"] = "auto" + if tc, ok := rawOption(options, "tool_choice"); ok { + requestBody["tool_choice"] = tc + } + if tc, ok := rawOption(options, "responses_tool_choice"); ok { + requestBody["tool_choice"] = tc + } } if maxTokens, ok := int64FromOption(options, "max_tokens"); ok { requestBody["max_output_tokens"] = maxTokens @@ -465,6 +457,12 @@ func (p *HTTPProvider) callResponsesStream(ctx context.Context, messages []Messa if temperature, ok := float64FromOption(options, "temperature"); ok { requestBody["temperature"] = temperature } + if include, ok := stringSliceOption(options, "responses_include"); ok && len(include) > 0 { + requestBody["include"] = include + } + if streamOpts, ok := mapOption(options, "responses_stream_options"); ok && len(streamOpts) > 0 { + requestBody["stream_options"] = streamOpts + } return p.postJSONStream(ctx, endpointFor(p.apiBase, "/responses"), requestBody, func(event string) { var obj map[string]interface{} if err := json.Unmarshal([]byte(event), &obj); err != nil { @@ -600,71 +598,6 @@ func (p *HTTPProvider) postJSON(ctx context.Context, endpoint string, payload in return body, resp.StatusCode, strings.TrimSpace(resp.Header.Get("Content-Type")), nil } -func parseChatCompletionsResponse(body []byte) (*LLMResponse, error) { - var apiResponse struct { - Choices []struct { - Message struct { - Content *string `json:"content"` - ToolCalls []struct { - ID string `json:"id"` - Type string `json:"type"` - Function *struct { - Name string `json:"name"` - Arguments string `json:"arguments"` - } `json:"function"` - } `json:"tool_calls"` - } `json:"message"` - FinishReason string `json:"finish_reason"` - } `json:"choices"` - Usage *UsageInfo `json:"usage"` - } - - if err := json.Unmarshal(body, &apiResponse); err != nil { - return nil, fmt.Errorf("failed to unmarshal response: %w", err) - } - if len(apiResponse.Choices) == 0 { - return &LLMResponse{Content: "", FinishReason: "stop"}, nil - } - choice := apiResponse.Choices[0] - toolCalls := make([]ToolCall, 0, len(choice.Message.ToolCalls)) - for i, tc := range choice.Message.ToolCalls { - if tc.Type != "" && tc.Type != "function" { - continue - } - if tc.Function == nil || strings.TrimSpace(tc.Function.Name) == "" { - continue - } - args := map[string]interface{}{} - if strings.TrimSpace(tc.Function.Arguments) != "" { - if err := json.Unmarshal([]byte(tc.Function.Arguments), &args); err != nil { - args["raw"] = tc.Function.Arguments - } - } - id := strings.TrimSpace(tc.ID) - if id == "" { - id = fmt.Sprintf("call_%d", i+1) - } - toolCalls = append(toolCalls, ToolCall{ID: id, Name: tc.Function.Name, Arguments: args}) - } - - content := "" - if choice.Message.Content != nil { - content = *choice.Message.Content - } - if len(toolCalls) == 0 { - compatCalls, cleanedContent := parseCompatFunctionCalls(content) - if len(compatCalls) > 0 { - toolCalls = compatCalls - content = cleanedContent - } - } - finishReason := strings.TrimSpace(choice.FinishReason) - if finishReason == "" { - finishReason = "stop" - } - return &LLMResponse{Content: content, ToolCalls: toolCalls, FinishReason: finishReason, Usage: apiResponse.Usage}, nil -} - func parseResponsesAPIResponse(body []byte) (*LLMResponse, error) { var resp struct { Status string `json:"status"` @@ -833,17 +766,6 @@ func endpointFor(base, relative string) string { return b + relative } -func normalizeProtocol(raw string) string { - switch strings.TrimSpace(raw) { - case "", ProtocolChatCompletions: - return ProtocolChatCompletions - case ProtocolResponses: - return ProtocolResponses - default: - return ProtocolChatCompletions - } -} - func parseCompatFunctionCalls(content string) ([]ToolCall, string) { if strings.TrimSpace(content) == "" || !strings.Contains(content, "") { return nil, content @@ -910,7 +832,7 @@ func (p *HTTPProvider) GetDefaultModel() string { } func (p *HTTPProvider) SupportsResponsesCompact() bool { - return p != nil && p.supportsResponsesCompact && p.protocol == ProtocolResponses + return p != nil && p.supportsResponsesCompact } func (p *HTTPProvider) BuildSummaryViaResponsesCompact(ctx context.Context, model string, existingSummary string, messages []Message, maxSummaryChars int) (string, error) { @@ -923,7 +845,7 @@ func (p *HTTPProvider) BuildSummaryViaResponsesCompact(ctx context.Context, mode } pendingCalls := map[string]struct{}{} for _, msg := range messages { - input = append(input, toResponsesInputItemsWithState(msg, pendingCalls, p.crossSessionCallID)...) + input = append(input, toResponsesInputItemsWithState(msg, pendingCalls)...) } if len(input) == 0 { return strings.TrimSpace(existingSummary), nil @@ -1030,7 +952,7 @@ func CreateProviderByName(cfg *config.Config, name string) (LLMProvider, error) if len(pc.Models) > 0 { defaultModel = pc.Models[0] } - return NewHTTPProvider(pc.APIKey, pc.APIBase, pc.Protocol, defaultModel, pc.CrossSessionCallID, pc.SupportsResponsesCompact, pc.Auth, time.Duration(pc.TimeoutSec)*time.Second), nil + return NewHTTPProvider(pc.APIKey, pc.APIBase, defaultModel, pc.SupportsResponsesCompact, pc.Auth, time.Duration(pc.TimeoutSec)*time.Second), nil } func CreateProviders(cfg *config.Config) (map[string]LLMProvider, error) { @@ -1072,7 +994,7 @@ func ProviderSupportsResponsesCompact(cfg *config.Config, name string) bool { if err != nil { return false } - return pc.SupportsResponsesCompact && normalizeProtocol(pc.Protocol) == ProtocolResponses + return pc.SupportsResponsesCompact } func ListProviderNames(cfg *config.Config) []string { diff --git a/pkg/providers/http_provider_responses_tools_test.go b/pkg/providers/http_provider_responses_tools_test.go new file mode 100644 index 0000000..7c0a999 --- /dev/null +++ b/pkg/providers/http_provider_responses_tools_test.go @@ -0,0 +1,62 @@ +package providers + +import "testing" + +func TestBuildResponsesTools_IncludesFunctionAndBuiltinTools(t *testing.T) { + tools := []ToolDefinition{ + { + Type: "function", + Function: ToolFunctionDefinition{ + Name: "read_file", + Parameters: map[string]interface{}{"type": "object"}, + }, + }, + { + Type: "web_search", + Parameters: map[string]interface{}{"search_context_size": "high"}, + }, + } + options := map[string]interface{}{ + "responses_tools": []interface{}{ + map[string]interface{}{ + "type": "file_search", + "vector_store_ids": []string{"vs_123"}, + }, + }, + } + + got := buildResponsesTools(tools, options) + if len(got) != 3 { + t.Fatalf("expected 3 tools, got %#v", got) + } + if got[0]["type"] != "function" || got[0]["name"] != "read_file" { + t.Fatalf("expected function tool in first slot, got %#v", got[0]) + } + if got[1]["type"] != "web_search" { + t.Fatalf("expected web_search tool in second slot, got %#v", got[1]) + } + if got[2]["type"] != "file_search" { + t.Fatalf("expected file_search tool from options, got %#v", got[2]) + } +} + +func TestResponsesMessageContent_SupportsImageAndFileByID(t *testing.T) { + msg := Message{ + Role: "user", + ContentParts: []MessageContentPart{ + {Type: "input_image", FileID: "file_img_1", Detail: "high"}, + {Type: "input_file", FileID: "file_doc_1", Filename: "doc.pdf"}, + }, + } + + content := responsesMessageContent(msg) + if len(content) != 2 { + t.Fatalf("expected two content items, got %#v", content) + } + if content[0]["type"] != "input_image" || content[0]["file_id"] != "file_img_1" { + t.Fatalf("expected input_image by file_id, got %#v", content[0]) + } + if content[1]["type"] != "input_file" || content[1]["file_id"] != "file_doc_1" { + t.Fatalf("expected input_file by file_id, got %#v", content[1]) + } +} diff --git a/pkg/providers/http_provider_toolcall_pairing_test.go b/pkg/providers/http_provider_toolcall_pairing_test.go index 52515eb..299e6c7 100644 --- a/pkg/providers/http_provider_toolcall_pairing_test.go +++ b/pkg/providers/http_provider_toolcall_pairing_test.go @@ -6,7 +6,7 @@ func TestToResponsesInputItemsWithState_DropsOrphanToolOutputs(t *testing.T) { pending := map[string]struct{}{} orphan := Message{Role: "tool", ToolCallID: "call-orphan", Content: "orphan output"} - if got := toResponsesInputItemsWithState(orphan, pending, true); len(got) != 0 { + if got := toResponsesInputItemsWithState(orphan, pending); len(got) != 0 { t.Fatalf("expected orphan tool output to be dropped, got: %#v", got) } @@ -20,7 +20,7 @@ func TestToResponsesInputItemsWithState_DropsOrphanToolOutputs(t *testing.T) { }, }}, } - items := toResponsesInputItemsWithState(assistant, pending, true) + items := toResponsesInputItemsWithState(assistant, pending) if len(items) == 0 { t.Fatalf("assistant tool call should produce responses items") } @@ -29,7 +29,7 @@ func TestToResponsesInputItemsWithState_DropsOrphanToolOutputs(t *testing.T) { } matched := Message{Role: "tool", ToolCallID: "call-1", Content: "file content"} - matchedItems := toResponsesInputItemsWithState(matched, pending, true) + matchedItems := toResponsesInputItemsWithState(matched, pending) if len(matchedItems) != 1 { t.Fatalf("expected matched tool output item, got %#v", matchedItems) } @@ -40,24 +40,3 @@ func TestToResponsesInputItemsWithState_DropsOrphanToolOutputs(t *testing.T) { t.Fatalf("matched tool output should clear pending call id") } } - -func TestToResponsesInputItemsWithState_ToolResultAsUserInputWhenCallIDDisabled(t *testing.T) { - pending := map[string]struct{}{ - "call-1": {}, - } - msg := Message{Role: "tool", ToolCallID: "call-1", Content: "file content"} - - items := toResponsesInputItemsWithState(msg, pending, false) - if len(items) != 1 { - t.Fatalf("expected one fallback tool result item, got %#v", items) - } - if items[0]["type"] != "message" { - t.Fatalf("expected message item, got %#v", items[0]) - } - if items[0]["role"] != "user" { - t.Fatalf("expected fallback role=user, got %#v", items[0]) - } - if _, ok := pending["call-1"]; !ok { - t.Fatalf("pending call state should remain untouched when call_id mode is disabled") - } -} diff --git a/pkg/providers/types.go b/pkg/providers/types.go index 16a36db..d5fe2c8 100644 --- a/pkg/providers/types.go +++ b/pkg/providers/types.go @@ -40,9 +40,12 @@ type MessageContentPart struct { Type string `json:"type"` Text string `json:"text,omitempty"` ImageURL string `json:"image_url,omitempty"` + Detail string `json:"detail,omitempty"` MIMEType string `json:"mime_type,omitempty"` Filename string `json:"filename,omitempty"` FileData string `json:"file_data,omitempty"` + FileID string `json:"file_id,omitempty"` + FileURL string `json:"file_url,omitempty"` } type LLMProvider interface { @@ -63,12 +66,17 @@ type ResponsesCompactor interface { } type ToolDefinition struct { - Type string `json:"type"` - Function ToolFunctionDefinition `json:"function"` + Type string `json:"type"` + Name string `json:"name,omitempty"` + Description string `json:"description,omitempty"` + Parameters map[string]interface{} `json:"parameters,omitempty"` + Strict *bool `json:"strict,omitempty"` + Function ToolFunctionDefinition `json:"function"` } type ToolFunctionDefinition struct { Name string `json:"name"` Description string `json:"description"` Parameters map[string]interface{} `json:"parameters"` + Strict *bool `json:"strict,omitempty"` } diff --git a/webui/src/pages/Config.tsx b/webui/src/pages/Config.tsx index d923249..dc228ca 100644 --- a/webui/src/pages/Config.tsx +++ b/webui/src/pages/Config.tsx @@ -117,8 +117,6 @@ const Config: React.FC = () => { next.providers.proxies[name] = { api_key: '', api_base: '', - protocol: 'responses', - cross_session_call_id: false, models: [], supports_responses_compact: false, auth: 'bearer', @@ -216,11 +214,10 @@ const Config: React.FC = () => {
{Object.entries(((cfg as any)?.providers?.proxies || {}) as Record).map(([name, p]) => ( -
+
{name}
updateProxyField(name, 'api_base', e.target.value)} placeholder={t('configLabels.api_base')} className="md:col-span-2 px-2 py-1 rounded bg-zinc-950 border border-zinc-800" /> updateProxyField(name, 'api_key', e.target.value)} placeholder={t('configLabels.api_key')} className="md:col-span-2 px-2 py-1 rounded bg-zinc-950 border border-zinc-800" /> - updateProxyField(name, 'protocol', e.target.value)} placeholder={t('configLabels.protocol')} className="md:col-span-1 px-2 py-1 rounded bg-zinc-950 border border-zinc-800" /> updateProxyField(name, 'models', e.target.value.split(',').map(s=>s.trim()).filter(Boolean))} placeholder={`${t('configLabels.models')}${t('configCommaSeparatedHint')}`} className="md:col-span-1 px-2 py-1 rounded bg-zinc-950 border border-zinc-800" />