From 2e0222afa8a503abd1fe21d17be6add839e94e89 Mon Sep 17 00:00:00 2001
From: HowieDuhzit
Date: Sat, 28 Dec 2024 03:47:53 -0500
Subject: [PATCH] feat: add direct speech handling endpoint with ElevenLabs
 integration

- Implemented a new POST endpoint `/:agentId/speak` for processing user
  messages and generating speech responses.
- Integrated ElevenLabs API for text-to-speech conversion, including error
  handling for API responses.
- Enhanced message processing and memory management for agent interactions.
- Added necessary imports for `node-fetch` and `formdata-node` to support
  new functionality.
---
 packages/client-direct/src/index.ts | 159 ++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)

diff --git a/packages/client-direct/src/index.ts b/packages/client-direct/src/index.ts
index 11ee361568e..eea5d4d9aa5 100644
--- a/packages/client-direct/src/index.ts
+++ b/packages/client-direct/src/index.ts
@@ -19,6 +19,8 @@ import { settings } from "@elizaos/core";
 import { createApiRouter } from "./api.ts";
 import * as fs from "fs";
 import * as path from "path";
+import fetch from "node-fetch";
+import { FormData, Blob } from 'formdata-node';
 
 const upload = multer({ storage: multer.memoryStorage() });
 export const messageHandlerTemplate =
@@ -370,6 +372,163 @@ export class DirectClient {
             }
         );
 
+        this.app.post("/:agentId/speak", async (req, res) => {
+            const agentId = req.params.agentId;
+            const roomId = stringToUuid(
+                req.body.roomId ?? "default-room-" + agentId
+            );
+            const userId = stringToUuid(req.body.userId ?? "user");
+            const text = req.body.text;
+
+            if (!text) {
+                res.status(400).send("No text provided");
+                return;
+            }
+
+            let runtime = this.agents.get(agentId);
+
+            // if runtime is null, look for runtime with the same name
+            if (!runtime) {
+                runtime = Array.from(this.agents.values()).find(
+                    (a) =>
+                        a.character.name.toLowerCase() ===
+                        agentId.toLowerCase()
+                );
+            }
+
+            if (!runtime) {
+                res.status(404).send("Agent not found");
+                return;
+            }
+
+            try {
+                // Process message through agent (same as /message endpoint)
+                await runtime.ensureConnection(
+                    userId,
+                    roomId,
+                    req.body.userName,
+                    req.body.name,
+                    "direct"
+                );
+
+                const messageId = stringToUuid(Date.now().toString());
+
+                const content: Content = {
+                    text,
+                    attachments: [],
+                    source: "direct",
+                    inReplyTo: undefined,
+                };
+
+                const userMessage = {
+                    content,
+                    userId,
+                    roomId,
+                    agentId: runtime.agentId,
+                };
+
+                const memory: Memory = {
+                    id: messageId,
+                    agentId: runtime.agentId,
+                    userId,
+                    roomId,
+                    content,
+                    createdAt: Date.now(),
+                };
+
+                await runtime.messageManager.createMemory(memory);
+
+                const state = await runtime.composeState(userMessage, {
+                    agentName: runtime.character.name,
+                });
+
+                const context = composeContext({
+                    state,
+                    template: messageHandlerTemplate,
+                });
+
+                const response = await generateMessageResponse({
+                    runtime: runtime,
+                    context,
+                    modelClass: ModelClass.LARGE,
+                });
+
+                // save response to memory
+                const responseMessage = {
+                    ...userMessage,
+                    userId: runtime.agentId,
+                    content: response,
+                };
+
+                await runtime.messageManager.createMemory(responseMessage);
+
+                if (!response) {
+                    res.status(500).send(
+                        "No response from generateMessageResponse"
+                    );
+                    return;
+                }
+
+                let message = null as Content | null;
+
+                await runtime.evaluate(memory, state);
+
+                const _result = await runtime.processActions(
+                    memory,
+                    [responseMessage],
+                    state,
+                    async (newMessages) => {
+                        message = newMessages;
+                        return [memory];
+                    }
+                );
+
+                // Get the text to convert to speech
+                const textToSpeak = response.text;
+
+                // Convert to speech using ElevenLabs
+                const elevenLabsApiUrl = `https://api.elevenlabs.io/v1/text-to-speech/${process.env.ELEVENLABS_VOICE_ID}`;
+                const apiKey = process.env.ELEVENLABS_XI_API_KEY;
+
+                if (!apiKey) {
+                    throw new Error("ELEVENLABS_XI_API_KEY not configured");
+                }
+
+                const speechResponse = await fetch(elevenLabsApiUrl, {
+                    method: "POST",
+                    headers: {
+                        "Content-Type": "application/json",
+                        "xi-api-key": apiKey,
+                    },
+                    body: JSON.stringify({
+                        text: textToSpeak,
+                        model_id:
+                            process.env.ELEVENLABS_MODEL_ID ||
+                            "eleven_multilingual_v2",
+                        voice_settings: {
+                            stability: parseFloat(
+                                process.env.ELEVENLABS_VOICE_STABILITY || "0.5"
+                            ),
+                            similarity_boost: parseFloat(
+                                process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST ||
+                                    "0.9"
+                            ),
+                            style: parseFloat(
+                                process.env.ELEVENLABS_VOICE_STYLE || "0.66"
+                            ),
+                            use_speaker_boost:
+                                process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST ===
+                                "true",
+                        },
+                    }),
+                });
+
+                if (!speechResponse.ok) {
+                    throw new Error(
+                        `ElevenLabs API error: ${speechResponse.statusText}`
+                    );
+                }
+
+                const audioBuffer = await speechResponse.arrayBuffer();
+
+                // Set appropriate headers for audio streaming
+                res.set({
+                    'Content-Type': 'audio/mpeg',
+                    'Transfer-Encoding': 'chunked'
+                });
+
+                res.send(Buffer.from(audioBuffer));
+            } catch (error) {
+                console.error(
+                    "Error processing message or generating speech:",
+                    error
+                );
+                res.status(500).json({
+                    error: "Error processing message or generating speech",
+                    details: error.message
+                });
+            }
+        });
     }
 
     // agent/src/index.ts:startAgent calls this
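Usage sketch (illustrative, not part of the patch): the new route accepts a JSON body with text (plus optional userId and roomId) and replies with audio/mpeg bytes, as shown in the handler above. The port, agent name, and output file below are assumptions for the example; it relies on Node 18+ global fetch rather than node-fetch.

    // speak-example.ts - hypothetical client for the /:agentId/speak route
    import * as fs from "fs";

    async function requestSpeech(agentId: string, text: string): Promise<void> {
        // Port 3000 is an assumed DirectClient port; adjust to your setup.
        const res = await fetch(`http://localhost:3000/${agentId}/speak`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ text, userId: "user", roomId: "demo-room" }),
        });

        if (!res.ok) {
            throw new Error(`speak request failed: ${res.status} ${res.statusText}`);
        }

        // The endpoint returns ElevenLabs-generated MP3 audio; write it to disk.
        fs.writeFileSync("reply.mp3", Buffer.from(await res.arrayBuffer()));
    }

    requestSpeech("Eliza", "Hello from the speak endpoint").catch(console.error);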