From 2e0222afa8a503abd1fe21d17be6add839e94e89 Mon Sep 17 00:00:00 2001
From: HowieDuhzit
Date: Sat, 28 Dec 2024 03:47:53 -0500
Subject: [PATCH] feat: add direct speech handling endpoint with ElevenLabs
 integration

- Implemented a new POST endpoint `/:agentId/speak` for processing user
  messages and generating speech responses.
- Integrated ElevenLabs API for text-to-speech conversion, including error
  handling for API responses.
- Enhanced message processing and memory management for agent interactions.
- Added necessary imports for `node-fetch` and `formdata-node` to support
  new functionality.
---
 packages/client-direct/src/index.ts | 159 ++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)

diff --git a/packages/client-direct/src/index.ts b/packages/client-direct/src/index.ts
index 11ee361568e..eea5d4d9aa5 100644
--- a/packages/client-direct/src/index.ts
+++ b/packages/client-direct/src/index.ts
@@ -19,6 +19,8 @@ import { settings } from "@elizaos/core";
 import { createApiRouter } from "./api.ts";
 import * as fs from "fs";
 import * as path from "path";
+import fetch from "node-fetch";
+import { FormData, Blob } from 'formdata-node';
 
 const upload = multer({ storage: multer.memoryStorage() });
 export const messageHandlerTemplate =
@@ -370,6 +372,163 @@ export class DirectClient {
             }
         );
 
+        this.app.post("/:agentId/speak", async (req, res) => {
+            const agentId = req.params.agentId;
+            const roomId = stringToUuid(
+                req.body.roomId ?? "default-room-" + agentId
+            );
+            const userId = stringToUuid(req.body.userId ?? "user");
+            const text = req.body.text;
+
+            if (!text) {
+                res.status(400).send("No text provided");
+                return;
+            }
+
+            let runtime = this.agents.get(agentId);
+
+            // if runtime is null, look for runtime with the same name
+            if (!runtime) {
+                runtime = Array.from(this.agents.values()).find(
+                    (a) =>
+                        a.character.name.toLowerCase() ===
+                        agentId.toLowerCase()
+                );
+            }
+
+            if (!runtime) {
+                res.status(404).send("Agent not found");
+                return;
+            }
+
+            try {
+                // Process message through agent (same as /message endpoint)
+                await runtime.ensureConnection(
+                    userId,
+                    roomId,
+                    req.body.userName,
+                    req.body.name,
+                    "direct"
+                );
+
+                const messageId = stringToUuid(Date.now().toString());
+
+                const content: Content = {
+                    text,
+                    attachments: [],
+                    source: "direct",
+                    inReplyTo: undefined,
+                };
+
+                const userMessage = {
+                    content,
+                    userId,
+                    roomId,
+                    agentId: runtime.agentId,
+                };
+
+                const memory: Memory = {
+                    id: messageId,
+                    agentId: runtime.agentId,
+                    userId,
+                    roomId,
+                    content,
+                    createdAt: Date.now(),
+                };
+
+                await runtime.messageManager.createMemory(memory);
+
+                const state = await runtime.composeState(userMessage, {
+                    agentName: runtime.character.name,
+                });
+
+                const context = composeContext({
+                    state,
+                    template: messageHandlerTemplate,
+                });
+
+                const response = await generateMessageResponse({
+                    runtime: runtime,
+                    context,
+                    modelClass: ModelClass.LARGE,
+                });
+
+                // save response to memory
+                const responseMessage = {
+                    ...userMessage,
+                    userId: runtime.agentId,
+                    content: response,
+                };
+
+                await runtime.messageManager.createMemory(responseMessage);
+
+                if (!response) {
+                    res.status(500).send(
+                        "No response from generateMessageResponse"
+                    );
+                    return;
+                }
+
+                let message = null as Content | null;
+
+                await runtime.evaluate(memory, state);
+
+                const _result = await runtime.processActions(
+                    memory,
+                    [responseMessage],
+                    state,
+                    async (newMessages) => {
+                        message = newMessages;
+                        return [memory];
+                    }
+                );
+
+                // Get the text to convert to speech
+                const textToSpeak = response.text;
+
+                // Convert to speech using ElevenLabs
+                const elevenLabsApiUrl = `https://api.elevenlabs.io/v1/text-to-speech/${process.env.ELEVENLABS_VOICE_ID}`;
+                const apiKey = process.env.ELEVENLABS_XI_API_KEY;
+
+                if (!apiKey) {
+                    throw new Error("ELEVENLABS_XI_API_KEY not configured");
+                }
+
+                const speechResponse = await fetch(elevenLabsApiUrl, {
+                    method: "POST",
+                    headers: {
+                        "Content-Type": "application/json",
+                        "xi-api-key": apiKey,
+                    },
+                    body: JSON.stringify({
+                        text: textToSpeak,
+                        model_id:
+                            process.env.ELEVENLABS_MODEL_ID ||
+                            "eleven_multilingual_v2",
+                        voice_settings: {
+                            stability: parseFloat(
+                                process.env.ELEVENLABS_VOICE_STABILITY || "0.5"
+                            ),
+                            similarity_boost: parseFloat(
+                                process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST ||
+                                    "0.9"
+                            ),
+                            style: parseFloat(
+                                process.env.ELEVENLABS_VOICE_STYLE || "0.66"
+                            ),
+                            use_speaker_boost:
+                                process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST ===
+                                "true",
+                        },
+                    }),
+                });
+
+                if (!speechResponse.ok) {
+                    throw new Error(
+                        `ElevenLabs API error: ${speechResponse.statusText}`
+                    );
+                }
+
+                const audioBuffer = await speechResponse.arrayBuffer();
+
+                // Set appropriate headers for audio streaming
+                res.set({
+                    'Content-Type': 'audio/mpeg',
+                    'Transfer-Encoding': 'chunked'
+                });
+
+                res.send(Buffer.from(audioBuffer));
+            } catch (error) {
+                console.error(
+                    "Error processing message or generating speech:",
+                    error
+                );
+                res.status(500).json({
+                    error: "Error processing message or generating speech",
+                    details: error.message
+                });
+            }
+        });
     }
 
     // agent/src/index.ts:startAgent calls this
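Usage sketch (illustrative, not part of the patch): the new route accepts a JSON body with text (plus optional userId and roomId) and replies with audio/mpeg bytes, as shown in the handler above. The port, agent name, and output file below are assumptions for the example; it relies on Node 18+ global fetch rather than node-fetch.

    // speak-example.ts - hypothetical client for the /:agentId/speak route
    import * as fs from "fs";

    async function requestSpeech(agentId: string, text: string): Promise<void> {
        // Port 3000 is an assumed DirectClient port; adjust to your setup.
        const res = await fetch(`http://localhost:3000/${agentId}/speak`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ text, userId: "user", roomId: "demo-room" }),
        });

        if (!res.ok) {
            throw new Error(`speak request failed: ${res.status} ${res.statusText}`);
        }

        // The endpoint returns ElevenLabs-generated MP3 audio; write it to disk.
        fs.writeFileSync("reply.mp3", Buffer.from(await res.arrayBuffer()));
    }

    requestSpeech("Eliza", "Hello from the speak endpoint").catch(console.error);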