# Together AI Audio (TTS & STT)
Text-to-speech (TTS) and speech-to-text (STT) via Together AI. TTS models include Orpheus, Kokoro, Cartesia Sonic, Rime, MiniMax with REST, streaming, and WebSocket support. STT models include Whisper and Voxtral. Use when users need voice synthesis, audio generation, speech recognition, transcription, TTS, STT, or real-time voice applications.
npx skill4agent add zainhas/togetherai-skills together-audio

Endpoints:
- TTS (REST/streaming): `POST /v1/audio/speech`
- TTS (WebSocket): `wss://api.together.xyz/v1/audio/speech/websocket`
- STT: `POST /v1/audio/transcriptions`

from together import Together
client = Together()
response = client.audio.speech.create(
    model="canopylabs/orpheus-3b-0.1-ft",
    input="Today is a wonderful day to build something people love!",
    voice="tara",
    response_format="mp3",
)
response.stream_to_file("speech.mp3")

import Together from "together-ai";
import { Readable } from "stream";
import { createWriteStream } from "fs";
const together = new Together();
async function generateAudio() {
  const res = await together.audio.create({
    input: "Today is a wonderful day to build something people love!",
    voice: "tara",
    response_format: "mp3",
    sample_rate: 44100,
    stream: false,
    model: "canopylabs/orpheus-3b-0.1-ft",
  });
  if (res.body) {
    const nodeStream = Readable.from(res.body as ReadableStream);
    const fileStream = createWriteStream("./speech.mp3");
    nodeStream.pipe(fileStream);
  }
}
generateAudio();

curl -X POST "https://api.together.xyz/v1/audio/speech" \
-H "Authorization: Bearer $TOGETHER_API_KEY" \
-H "Content-Type: application/json" \
-d '{"model":"canopylabs/orpheus-3b-0.1-ft","input":"Hello world","voice":"tara","response_format":"mp3"}' \
  --output speech.mp3

response = client.audio.speech.create(
    model="canopylabs/orpheus-3b-0.1-ft",
    input="The quick brown fox jumps over the lazy dog",
    voice="tara",
    stream=True,
    response_format="raw",
    response_encoding="pcm_s16le",
)
response.stream_to_file("speech.wav", response_format="wav")

import Together from "together-ai";
const together = new Together();

async function streamAudio() {
  const response = await together.audio.speech.create({
    model: "canopylabs/orpheus-3b-0.1-ft",
    input: "The quick brown fox jumps over the lazy dog",
    voice: "tara",
    stream: true,
    response_format: "raw",
    response_encoding: "pcm_s16le",
  });
  const chunks = [];
  for await (const chunk of response) {
    chunks.push(chunk);
  }
  console.log("Streaming complete!");
}
streamAudio();

import asyncio, websockets, json, base64
async def generate_speech():
    url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_alloy"
    headers = {"Authorization": f"Bearer {api_key}"}
    async with websockets.connect(url, additional_headers=headers) as ws:
        session = json.loads(await ws.recv())
        await ws.send(json.dumps({"type": "input_text_buffer.append", "text": "Hello!"}))
        await ws.send(json.dumps({"type": "input_text_buffer.commit"}))
        audio_data = bytearray()
        async for msg in ws:
            data = json.loads(msg)
            if data["type"] == "conversation.item.audio_output.delta":
                audio_data.extend(base64.b64decode(data["delta"]))
            elif data["type"] == "conversation.item.audio_output.done":
                break

| Model | API String | Endpoints | Price |
|---|---|---|---|
| Orpheus 3B | `canopylabs/orpheus-3b-0.1-ft` | REST, Streaming, WebSocket | $15/1M chars |
| Kokoro | `hexgrad/Kokoro-82M` | REST, Streaming, WebSocket | $4/1M chars |
| Cartesia Sonic 2 | | REST | $65/1M chars |
| Cartesia Sonic | | REST | - |
| Rime Arcana v3 Turbo | | REST, Streaming, WebSocket | DE only |
| MiniMax Speech 2.6 | | REST, Streaming, WebSocket | DE only |
| Parameter | Type | Description | Default |
|---|---|---|---|
| `model` | string | TTS model (required) | - |
| `input` | string | Text to synthesize (required) | - |
| `voice` | string | Voice ID (required) | - |
| `response_format` | string | Output audio format (e.g., mp3, raw) | |
| `stream` | bool | Enable streaming (raw format only) | false |
| `response_encoding` | string | Audio encoding for raw output (e.g., pcm_s16le) | - |
| `language` | string | Language of input text: en, de, fr, es, hi, it, ja, ko, nl, pl, pt, ru, sv, tr, zh | "en" |
| `sample_rate` | int | Audio sample rate (e.g., 44100) | - |
response = client.audio.voices.list()
for model_voices in response.data:
    print(f"Model: {model_voices.model}")
    for voice in model_voices.voices:
        print(f" - {voice.name}")

Example voices — Orpheus: `tara`, `leah`, `leo`, `dan`, `mia`, `zac`. Kokoro: `af_alloy`, `af_bella`, `am_adam`, `am_echo`.

response = client.audio.transcriptions.create(
    model="openai/whisper-large-v3",
    file=open("audio.mp3", "rb"),
)
print(response.text)

import Together from "together-ai";
const together = new Together();
const transcription = await together.audio.transcriptions.create({
  file: "path/to/audio.mp3",
  model: "openai/whisper-large-v3",
  language: "en",
});
console.log(transcription.text);

curl -X POST "https://api.together.xyz/v1/audio/transcriptions" \
-H "Authorization: Bearer $TOGETHER_API_KEY" \
-F model="openai/whisper-large-v3" \
  -F file=@audio.mp3

| Model | API String |
|---|---|
| Whisper Large v3 | `openai/whisper-large-v3` |
| Voxtral Mini 3B | |