{"$schema":"https://json-schema.org/draft/2020-12/schema","type":"object","properties":{"text":{"description":"Text to convert to speech. Maximum 15,000 characters. Supports inline speech tags: [pause], [laugh], <whisper>…</whisper>, etc. Required for REST mode; mutually exclusive with websocket.","type":"string","minLength":1,"maxLength":15000},"language":{"description":"BCP-47 language code (e.g. \"en\", \"zh\", \"pt-BR\") or \"auto\" for automatic language detection. Required for both REST and WebSocket modes. Supported codes: auto, en, ar-EG, ar-SA, ar-AE, bn, zh, fr, de, hi, id, it, ja, ko, pt-BR, pt-PT, ru, es-MX, es-ES, tr, vi.","type":"string"},"websocket":{"description":"Enable WebSocket streaming for text-to-speech. When true, establishes a bidirectional WebSocket connection. Mutually exclusive with text.","type":"boolean"},"voice_id":{"description":"Voice for synthesis. Defaults to \"eve\". Built-in voices: eve (energetic), ara (warm), rex (confident), sal (balanced), leo (authoritative). Custom voice IDs from /v1/tts/voices are also accepted. Case-insensitive — \"Eve\", \"EVE\", and \"eve\" are equivalent.","type":"string","minLength":1},"output_format":{"description":"Output audio format. Defaults to MP3 at 24 kHz / 128 kbps when omitted.","type":"object","properties":{"codec":{"description":"Audio codec. Defaults to \"mp3\". mp3 → audio/mpeg (general use); wav → audio/wav (lossless); pcm → audio/pcm (raw 16-bit LE, real-time pipelines); mulaw/ulaw → audio/basic (G.711 μ-law, telephony); alaw → audio/alaw (G.711 A-law, telephony).","type":"string","enum":["mp3","wav","pcm","mulaw","ulaw","alaw"]},"sample_rate":{"description":"Sample rate in Hz. Defaults to 24000. Supported: 8000, 16000, 22050, 24000, 44100, 48000. Telephony codecs (mulaw/ulaw, alaw) typically use 8000.","anyOf":[{"type":"number","const":8000},{"type":"number","const":16000},{"type":"number","const":22050},{"type":"number","const":24000},{"type":"number","const":44100},{"type":"number","const":48000}]},"bit_rate":{"description":"Bit rate in bps. MP3 only. Defaults to 128000. Supported: 32000, 64000, 96000, 128000, 192000.","anyOf":[{"type":"number","const":32000},{"type":"number","const":64000},{"type":"number","const":96000},{"type":"number","const":128000},{"type":"number","const":192000}]}},"additionalProperties":false},"optimize_streaming_latency":{"description":"Latency optimization for streaming synthesis. 0 (default): no optimization, best audio quality. 1: reduced first-chunk size for lower time-to-first-audio with minor quality tradeoff.","anyOf":[{"type":"number","const":0},{"type":"number","const":1}]},"text_normalization":{"description":"When true, normalizes written-form text into spoken-form before synthesis (e.g. \"Dr.\" → \"Doctor\", \"100\" → \"one hundred\"). Defaults to false.","type":"boolean"},"speed":{"description":"Speech speed multiplier. 1.0 is normal speed. Range: 0.7 to 1.5. Defaults to 1.0. Only used in WebSocket mode.","type":"number","minimum":0.7,"maximum":1.5}},"required":["language"],"additionalProperties":false}