{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "type": "object",
  "properties": {
    "file": {
      "description": "Audio file as a data URI (data:audio/...;base64,...) or an HTTPS URL the gateway fetches and uploads. Supported container formats: flac, mp3, mp4, m4a, mkv, ogg, opus, wav, aac. Raw formats (pcm, mulaw, alaw) also accepted — supply audio_format and sample_rate. Gateway-side size limit: 25 MB. Mutually exclusive with `url` and `websocket`.",
      "type": "string"
    },
    "url": {
      "description": "HTTPS URL of an audio file for xAI to fetch server-side. Mutually exclusive with `file` and `websocket`. No gateway-side size limit applies.",
      "type": "string",
      "format": "uri"
    },
    "websocket": {
      "description": "Enable WebSocket streaming for speech-to-text. When true, establishes a bidirectional WebSocket connection for real-time audio transcription. Mutually exclusive with `file` and `url`.",
      "type": "boolean"
    },
    "audio_format": {
      "description": "Format hint for raw/headerless audio. Required for pcm, mulaw, alaw. Omit for container formats (mp3, wav, etc.) — xAI auto-detects them.",
      "type": "string",
      "enum": [
        "pcm",
        "mulaw",
        "alaw"
      ]
    },
    "sample_rate": {
      "description": "Sample rate in Hz. Required when audio_format is set.",
      "type": "integer",
      "minimum": 1,
      "maximum": 9007199254740991
    },
    "language": {
      "description": "Language code (e.g. \"en\", \"fr\", \"de\"). Used with format=true to enable Inverse Text Normalization. xAI transcribes in any language regardless — supplying this enables number/currency formatting in the transcript.",
      "type": "string"
    },
    "format": {
      "description": "When true, enables Inverse Text Normalization — spoken numbers and currencies are converted to written form (e.g. \"one hundred dollars\" → \"$100\"). Requires language to be set.",
      "type": "boolean"
    },
    "diarize": {
      "description": "When true, enables speaker diarization. Each word in the response includes a `speaker` integer identifying the detected speaker.",
      "type": "boolean"
    },
    "filler_words": {
      "description": "When true, filler words (uh, um, er) are included in the transcript. Defaults to false — filler words are removed.",
      "type": "boolean"
    },
    "multichannel": {
      "description": "When true, each audio channel is transcribed independently. Results are returned in the `channels` array. Requires channels ≥ 2.",
      "type": "boolean"
    },
    "channels": {
      "description": "Number of audio channels (2–8). Required only for multichannel raw audio; auto-detected for container formats.",
      "type": "integer",
      "minimum": 2,
      "maximum": 8
    },
    "keyterm": {
      "description": "Key terms to bias transcription toward (e.g. product names, proper nouns). Each term up to 50 characters, max 100 terms. Sent as repeated form fields: keyterm=Term+One&keyterm=Term+Two.",
      "maxItems": 100,
      "type": "array",
      "items": {
        "type": "string",
        "maxLength": 50
      }
    }
  },
  "additionalProperties": false
}