Listening...
Listening...
"use client"
import { useRef, useState } from "react"
import { toast } from "sonner"
import { getScribeToken } from "@/registry/elevenlabs-ui/blocks/realtime-transcriber-01/actions/get-scribe-token"
import { Input } from "@/components/ui/input"
import {
SpeechInput,
SpeechInputCancelButton,
SpeechInputPreview,
SpeechInputRecordButton,
} from "@/components/ui/speech-input"
import { Textarea } from "@/components/ui/textarea"
/**
 * Fetches a short-lived ElevenLabs Scribe token via the server action.
 * Throws if the action reported an error; otherwise returns the token.
 */
async function getToken() {
  const { token, error } = await getScribeToken()
  if (error) {
    throw new Error(error)
  }
  // A successful result always carries a token.
  return token!
}
/**
 * Textarea with a speech-input control anchored in its bottom-right corner.
 * While recording, the live transcript is appended to whatever text was
 * present when recording started; cancelling restores that original text.
 */
function TextareaWithSpeechInputRight() {
  const [value, setValue] = useState("")
  // Snapshot of the textarea content at the moment recording began, so the
  // transcript can be appended without clobbering the user's earlier edits.
  const valueAtStartRef = useRef("")
  return (
    <div className="relative">
      <Textarea
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Jot down some thoughts..."
        className="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div className="absolute right-3 bottom-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          getToken={getToken}
          onStart={() => {
            valueAtStartRef.current = value
          }}
          onChange={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onStop={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onCancel={() => {
            // Discard the in-progress transcript, restoring the snapshot.
            setValue(valueAtStartRef.current)
          }}
          onError={(error) => {
            // onError receives Error | Event; String(event) would toast
            // "[object Event]", so surface a readable message instead.
            toast.error(
              error instanceof Error ? error.message : "Speech input error"
            )
          }}
        >
          <SpeechInputCancelButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputRecordButton />
        </SpeechInput>
      </div>
    </div>
  )
}
/**
 * Textarea with a speech-input control anchored in its bottom-left corner.
 * Child order (record button first) drives the component's reversed layout.
 * Transcript handling mirrors the right-aligned variant: append while
 * recording, restore the pre-recording text on cancel.
 */
function TextareaWithSpeechInputLeft() {
  const [value, setValue] = useState("")
  // Snapshot of the textarea content when recording started, used as the
  // base that live transcript text is appended to.
  const valueAtStartRef = useRef("")
  return (
    <div className="relative">
      <Textarea
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Jot down some thoughts..."
        className="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div className="absolute bottom-3 left-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          getToken={getToken}
          onStart={() => {
            valueAtStartRef.current = value
          }}
          onChange={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onStop={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onCancel={() => {
            // Discard the in-progress transcript, restoring the snapshot.
            setValue(valueAtStartRef.current)
          }}
          onError={(error) => {
            // onError receives Error | Event; String(event) would toast
            // "[object Event]", so surface a readable message instead.
            toast.error(
              error instanceof Error ? error.message : "Speech input error"
            )
          }}
        >
          <SpeechInputRecordButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputCancelButton />
        </SpeechInput>
      </div>
    </div>
  )
}
/**
 * Single-line input paired with a default-size speech-input control.
 * The transcript is appended to the input's content as it arrives;
 * cancelling reverts to the text captured when recording began.
 */
function InputWithSpeechInput() {
  const [value, setValue] = useState("")
  // Input content captured at recording start; transcripts append to this.
  const valueAtStartRef = useRef("")
  return (
    <div className="flex items-center gap-2.5">
      <Input
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Give this idea a title..."
        className="min-w-0 flex-1 px-3.5 text-base transition-[flex-basis] duration-200 md:text-sm"
      />
      <SpeechInput
        getToken={getToken}
        className="shrink-0"
        onStart={() => {
          valueAtStartRef.current = value
        }}
        onChange={({ transcript }) => {
          setValue(valueAtStartRef.current + transcript)
        }}
        onStop={({ transcript }) => {
          setValue(valueAtStartRef.current + transcript)
        }}
        onCancel={() => {
          // Discard the in-progress transcript, restoring the snapshot.
          setValue(valueAtStartRef.current)
        }}
        onError={(error) => {
          // onError receives Error | Event; String(event) would toast
          // "[object Event]", so surface a readable message instead.
          toast.error(
            error instanceof Error ? error.message : "Speech input error"
          )
        }}
      >
        <SpeechInputCancelButton />
        <SpeechInputRecordButton />
      </SpeechInput>
    </div>
  )
}
/**
 * Demo entry point: stacks the three speech-input compositions
 * (textarea right-aligned, textarea left-aligned, single-line input)
 * inside a scrollable panel.
 */
export function SpeechInputDemo() {
  const panelClasses = "absolute inset-0 space-y-4 overflow-auto rounded-2xl p-10"
  return (
    <div className={panelClasses}>
      <TextareaWithSpeechInputRight />
      <TextareaWithSpeechInputLeft />
      <InputWithSpeechInput />
    </div>
  )
}
Installation
pnpm dlx @elevenlabs/cli@latest components add speech-input
Usage
import {
SpeechInput,
SpeechInputCancelButton,
SpeechInputPreview,
SpeechInputRecordButton,
} from "@/components/ui/speech-input"

Basic Usage
async function getToken() {
const response = await fetch("/api/get-scribe-token", { method: "POST" })
const json = await response.json()
return json.token
}
export default function Example() {
return (
<SpeechInput
getToken={getToken}
onChange={(data) => console.log(data.transcript)}
onStop={(data) => console.log("Final:", data.transcript)}
>
<SpeechInputRecordButton />
<SpeechInputPreview placeholder="Start speaking..." />
<SpeechInputCancelButton />
</SpeechInput>
)
}

With Form Input
import { useState } from "react"
export default function Example() {
const [value, setValue] = useState("")
return (
<div className="flex items-center gap-2">
<input
value={value}
onChange={(e) => setValue(e.target.value)}
className="flex-1 rounded border px-3 py-2"
/>
<SpeechInput
getToken={getToken}
onStop={(data) => setValue((prev) => prev + " " + data.transcript)}
>
<SpeechInputRecordButton />
<SpeechInputPreview />
<SpeechInputCancelButton />
</SpeechInput>
</div>
)
}

Reversed Layout
The component automatically adjusts its layout based on child order:
<SpeechInput getToken={getToken}>
<SpeechInputCancelButton />
<SpeechInputPreview />
<SpeechInputRecordButton />
</SpeechInput>

Minimal (Record Button Only)
<SpeechInput
getToken={getToken}
onStop={(data) => console.log(data.transcript)}
>
<SpeechInputRecordButton />
</SpeechInput>

Custom Placeholder
<SpeechInput getToken={getToken}>
<SpeechInputRecordButton />
<SpeechInputPreview placeholder="Say something..." />
<SpeechInputCancelButton />
</SpeechInput>

Using the Hook
Access the speech input context in child components:
import { useSpeechInput } from "@/components/ui/speech-input"
function TranscriptDisplay() {
const { transcript, isConnected, isConnecting } = useSpeechInput()
return (
<div>
<p>
Status:{" "}
{isConnecting ? "Connecting" : isConnected ? "Recording" : "Idle"}
</p>
<p>Transcript: {transcript}</p>
</div>
)
}

Server Action for Token
Create a server action to securely fetch the Scribe token:
"use server"
export async function getScribeToken() {
const response = await fetch(
"https://api.elevenlabs.io/v1/speech-to-text/get-realtime-token",
{
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": process.env.ELEVENLABS_API_KEY!,
},
body: JSON.stringify({
model_id: "scribe_v2_realtime",
ttl_secs: 300,
}),
}
)
const data = await response.json()
return data.token
}

API Reference
SpeechInput
The root component that manages speech-to-text state and provides context to child components.
Props
| Prop | Type | Default | Description |
|---|---|---|---|
| children | ReactNode | - | Child components (record button, preview, cancel) |
| getToken | () => Promise<string> | - | Function to fetch ElevenLabs Scribe token |
| onChange | (data: SpeechInputData) => void | - | Called when transcript changes |
| onStart | (data: SpeechInputData) => void | - | Called when recording starts |
| onStop | (data: SpeechInputData) => void | - | Called when recording stops |
| onCancel | (data: SpeechInputData) => void | - | Called when recording is cancelled |
| onError | (error: Error \| Event) => void | - | Called on connection errors |
| onAuthError | (data: { error: string }) => void | - | Called on authentication errors |
| onQuotaExceededError | (data: { error: string }) => void | - | Called when quota is exceeded |
| modelId | string | "scribe_v2_realtime" | ElevenLabs model ID |
| baseUri | string | - | Custom WebSocket base URI |
| commitStrategy | CommitStrategy | "vad" | How transcripts are committed ("manual" or "vad") |
| vadSilenceThresholdSecs | number | - | VAD silence threshold (0.3-3.0) |
| vadThreshold | number | - | VAD threshold (0.1-0.9) |
| minSpeechDurationMs | number | - | Minimum speech duration (50-2000ms) |
| minSilenceDurationMs | number | - | Minimum silence duration (50-2000ms) |
| languageCode | string | - | ISO-639-1/3 language code |
| microphone | MicrophoneOptions | See below | Microphone configuration |
| audioFormat | AudioFormat | - | Audio format for manual streaming |
| sampleRate | number | - | Sample rate for manual streaming |
| className | string | - | Optional CSS classes |
Default Microphone Options
{
echoCancellation: true,
noiseSuppression: true
}

SpeechInputRecordButton
Toggle button that switches between microphone icon (idle), connecting indicator, and stop icon (recording).
Props
| Prop | Type | Description |
|---|---|---|
| className | string | Optional CSS classes |
| disabled | boolean | Disable the button |
| onClick | (e: MouseEvent) => void | Additional click handler |
| ...props | ComponentPropsWithoutRef<"button"> | All button props |
SpeechInputPreview
Displays the current transcript with smooth text animations.
Props
| Prop | Type | Default | Description |
|---|---|---|---|
| placeholder | string | "Listening..." | Text shown when empty |
| className | string | - | Optional CSS classes |
| ...props | ComponentPropsWithoutRef<"div"> | - | All div props |
SpeechInputCancelButton
Button to cancel the current recording and clear the transcript.
Props
| Prop | Type | Description |
|---|---|---|
| className | string | Optional CSS classes |
| onClick | (e: MouseEvent) => void | Additional click handler |
| ...props | ComponentPropsWithoutRef<"button"> | All button props |
useSpeechInput
Hook to access speech input context from child components.
Returns
| Property | Type | Description |
|---|---|---|
| isConnected | boolean | Whether currently connected/recording |
| isConnecting | boolean | Whether connection is in progress |
| transcript | string | Full transcript (committed + partial) |
| partialTranscript | string | Current partial transcript |
| committedTranscripts | string[] | Array of committed transcripts |
| error | string \| null | Current error message |
| start | () => Promise<void> | Start recording |
| stop | () => void | Stop recording |
| cancel | () => void | Cancel and clear transcript |
SpeechInputData
Data object passed to callbacks.
interface SpeechInputData {
partialTranscript: string
committedTranscripts: string[]
transcript: string // Combined full transcript
}

CommitStrategy
enum CommitStrategy {
MANUAL = "manual",
VAD = "vad",
}

AudioFormat
enum AudioFormat {
PCM_8000 = "pcm_8000",
PCM_16000 = "pcm_16000",
PCM_22050 = "pcm_22050",
PCM_24000 = "pcm_24000",
PCM_44100 = "pcm_44100",
PCM_48000 = "pcm_48000",
ULAW_8000 = "ulaw_8000",
}

Features
- Real-time Transcription: Live speech-to-text using ElevenLabs Scribe
- Compound Components: Flexible composition with record button, preview, and cancel
- Animated Transitions: Smooth expand/collapse animations using Framer Motion
- Voice Activity Detection: Automatic transcript commits based on speech pauses
- Visual Feedback: Distinct states for idle, connecting, and recording
- Accessibility: Proper ARIA labels and keyboard interaction
Notes
- Requires an ElevenLabs API key for generating Scribe tokens
- Token generation should happen server-side to protect your API key
- The component automatically handles microphone permissions
- Uses WebSocket for real-time communication with ElevenLabs Scribe API
- VAD (Voice Activity Detection) mode automatically commits transcripts during pauses
- The preview component uses a gradient mask for text overflow
- Layout automatically adjusts based on whether the record button is first or last
On This Page
- Installation
- Usage
  - Basic Usage
  - With Form Input
  - Reversed Layout
  - Minimal (Record Button Only)
  - Custom Placeholder
  - Using the Hook
  - Server Action for Token
- API Reference
  - SpeechInput (Props, Default Microphone Options)
  - SpeechInputRecordButton (Props)
  - SpeechInputPreview (Props)
  - SpeechInputCancelButton (Props)
  - useSpeechInput (Returns)
  - SpeechInputData
  - CommitStrategy
  - AudioFormat
- Features
- Notes

Deploy and Scale Agents with ElevenLabs
ElevenLabs delivers the infrastructure and developer experience you need to ship reliable audio & agent applications at scale.
Talk to an expert