diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 72358ae72c..b67b3fa7ce 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -29,12 +29,6 @@ export default function Home() {
>
LLM Structured Output
- router.navigate('voice_chat/')}
- >
- Voice Chat
-
router.navigate('multimodal_llm/')}
diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx
deleted file mode 100644
index 23ab70bff4..0000000000
--- a/apps/llm/app/voice_chat/index.tsx
+++ /dev/null
@@ -1,311 +0,0 @@
-import { useContext, useEffect, useState } from 'react';
-import {
- Keyboard,
- KeyboardAvoidingView,
- Platform,
- StyleSheet,
- Text,
- TouchableOpacity,
- TouchableWithoutFeedback,
- View,
-} from 'react-native';
-import SWMIcon from '../../assets/icons/swm_icon.svg';
-import Spinner from '../../components/Spinner';
-import ErrorBanner from '../../components/ErrorBanner';
-import {
- useSpeechToText,
- useLLM,
- QWEN3_0_6B_QUANTIZED,
- QWEN3_1_7B_QUANTIZED,
- LLAMA3_2_1B_SPINQUANT,
- WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
- WHISPER_BASE_EN,
- WHISPER_SMALL_EN,
- LLMProps,
- SpeechToTextProps,
-} from 'react-native-executorch';
-import { ModelPicker, ModelOption } from '../../components/ModelPicker';
-import PauseIcon from '../../assets/icons/pause_icon.svg';
-import MicIcon from '../../assets/icons/mic_icon.svg';
-import StopIcon from '../../assets/icons/stop_icon.svg';
-import ColorPalette from '../../colors';
-import Messages from '../../components/Messages';
-import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-import DeviceInfo from 'react-native-device-info';
-import { useIsFocused } from '@react-navigation/native';
-import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { GeneratingContext } from '../../context';
-
-type LLMModelSources = LLMProps['model'];
-type STTModelSources = SpeechToTextProps['model'];
-
-const LLM_MODELS: ModelOption[] = [
- { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED },
- { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED },
- { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT },
-];
-
-const STT_MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
-];
-
-export default function VoiceChatScreenWrapper() {
- const isFocused = useIsFocused();
-
- return isFocused ? : null;
-}
-
-function VoiceChatScreen() {
- const { bottom } = useSafeAreaInsets();
- const [isRecording, setIsRecording] = useState(false);
- const [liveTranscription, setLiveTranscription] = useState('');
- const [selectedLLM, setSelectedLLM] =
- useState(QWEN3_0_6B_QUANTIZED);
- const [selectedSTT, setSelectedSTT] =
- useState(WHISPER_TINY_EN);
- const [error, setError] = useState(null);
-
- const [recorder] = useState(() => new AudioRecorder());
-
- const { setGlobalGenerating } = useContext(GeneratingContext);
-
- const llm = useLLM({ model: selectedLLM });
- const speechToText = useSpeechToText({
- model: selectedSTT,
- });
-
- useEffect(() => {
- setGlobalGenerating(llm.isGenerating || speechToText.isGenerating);
- }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]);
-
- useEffect(() => {
- AudioManager.setAudioSessionOptions({
- iosCategory: 'playAndRecord',
- iosMode: 'spokenAudio',
- iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
- });
- AudioManager.requestRecordingPermissions();
- }, []);
-
- const handleRecordPress = async () => {
- if (isRecording) {
- setIsRecording(false);
- recorder.stop();
- speechToText.streamStop();
- } else {
- setIsRecording(true);
- setLiveTranscription('');
-
- const sampleRate = 16000;
- recorder.onAudioReady(
- {
- sampleRate,
- bufferLength: 0.1 * sampleRate,
- channelCount: 1,
- },
- ({ buffer }) => {
- speechToText.streamInsert(buffer.getChannelData(0));
- }
- );
- recorder.start();
-
- let finalResult = '';
-
- try {
- for await (const result of speechToText.stream()) {
- const text = result.committed.text + result.nonCommitted.text;
- setLiveTranscription(text);
- finalResult = text;
- }
- } catch (e) {
- setError(e instanceof Error ? e.message : String(e));
- } finally {
- if (finalResult.trim().length > 0) {
- await llm.sendMessage(finalResult);
- setLiveTranscription('');
- }
- }
- }
- };
-
- useEffect(() => {
- if (llm.error) setError(String(llm.error));
- }, [llm.error]);
-
- useEffect(() => {
- if (speechToText.error) setError(String(speechToText.error));
- }, [speechToText.error]);
-
- return (!llm.isReady || !speechToText.isReady) &&
- !llm.error &&
- !speechToText.error ? (
-
- ) : (
-
-
-
-
- Qwen 3 x Whisper
-
- setError(null)} />
- {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? (
-
- 0
- ? [
- ...llm.messageHistory,
- {
- role: 'user',
- content: liveTranscription,
- },
- ]
- : llm.messageHistory
- }
- llmResponse={llm.response}
- isGenerating={llm.isGenerating}
- deleteMessage={llm.deleteMessage}
- />
-
- ) : (
-
- Hello! 👋
-
- Tap the mic and speak to me. I'll transcribe your voice and
- respond using a language model — all on-device.
-
-
- )}
-
- setSelectedLLM(m)}
- />
- setSelectedSTT(m)}
- />
-
-
- {DeviceInfo.isEmulatorSync() ? (
-
-
- recording disabled on emulator
-
-
- ) : (
- <>
- {llm.isGenerating ? (
-
-
-
- ) : (
-
- {isRecording ? (
-
- ) : (
-
- )}
-
- )}
- >
- )}
-
-
-
- );
-}
-
-const styles = StyleSheet.create({
- keyboardAvoidingView: {
- flex: 1,
- },
- topContainer: {
- height: 68,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- chatContainer: {
- flex: 10,
- width: '100%',
- },
- textModelName: {
- color: ColorPalette.primary,
- },
- helloMessageContainer: {
- flex: 10,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- helloText: {
- fontFamily: 'medium',
- fontSize: 30,
- color: ColorPalette.primary,
- },
- bottomHelloText: {
- fontFamily: 'regular',
- fontSize: 20,
- lineHeight: 28,
- textAlign: 'center',
- color: ColorPalette.primary,
- },
- bottomContainer: {
- height: 100,
- width: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- paddingHorizontal: 16,
- },
- recordTouchable: {
- height: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- },
- recordingInfo: {
- width: '100%',
- display: 'flex',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorBox: {
- padding: 10,
- margin: 10,
- borderWidth: 1,
- borderRadius: 8,
- borderColor: 'gray',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorWarning: {
- color: 'gray',
- fontSize: 16,
- },
-});
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index dfd39c15b4..2942d5e718 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -14,9 +14,11 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
import {
useSpeechToText,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
+ WHISPER_SMALL_EN_COREML,
TranscriptionResult,
SpeechToTextProps,
} from 'react-native-executorch';
@@ -25,10 +27,12 @@ import { ModelPicker, ModelOption } from '../components/ModelPicker';
type STTModelSources = SpeechToTextProps['model'];
const MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN },
+ { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML },
+ { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN },
+ { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML },
+ { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML },
];
import FontAwesome from '@expo/vector-icons/FontAwesome';
import {
@@ -45,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner';
const isSimulator = DeviceInfo.isEmulatorSync();
+const DEFAULT_MODEL =
+ Platform.OS === 'ios' ? WHISPER_BASE_EN_COREML : WHISPER_TINY_EN;
+
export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
const [selectedModel, setSelectedModel] =
- useState(WHISPER_TINY_EN);
+ useState(DEFAULT_MODEL);
const model = useSpeechToText({
model: selectedModel,
@@ -148,7 +155,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
recorder.current.onAudioReady(
{
sampleRate,
- bufferLength: 0.1 * sampleRate,
+ bufferLength: 0.1 * sampleRate, // 100 ms
channelCount: 1,
},
({ buffer }) => {
@@ -178,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
try {
const streamIter = model.stream({
verbose: enableTimestamps,
+ timeout: 100,
});
for await (const { committed, nonCommitted } of streamIter) {
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 089b844eb0..dc9f88179c 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -17,20 +17,31 @@ keywords:
description: "Learn how to use speech-to-text models in your React Native applications with React Native ExecuTorch's useSpeechToText hook."
---
-Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants.
+Speech to text (STT) converts spoken audio into written text. This hook allows you to implement features like voice assistants, real-time transcription, and audio file processing directly on-device.
:::info
-It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library.
+We recommend using our optimized models available on [Hugging Face](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use pre-defined [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) included in the library.
:::
## API Reference
-- For detailed API Reference for `useSpeechToText` see: [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md).
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
+- [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md)
+- [STT Models List](../../06-api-reference/index.md#models---speech-to-text)
-## High Level Overview
+## Basic Usage (File Transcription)
-You can obtain waveform from audio in any way most suitable to you, however in the snippet below we utilize [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) library to process a `.mp3` file.
+Use `transcribe` for processing pre-recorded audio or short clips. The input should be a `Float32Array` of audio samples at **16 kHz**.
+
+### Transcribe Options
+
+The `transcribe()` function accepts an optional configuration object:
+
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps).
+
+In this example, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to decode an audio file into the required format.
+
+### Example
```typescript
import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
@@ -41,181 +52,46 @@ const model = useSpeechToText({
model: WHISPER_TINY_EN,
});
+// 1. Get audio file
const { uri } = await FileSystem.downloadAsync(
'https://some-audio-url.com/file.mp3',
- FileSystem.cacheDirectory + 'audio_file'
+ `${FileSystem.cacheDirectory}audio_file`
);
+// 2. Decode to 16kHz PCM Float32Array
const audioContext = new AudioContext({ sampleRate: 16000 });
const decodedAudioData = await audioContext.decodeAudioData(uri);
const audioBuffer = decodedAudioData.getChannelData(0);
+// 3. Transcribe
try {
- const transcription = await model.transcribe(audioBuffer);
- console.log(transcription.text);
+ const result = await model.transcribe(audioBuffer);
+ console.log('Transcription:', result.text);
} catch (error) {
- console.error('Error during audio transcription', error);
+ console.error('Transcription failed:', error);
}
```
-### Streaming
-
-Since speech-to-text models can only process audio segments up to 30 seconds long, we need to split longer inputs into chunks. However, simple chunking may cut speech mid-sentence, making it harder for the model to understand. To address this, we use the [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) algorithm. While this introduces some overhead, it enables accurate processing of audio inputs of arbitrary length.
-
-### Arguments
-
-`useSpeechToText` takes [`SpeechToTextProps`](../../06-api-reference/interfaces/SpeechToTextProps.md) that consists of:
-
-- `model` of type [`SpeechToTextConfig`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md), containing the [`isMultilingual` flag](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual), [tokenizer source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#tokenizersource) and [model source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource).
-- An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model.
-
-You need more details? Check the following resources:
+## Live Streaming Transcription
-- For detailed information about `useSpeechToText` arguments check this section: [`useSpeechToText` arguments](../../06-api-reference/functions/useSpeechToText.md#parameters)
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
-- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
-
-### Returns
-
-`useSpeechToText` returns an object called `SpeechToTextType` containing bunch of functions to interact with STT.
-
-Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) functions accept [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) type as an argument. It accepts language abbreviation, you can check them out in [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) property of this config of type [`SpeechToTextLanguage`](../../06-api-reference/type-aliases/SpeechToTextLanguage.md).
-
-To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md).
-
-## Running the model
-
-Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails.
-
-### Multilingual transcription
-
-If you want to transcribe speech in languages other than English, use the multilingual version of Whisper. To generate the output in your desired language, pass the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) option to the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method.
-
-```typescript
-import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch';
+For real-time applications or audio streams of arbitrary length, use the **Streaming API**. This is optimized for live input, handling the 30-second window limitation of Whisper models automatically to ensure context isn't lost between chunks.
-const model = useSpeechToText({
- model: WHISPER_TINY,
-});
+### How it works
-const transcription = await model.transcribe(spanishAudio, { language: 'es' });
-```
+1. **Feed audio**: Use `streamInsert` to push small chunks of audio (e.g., 100ms) as they arrive from the microphone.
+2. **Get results**: The `stream` generator yields two types of text:
+ - `committed`: Finalized text that won't change.
+ - `nonCommitted`: Temporary text that might update as the model gets more context from the audio.
-### Timestamps & Transcription Stat Data
+### Streaming Options
-You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References.
+The `stream()` function accepts several optional parameters:
-```typescript
-const transcription = await model.transcribe(audioBuffer, { verbose: true });
-// Example result
-//
-// transcription: {
-// task: "transcription",
-// text: "Example text for a ...",
-// duration: 9.05,
-// language: "en",
-// segments: [
-// {
-// start: 0,
-// end: 5.4,
-// text: "Example text for",
-// words: [
-// {
-// word: "Example",
-// start: 0,
-// end: 1.4
-// },
-// ...
-// ]
-// tokens: [1, 32, 45, ...],
-// temperature: 0.0,
-// avgLogprob: -1.235,
-// compressionRatio: 1.632
-// },
-// ...
-// ]
-// }
-```
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
+- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`.
-## Example
-
-```tsx
-import React, { useState } from 'react';
-import { Button, Text, View } from 'react-native';
-import {
- useSpeechToText,
- WHISPER_TINY_EN,
- TranscriptionResult,
-} from 'react-native-executorch';
-import { AudioContext } from 'react-native-audio-api';
-import * as FileSystem from 'expo-file-system';
-
-function App() {
- const model = useSpeechToText({
- model: WHISPER_TINY_EN,
- });
-
- const [transcription, setTranscription] = useState(null);
-
- const loadAudio = async () => {
- const { uri } = await FileSystem.downloadAsync(
- 'https://some-audio-url.com/file.mp3',
- FileSystem.cacheDirectory + 'audio_file'
- );
-
- const audioContext = new AudioContext({ sampleRate: 16000 });
- const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
- const audioBuffer = decodedAudioData.getChannelData(0);
-
- return audioBuffer;
- };
-
- const handleTranscribe = async () => {
- const audio = await loadAudio();
- // Default text transcription
- const result = await model.transcribe(audio);
- setTranscription(result);
- };
-
- const handleTranscribeWithTimestamps = async () => {
- const audio = await loadAudio();
- // Transcription with timestamps
- const result = await model.transcribe(audio, { verbose: true });
- setTranscription(result);
- };
-
- // Custom logic for printing transcription
- // e.g.
-
- const renderContent = () => {
- if (!transcription) return Press a button to transcribe;
-
- if (transcription.segments && transcription.segments.length > 0) {
- return (
-
- {transcription.text +
- '\n\nNum segments: ' +
- transcription.segments.length.toString()}
-
- );
- }
- return {transcription.text};
- };
-
- return (
-
- {renderContent()}
-
-
-
- );
-}
-```
-
-### Streaming transcription
+### Example
```tsx
import React, { useEffect, useState, useRef } from 'react';
@@ -223,70 +99,41 @@ import { Text, Button, View, SafeAreaView } from 'react-native';
import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-export default function App() {
- const model = useSpeechToText({
- model: WHISPER_TINY_EN,
- });
-
- const [transcribedText, setTranscribedText] = useState('');
-
+export default function LiveTranscriber() {
+ const model = useSpeechToText({ model: WHISPER_TINY_EN });
+ const [text, setText] = useState('');
const isRecordingRef = useRef(false);
-
const [recorder] = useState(() => new AudioRecorder());
- useEffect(() => {
- AudioManager.setAudioSessionOptions({
- iosCategory: 'playAndRecord',
- iosMode: 'spokenAudio',
- iosOptions: ['allowBluetooth', 'defaultToSpeaker'],
- });
- AudioManager.requestRecordingPermissions();
- }, []);
-
- const handleStartStreamingTranscribe = async () => {
+ const startLiveStreaming = async () => {
isRecordingRef.current = true;
- setTranscribedText('');
-
- const sampleRate = 16000;
+ setText('');
+ // 1. Capture microphone input
recorder.onAudioReady(
- {
- sampleRate,
- bufferLength: 0.1 * sampleRate,
- channelCount: 1,
- },
- (chunk) => {
- model.streamInsert(chunk.buffer.getChannelData(0));
- }
+ { sampleRate: 16000, bufferLength: 1600, channelCount: 1 },
+ (chunk) => model.streamInsert(chunk.buffer.getChannelData(0))
);
- try {
- await recorder.start();
- } catch (e) {
- console.error('Recorder failed:', e);
- return;
- }
+ await recorder.start();
+ // 2. Process the stream
try {
- let accumulatedCommitted = '';
-
+ let finalizedText = '';
const streamIter = model.stream({ verbose: false });
for await (const { committed, nonCommitted } of streamIter) {
if (!isRecordingRef.current) break;
- if (committed.text) {
- accumulatedCommitted += committed.text;
- }
-
- setTranscribedText(accumulatedCommitted + nonCommitted.text);
+ if (committed.text) finalizedText += committed.text;
+ setText(finalizedText + nonCommitted.text);
}
} catch (error) {
- console.error('Error during streaming transcription:', error);
+ console.error('Streaming error:', error);
}
};
- const handleStopStreamingTranscribe = () => {
+ const stopLiveStreaming = () => {
isRecordingRef.current = false;
recorder.stop();
model.streamStop();
@@ -294,28 +141,58 @@ export default function App() {
return (
-
-
- {transcribedText || 'Press start to speak...'}
-
-
-
-
-
-
+ {text || 'Press start and speak...'}
+
+
);
}
```
+## Advanced Features
+
+### Multilingual Transcription
+
+To transcribe languages other than English, use a multilingual model (e.g., `WHISPER_TINY`) and specify the corresponding language code:
+
+```typescript
+// Transcribe in Spanish
+const result = await model.transcribe(spanishAudio, { language: 'es' });
+```
+
+### Timestamps & Metadata
+
+Set `verbose: true` to receive word-level timestamps and confidence scores. The output follows the OpenAI Whisper `verbose_json` format.
+
+```typescript
+const result = await model.transcribe(audioBuffer, { verbose: true });
+// result.segments[0].words -> [{ word: "Hello", start: 0.5, end: 1.0 }, ...]
+```
+
+## Configuration
+
+### Arguments
+
+`useSpeechToText` accepts a configuration object:
+
+- `model`: Model source and tokenizer settings (see [ModelConfig](../../06-api-reference/interfaces/SpeechToTextModelConfig.md)).
+- `preventLoad`: (Optional) If `true`, prevents the model from being loaded automatically.
+
+### Returns
+
+The hook returns an object with:
+
+- `transcribe(audio, options)`: One-shot transcription.
+- `stream(options)`: Async generator for streaming results.
+- `streamInsert(audio)`: Push audio to the stream buffer.
+- `streamStop()`: Finish the current stream.
+- `isGenerating`: Boolean indicating if the model is busy.
+- `isReady`: Boolean indicating whether the model has finished loading and is ready to use.
+
## Supported models
| Model | Language |
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
index d4d8897e7c..f190af8234 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
@@ -2,47 +2,43 @@
title: SpeechToTextModule
---
-TypeScript API implementation of the [useSpeechToText](../../03-hooks/01-natural-language-processing/useSpeechToText.md) hook.
+The `SpeechToTextModule` class provides a direct interface to the library's speech-to-text (STT) capabilities. While [`useSpeechToText`](../../03-hooks/01-natural-language-processing/useSpeechToText.md) is the preferred way for React components, this module offers full control over the model's lifecycle and is suitable for non-React contexts or advanced use cases.
## API Reference
-- For detailed API Reference for `SpeechToTextModule` see: [`SpeechToTextModule` API Reference](../../06-api-reference/classes/SpeechToTextModule.md).
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
+- [`SpeechToTextModule` API Reference](../../06-api-reference/classes/SpeechToTextModule.md)
+- [STT Models List](../../06-api-reference/index.md#models---speech-to-text)
## High Level Overview
+You can transcribe audio in two ways: **one-shot** (for files/short clips) and **streaming** (for live microphone input).
+
```typescript
import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
+// Initialize the model
const model = await SpeechToTextModule.fromModelName(
WHISPER_TINY_EN,
(progress) => {
- console.log(progress);
+ console.log(`Loading: ${progress * 100}%`);
}
);
-// Standard transcription (returns string)
-const text = await model.transcribe(waveform);
+// 1. One-shot transcription (returns TranscriptionResult)
+const result = await model.transcribe(waveform);
+console.log(result.text);
-// Transcription with timestamps (returns Word[])
-const textWithTimestamps = await model.transcribe(waveform, {
- enableTimestamps: true,
-});
+// 2. Live streaming (yields partial/stable results)
+model.streamInsert(audioChunk);
+const stream = model.stream();
+for await (const { committed, nonCommitted } of stream) {
+ // Update UI live with stable and partial text
+}
```
-### Methods
-
-All methods of `SpeechToTextModule` are explained in details here: [`SpeechToTextModule API Reference`](../../06-api-reference/classes/SpeechToTextModule.md)
-
-:::info
-
-- `committed` contains the latest part of the transcription that is finalized and will not change. To obtain the full transcription during streaming, concatenate all the `committed` values yielded over time. Useful for displaying stable results during streaming.
-- `nonCommitted` contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming.
- :::
-
## Loading the model
-Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModule.md#frommodelname) factory method. It accepts an object with the following fields:
+Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModule.md#frommodelname) factory method. It accepts a configuration object with the following fields:
- [`isMultilingual`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual) - Flag indicating if model is multilingual.
- [`modelSource`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource) - The location of the used model (bundled encoder + decoder functionality).
@@ -50,132 +46,73 @@ Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModu
And an optional second argument:
-- `onDownloadProgress` - Callback to track download progress.
+- `onDownloadProgress` - Callback to track download progress (called with a value between 0 and 1).
-This method returns a promise resolving to a `SpeechToTextModule` instance.
+For more information on resource management, see [loading models](../../01-fundamentals/02-loading-models.md).
-For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
+## Transcription (Files & Short Clips)
-## Running the model
+To run transcription on a complete audio clip, use the [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) method. It accepts a `Float32Array` representing a waveform at **16kHz sampling rate**.
-To run the model, you can use the [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) method. It accepts one argument, which is an array of type `Float32Array` representing a waveform at 16kHz sampling rate. The method returns a promise, which can resolve either to an error or a string containing the output text.
+### Transcribe Options
-### Multilingual transcription
+The `transcribe()` function accepts an optional configuration object:
-If you aim to obtain a transcription in other languages than English, use the multilingual version of whisper. To obtain the output text in your desired language, pass the [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) object with the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) field set to your desired language code.
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps).
-```typescript
-import { SpeechToTextModule, WHISPER_TINY } from 'react-native-executorch';
+### Multilingual transcription
-const model = await SpeechToTextModule.fromModelName(
- WHISPER_TINY,
- (progress) => {
- console.log(progress);
- }
-);
+If you aim to obtain a transcription in languages other than English, use a multilingual Whisper model. To get the output in your desired language, pass the [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) object with the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) field set to the target language code.
+```typescript
const transcription = await model.transcribe(spanishAudio, { language: 'es' });
```
-### Timestamps & Transcription Stat Data
+### Timestamps & Detailed Results
-You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) and [`stream`](../../06-api-reference/classes/SpeechToTextModule.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe), [`stream`](../../06-api-reference/classes/SpeechToTextModule.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References.
+Set `verbose: true` in the options to obtain word-level timestamps and other parameters. The result mimics the _verbose_json_ format of the OpenAI Whisper API.
```typescript
-const transcription = await model.transcribe(audioBuffer, { verbose: true });
-// Example result
-//
-// transcription: {
-// task: "transcription",
-// text: "Example text for a ...",
-// duration: 9.05,
-// language: "en",
+const result = await model.transcribe(audioBuffer, { verbose: true });
+// Example result:
+// {
+// text: "Example text...",
// segments: [
-// {
-// start: 0,
-// end: 5.4,
-// text: "Example text for",
-// words: [
-// {
-// word: "Example",
-// start: 0,
-// end: 1.4
-// },
-// ...
-// ]
-// tokens: [1, 32, 45, ...],
-// temperature: 0.0,
-// avgLogprob: -1.235,
-// compressionRatio: 1.632
-// },
-// ...
-// ]
+// { start: 0, end: 5.4, text: "Example text", words: [...] }
+// ],
+// language: "en"
// }
```
-## Example
+## Live Streaming Transcription
-### Transcription
+The **Streaming API** is optimized for live microphone input or real-time audio feeds. It handles audio inputs of arbitrary length by automatically managing context windows to bypass the standard 30-second limit.
-```tsx
-import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
-import { AudioContext } from 'react-native-audio-api';
-import * as FileSystem from 'expo-file-system';
-
-const transcribeAudio = async () => {
- // Initialize with the model config
- const model = await SpeechToTextModule.fromModelName(
- WHISPER_TINY_EN,
- (progress) => {
- console.log(progress);
- }
- );
-
- // Download the audio file
- const { uri } = await FileSystem.downloadAsync(
- 'https://some-audio-url.com/file.mp3',
- FileSystem.cacheDirectory + 'audio_file'
- );
-
- // Decode the audio data (Correct as per your previous code)
- const audioContext = new AudioContext({ sampleRate: 16000 });
- const decodedAudioData = await audioContext.decodeAudioData(uri);
- const audioBuffer = decodedAudioData.getChannelData(0);
-
- // Transcribe the audio
- try {
- // Option 1: Text only
- const resultText = await model.transcribe(audioBuffer);
- console.log('Text:', resultText.text); // .text is the standard property now
-
- // Option 2: With timestamps (Use 'verbose' instead of 'enableTimestamps')
- const resultVerbose = await model.transcribe(audioBuffer, {
- verbose: true,
- });
-
- console.log('Full Text:', resultVerbose.text);
- console.log('Segments:', resultVerbose.segments); // Contains start/end/more parameters
- } catch (error) {
- console.error('Error during audio transcription', error);
- }
-};
-```
+### Streaming Options
+The `stream()` function accepts several optional parameters:
+
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
+- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks. Lower values provide more frequent updates, while higher values reduce CPU consumption. Defaults to `100`.
+
+:::info
+
+- **`committed`**: Finalized transcription that is stable and will not change. Useful for building a persistent transcript record.
+- **`nonCommitted`**: Partial transcription that is still being processed and may update as more context arrives. Useful for live UI updates.
+ :::
+
+### Live Example
-### Streaming Transcription
+In this example, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to feed live audio into the model.
```tsx
import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-// Load the model
-const model = await SpeechToTextModule.fromModelName(
- WHISPER_TINY_EN,
- (progress) => {
- console.log(progress);
- }
-);
+const model = await SpeechToTextModule.fromModelName(WHISPER_TINY_EN);
-// Configure audio session
+// 1. Configure audio session & permissions
AudioManager.setAudioSessionOptions({
iosCategory: 'playAndRecord',
iosMode: 'spokenAudio',
@@ -183,44 +120,46 @@ AudioManager.setAudioSessionOptions({
});
await AudioManager.requestRecordingPermissions();
-// Initialize audio recorder with FULL config in constructor
+// 2. Setup Audio Recorder
const recorder = new AudioRecorder({
sampleRate: 16000,
channelCount: 1,
- bitsPerSample: 16,
- bufferLengthInSamples: 16000, // e.g. 1 second buffer
});
-// Pass ONLY the callback to onAudioReady
recorder.onAudioReady((chunk) => {
- // Insert the audio into the streaming transcription
+ // Feed chunks directly into the model's buffer
model.streamInsert(chunk.buffer.getChannelData(0));
});
await recorder.start();
-// Start streaming transcription
+// 3. Process the Stream
try {
- let finalTranscription = '';
-
- // Use 'verbose' flag for timestamps/segments
- const streamIter = model.stream({ verbose: true });
+ let stableTranscript = '';
+ const streamIter = model.stream({ verbose: false });
for await (const { committed, nonCommitted } of streamIter) {
- // Note: committed/nonCommitted are objects { text, segments } now
- console.log('Committed Text:', committed.text);
- console.log('Live Text:', nonCommitted.text);
+ if (committed.text) stableTranscript += committed.text;
- if (committed.text) {
- finalTranscription += committed.text;
- }
+ // UI should display: stableTranscript + nonCommitted.text
+ console.log('Live Transcript:', stableTranscript + nonCommitted.text);
}
- console.log('Final transcription:', finalTranscription);
} catch (error) {
- console.error('Error during streaming transcription:', error);
+ console.error('Streaming error:', error);
}
-// Stop streaming transcription
+// 4. Cleanup
model.streamStop();
recorder.stop();
```
+
+## Supported models
+
+| Model | Language |
+| ------------------------------------------------------------------ | :----------: |
+| [whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en) | English |
+| [whisper-tiny](https://huggingface.co/openai/whisper-tiny) | Multilingual |
+| [whisper-base.en](https://huggingface.co/openai/whisper-base.en) | English |
+| [whisper-base](https://huggingface.co/openai/whisper-base) | Multilingual |
+| [whisper-small.en](https://huggingface.co/openai/whisper-small.en) | English |
+| [whisper-small](https://huggingface.co/openai/whisper-small) | Multilingual |
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 077d426c8f..c50410a4f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -591,8 +591,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) {
jsi::Object wordObj(runtime);
wordObj.setProperty(
runtime, "word",
- jsi::String::createFromUtf8(runtime, seg.words[i].content +
- seg.words[i].punctations));
+ jsi::String::createFromUtf8(runtime, seg.words[i].content));
wordObj.setProperty(runtime, "start",
static_cast(seg.words[i].start));
wordObj.setProperty(runtime, "end", static_cast(seg.words[i].end));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 4b58c5039b..3acd076779 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector &words,
std::string fullText;
for (const auto &w : words) {
- fullText += w.content + w.punctations;
+ fullText += w.content;
}
res.text = fullText;
@@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector &words,
} // namespace
void SpeechToText::stream(std::shared_ptr callback,
- std::string languageOption, bool verbose) {
+ std::string languageOption, bool verbose,
+ uint32_t timeout) {
if (isStreaming_) {
throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
"Streaming is already in progress!");
@@ -157,11 +158,15 @@ void SpeechToText::stream(std::shared_ptr callback,
// The reasoning is very simple: with the current liberal threshold values,
// running transcriptions too rapidly (before the audio buffer is filled
// with significant amount of new data) can cause streamer to commit wrong
- // phrases.
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ // phrases. We wait on a condition_variable so streamStop() can break the
+ // pause immediately — inserts intentionally do not wake us, to preserve
+ // the throttle.
+ std::unique_lock lock(streamCvMutex_);
+ streamCv_.wait_for(lock, std::chrono::milliseconds(timeout),
+ [this] { return !isStreaming_.load(); });
}
- std::vector finalWords = streamer_->finish();
+ std::vector finalWords = streamer_->finish(options);
TranscriptionResult finalRes =
wordsToResult(finalWords, languageOption, verbose);
@@ -169,7 +174,10 @@ void SpeechToText::stream(std::shared_ptr callback,
resetStreamState();
}
-void SpeechToText::streamStop() { isStreaming_ = false; }
+// Requests termination of the streaming loop run by stream() and wakes it
+// if it is currently paused in its throttling wait.
+void SpeechToText::streamStop() {
+  {
+    // Flip the flag while holding the mutex that stream() waits on. Without
+    // the lock, the store and the notification could both land between the
+    // waiter's predicate check and the moment it actually blocks; the wakeup
+    // would then be lost and the loop would sleep for the full throttle
+    // interval before noticing the stop request.
+    std::scoped_lock lock(streamCvMutex_);
+    isStreaming_ = false;
+  }
+  // Notify after releasing the mutex so the woken thread can reacquire it
+  // without contending with this one.
+  streamCv_.notify_all();
+}
void SpeechToText::streamInsert(std::span waveform) {
streamer_->insertAudioChunk(waveform);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ade835869c..adcfd8ae99 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -1,6 +1,8 @@
#pragma once
#include
+#include
+#include
#include
#include
#include
@@ -34,15 +36,11 @@ class SpeechToText {
std::string languageOption,
bool verbose) const;
- [[nodiscard("Registered non-void function")]]
- std::vector transcribeStringOnly(std::span waveform,
- std::string languageOption) const;
-
size_t getMemoryLowerBound() const noexcept;
// Stream
void stream(std::shared_ptr callback,
- std::string languageOption, bool enableTimestamps);
+ std::string languageOption, bool verbose, uint32_t timeout);
void streamStop();
void streamInsert(std::span waveform);
@@ -58,6 +56,11 @@ class SpeechToText {
std::unique_ptr streamer_ = nullptr;
std::atomic isStreaming_ = false;
std::atomic readyToProcess_ = false;
+
+ // Lets streamStop() wake the streaming loop immediately instead of
+ // waiting for the next throttling interval to expire.
+ std::mutex streamCvMutex_;
+ std::condition_variable streamCv_;
};
} // namespace models::speech_to_text
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
index 357309391d..efe6cc2819 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
@@ -36,7 +36,7 @@ class OnlineASR {
virtual ProcessResult process(const DecodingOptions &options) = 0;
- virtual std::vector finish() = 0;
+ virtual std::vector finish(const DecodingOptions &options) = 0;
virtual void reset() = 0;
};
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index e7319f95b5..fcf7759b24 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -4,13 +4,14 @@
namespace rnexecutorch::models::speech_to_text {
+/**
+ * Different representation of a token,
+ * with timestamps calculated.
+ */
struct Word {
std::string content;
float start;
float end;
-
- std::string
- punctations; // Trailing punctations which appear after the main content
};
} // namespace rnexecutorch::models::speech_to_text
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d1debeb0f0..a9f2b152b4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -1,8 +1,3 @@
-#include
-#include
-#include
-#include
-
#include "ASR.h"
#include "Constants.h"
#include "Params.h"
@@ -11,6 +6,12 @@
#include
#include
+#include
+#include
+#include
+#include
+#include
+
namespace rnexecutorch::models::speech_to_text::whisper {
using executorch::runtime::etensor::ScalarType;
@@ -138,8 +139,9 @@ executorch::aten::Tensor ASR::decode(std::span tokens,
positionShape, cachePositions.data(), ScalarType::Long);
const auto encoderOutputSize = static_cast(encoderOutput.size());
- std::vector encShape = {1, constants::kNumFrames,
- encoderOutputSize / constants::kNumFrames};
+ std::vector encShape = {
+ 1, static_cast(constants::kNumFrames),
+ encoderOutputSize / static_cast(constants::kNumFrames)};
auto encoderTensor = executorch::extension::make_tensor_ptr(
std::move(encShape), const_cast(encoderOutput.data()),
ScalarType::Float);
@@ -212,7 +214,9 @@ std::vector ASR::generate(std::span waveform,
scores.begin(), scores.end(), 0.0f, std::plus<>(),
[](float s) { return std::log(std::max(s, 1e-9f)); });
- const float avgLogProb = cumLogProb / static_cast(tokens.size() + 1);
+ // Match whisper.cpp: divide by the number of summed log-probs.
+ const float avgLogProb =
+ cumLogProb / static_cast(std::max(1, scores.size()));
const std::string text = tokenizer_->decode(tokens, true);
const float compressionRatio = this->calculateCompressionRatio(text);
@@ -262,11 +266,20 @@ ASR::generate(std::span waveform, const DecodingOptions &options,
std::vector scores;
uint64_t startPos = 0;
- while (std::cmp_less_equal(startPos + sequenceIds.size(),
- constants::kMaxDecodeLength)) {
- executorch::aten::Tensor logitsTensor =
- this->decode(sequenceIds, encoderFeatures, startPos);
+ // Prefill: feed each initial token individually so decode() always sees 1
+ // token.
+ executorch::aten::Tensor logitsTensor{nullptr};
+ for (size_t i = 0; i < sequenceIds.size(); i++, startPos++) {
+ std::span single(sequenceIds.data() + i, 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ }
+
+ // Seed once per generate() call rather than per sampled token.
+ std::mt19937 gen(std::random_device{}());
+
+ // Autoregressive decoding: always 1 token at a time
+ while (std::cmp_less(startPos, constants::kMaxDecodeLength)) {
const size_t logitsInnerDim = logitsTensor.size(1);
const size_t logitsDictSize = logitsTensor.size(2);
const float *logitsData = logitsTensor.const_data_ptr() +
@@ -297,20 +310,20 @@ ASR::generate(std::span waveform, const DecodingOptions &options,
nextProb = *maxIt;
} else {
std::discrete_distribution<> dist(probs.begin(), probs.end());
- std::mt19937 gen((std::random_device{}()));
nextId = dist(gen);
nextProb = probs[nextId];
}
- // Move the startPos pointer by the amount of tokens we processed
- startPos += sequenceIds.size();
- sequenceIds = {nextId};
cachedTokens.push_back(nextId);
scores.push_back(nextProb);
if (nextId == endOfTranscriptionToken_) {
break;
}
+
+ std::span single(&cachedTokens.back(), 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ ++startPos;
}
return {.tokens = std::vector(cachedTokens.cbegin() +
@@ -437,15 +450,22 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
const float wEnd = wStart + timePerChar * wSize;
prevCharCount += wSize;
- // We store punctations separately to other characters.
+ // Detect and extract trailing punctuations.
std::string puncts = "";
while (!w.empty() && constants::kPunctations.contains(w.back())) {
puncts += w.back();
w.pop_back();
}
- std::reverse(puncts.begin(), puncts.end());
+ std::ranges::reverse(puncts);
- wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts));
+ // Add the core word.
+ wordObjs.emplace_back(std::move(w), wStart, wEnd);
+
+ // If punctuation was present, add it as a separate "word" with an
+ // instantaneous timestamp at the end of the original word.
+ if (!puncts.empty()) {
+ wordObjs.emplace_back(std::move(puncts), wEnd, wEnd);
+ }
}
return wordObjs;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
index 0b284345ec..62a9f968f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
@@ -9,34 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants {
// Maximum duration of each audio chunk to process (in seconds)
// It is intentionally set to 29 since otherwise only the last chunk would be
// correctly transcribe due to the model's positional encoding limit
-constexpr static int32_t kChunkSize = 29;
+inline constexpr size_t kChunkSize = 29;
// Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
-constexpr static int32_t kSamplingRate = 16000;
-constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+inline constexpr size_t kSamplingRate = 16000;
+inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000;
+
+inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate;
// The maximum number of tokens the decoder can generate per chunk
-constexpr static int32_t kMaxDecodeLength = 128;
+inline constexpr size_t kMaxDecodeLength = 128;
// Minimum allowed chunk length before processing (in audio samples)
-constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate;
+inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate;
// Number of mel frames output by the encoder (derived from input spectrogram)
-constexpr static int32_t kNumFrames = 1500;
+inline constexpr size_t kNumFrames = 1500;
// Time precision used by Whisper timestamps: each token spans 0.02 seconds
-constexpr static float kTimePrecision = 0.02f;
+inline constexpr float kTimePrecision = 0.02f;
// Special characters serving as pause / end of sentence
-static const std::unordered_set kPunctations = {',', '.', '?',
+inline const std::unordered_set kPunctations = {',', '.', '?',
'!', ':', ';'};
+inline const std::unordered_set kEosPunctations = {'.', '?', '!', ';'};
// Special token constants
namespace tokens {
-static const std::string kStartOfTranscript = "<|startoftranscript|>";
-static const std::string kEndOfTranscript = "<|endoftext|>";
-static const std::string kBeginTimestamp = "<|0.00|>";
-static const std::string kBlankAudio = "[BLANK_AUDIO]";
+inline const std::string kStartOfTranscript = "<|startoftranscript|>";
+inline const std::string kEndOfTranscript = "<|endoftext|>";
+inline const std::string kBeginTimestamp = "<|0.00|>";
+inline const std::string kBlankAudio = "[BLANK_AUDIO]";
} // namespace tokens
} // namespace rnexecutorch::models::speech_to_text::whisper::constants
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
deleted file mode 100644
index ce365e4e44..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#include "HypothesisBuffer.h"
-#include "Params.h"
-#include "Utils.h"
-
-#include
-#include
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-void HypothesisBuffer::insert(std::span words, float offset) {
- // Step 1 - decide which words should be considered as fresh.
- fresh_.clear();
-
- // We try to find the last committed word in a transcription string.
- // Everything beyond that word will be considered as fresh.
- // To make the algorithm more resilient to repeated strings of words,
- // we check also the preceeding words as well as timestamps (with liberal
- // range).
- size_t firstFreshWordIdx = 0;
- if (!committed_.empty()) {
- std::optional lastMatchingWordIdx =
- findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize,
- params::kStreamMaxOverlapTimestampDiff1,
- params::kStreamWordsPerErrorRate);
- firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
- }
-
- bool isCompletelyFresh = firstFreshWordIdx == 0;
- for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
- const auto &word = words[i];
-
- // Global start is a beginning timestamp relative only to the beginning of
- // the current streaming process.
- const float startGlobal = word.start + offset;
- const float endGlobal = word.end + offset;
-
- if (!isCompletelyFresh ||
- startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
- fresh_.emplace_back(word.content, startGlobal, endGlobal,
- word.punctations);
- }
- }
-
- // Step 2 - we have already selected the fresh words. Now it's time to
- // correct any mistakes and remove the words which overlap with already
- // commited segments - to avoid duplicates.
- if (!fresh_.empty() && !committed_.empty()) {
- // Calculate the largest overlapping fragment size.
- // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the
- // algorithm, and timestamp difference limit
- // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments
- // which were just repeated after some time.
- size_t overlapSize = utils::findLargestOverlapingFragment(
- committed_, fresh_, params::kStreamMaxOverlapSize,
- params::kStreamMaxOverlapTimestampDiff2);
-
- if (overlapSize > 0) {
- fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
- }
- }
-}
-
-std::deque HypothesisBuffer::commit() {
- std::deque toCommit = {};
-
- // Find a stable prefix: words that haven't changed between last and current
- // iteration.
- while (!fresh_.empty() && !hypothesis_.empty() &&
- fresh_.front().content == hypothesis_.front().content) {
- // The last word from the fresh_ buffer must also match punctations with the
- // hypothesis. This is done in order to ensure correct punctation marks in
- // the resulting transcription.
- if (fresh_.size() == 1 &&
- fresh_.front().punctations != hypothesis_.front().punctations) {
- break;
- }
-
- // Take timestamps from the hypothesis, but actual content from the fresh
- // buffer.
- toCommit.emplace_back(std::move(fresh_.front().content),
- hypothesis_.front().start, hypothesis_.front().end,
- std::move(fresh_.front().punctations));
- fresh_.pop_front();
- hypothesis_.pop_front();
- }
-
- // Save the last committed word timestamp.
- // This will mark the end of the entire committed sequence.
- if (!toCommit.empty()) {
- lastCommittedTime_ = toCommit.back().end;
- }
-
- // The remaining words from the fresh buffer (uncommitted phrase)
- // become a hypothesis for the next iteration.
- hypothesis_ = std::move(fresh_);
- fresh_.clear();
-
- // The last step is to commit the selected words.
- committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend());
-
- return toCommit;
-}
-
-void HypothesisBuffer::releaseCommits(size_t wordsToKeep) {
- if (committed_.size() > wordsToKeep) {
- size_t nWordsToErase = committed_.size() - wordsToKeep;
- committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase);
- }
-}
-
-void HypothesisBuffer::reset() {
- fresh_.clear();
- hypothesis_.clear();
- committed_.clear();
-
- lastCommittedTime_ = 0.f;
-}
-
-std::optional HypothesisBuffer::findCommittedSuffix(
- std::span words, size_t nCommitted,
- float timestampDiffTolerance, size_t wordsPerMistake) {
- if (words.empty() || committed_.empty() || nCommitted == 0) {
- return std::nullopt;
- }
-
- // Determine the subset size of committed words to check against.
- size_t committedToMatchSize = std::min(nCommitted, committed_.size());
-
- // Iterate backwards through 'words' to find the most recent occurrence of a
- // suffix of 'committed_' (or the full 'committed_' sequence).
- for (int32_t i = static_cast(words.size()) - 1; i >= 0; --i) {
- bool match = true;
- size_t matchedCount = 0;
- size_t contentMistakeCount = 0;
-
- // Linearly interpolate tolerance if we are at the beginning and can't check
- // all committed words.
- float effectiveTolerance = timestampDiffTolerance;
- if (i < static_cast(committedToMatchSize) - 1) {
- effectiveTolerance *=
- static_cast(i + 1) / static_cast(committedToMatchSize);
- }
-
- // Try to match backwards from words[i] and committed_.back()
- for (size_t j = 0; j < committedToMatchSize; ++j) {
- int32_t wordsIdx = i - static_cast(j);
- int32_t committedIdx =
- static_cast(committed_.size()) - 1 - static_cast(j);
-
- if (wordsIdx < 0) {
- // We reached the beginning of the words span.
- // The algorithm allows matching a partial prefix if it's at the start.
- break;
- }
-
- const Word &w1 = words[wordsIdx];
- const Word &w2 = committed_[committedIdx];
-
- // Check timestamps within tolerance
- if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) >
- effectiveTolerance) {
- match = false;
- break;
- }
-
- // Allow sparse content mismatches while still treating the overall
- // sequence as matching.
- if (utils::equalsIgnoreCase(w1.content, w2.content)) {
- matchedCount++;
- } else {
- contentMistakeCount++;
- }
-
- // Early exit if mistake count already exceeds what we can recover from
- // given the remaining words to check.
- if (wordsPerMistake > 0) {
- size_t remainingToMatch = committedToMatchSize - 1 - j;
- size_t maxPossibleMatched = matchedCount + remainingToMatch;
- if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) {
- match = false;
- break;
- }
- }
- }
-
- // One content mistake is allowed per M matched words.
- size_t maxAllowedMistakes =
- (wordsPerMistake == 0) ? 0 : (matchedCount / wordsPerMistake);
-
- if (match && matchedCount > 0 &&
- contentMistakeCount <= maxAllowedMistakes) {
- return static_cast(i);
- }
- }
-
- return std::nullopt;
-}
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
deleted file mode 100644
index 25833ec01b..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-#include "../common/types/Word.h"
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-/**
- * A buffer for managing streaming transcription hypotheses.
- * This class handles stabilization of the transcription result by tracking
- * "fresh" hypotheses and "committing" them once they are stable across updates.
- */
-class HypothesisBuffer {
-public:
- /**
- * Inserts new words into the fresh_ buffer.
- * Words are filtered based on the last committed time and checked for
- * overlaps with existing committed words to prevent duplicates.
- *
- * @param newWords A span of recently generated words.
- * @param offset Time offset to adjust the word timestamps.
- */
- void insert(std::span words, float offset);
-
- /**
- * Attempts to commit words present in the fresh_ buffer.
- * A phrase from fresh_ buffer can only be committed if it also appears
- * in the hypothesis_ buffer (uncommitted words from previous iteration).
- *
- * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_
- * buffer.
- *
- * @return A sequence of words committed in the current iteration.
- */
- std::deque commit();
-
- /**
- * Shrinks the committed_ buffer by erasing all words except N latest ones.
- *
- * Used primarily to relieve increasing memory usage during very
- * long streaming sessions.
- *
- * @param wordsToKeep - number of trailing words to be kept in.
- */
- void releaseCommits(size_t wordsToKeep);
-
- /**
- * Resets all the stored buffers and state variables to the initial state
- */
- void reset();
-
- // Declare a friendship with OnlineASR to allow it to access the internal
- // state of stored buffers.
- friend class OnlineASR;
-
-private:
- // Finds the most recent occurance of given committed string of words
- // in a custom span of words.
- // Returns the index of the last matching word (or nullopt if not present).
- std::optional findCommittedSuffix(std::span words,
- size_t nCommitted,
- float timestampDiffTolerance = 1.F,
- size_t wordsPerMistake = 4);
-
- // Stored buffers
- // The lifecycle of a correct result word looks as following:
- // fresh buffer -> hypothesis buffer -> commited
- std::deque
- fresh_; // 'New' words from current iterations, which require some checks
- // before they go into hypothesis_ buffer.
- std::deque
- hypothesis_; // Words potentially to be commited, stored between
- // iterations (obtained from fresh_ buffer).
- std::deque committed_; // A history of already commited words.
-
- float lastCommittedTime_ = 0.0f;
-};
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ded2183201..e663c5bfab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,163 +1,260 @@
-#include
-#include
-#include
-#include
-
-#include "Constants.h"
#include "OnlineASR.h"
+#include "Constants.h"
#include "Params.h"
#include "Utils.h"
+#include
+#include
+#include
+#include
+
namespace rnexecutorch::models::speech_to_text::whisper::stream {
-namespace {
-std::vector move_to_vector(std::deque &container) {
- return std::vector(std::make_move_iterator(container.begin()),
- std::make_move_iterator(container.end()));
+OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
+ audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
}
-} // namespace
-OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
- // Reserve a minimal expected amount of memory for audio buffer.
- audioBuffer_.reserve(static_cast(2 * params::kStreamChunkThreshold *
- constants::kSamplingRate));
+bool OnlineASR::isReady() const {
+ std::scoped_lock lock(streamingMutex);
+
+ return audioBuffer_.size() >= constants::kMinChunkSamples;
}
void OnlineASR::insertAudioChunk(std::span audio) {
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex);
+
audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
-}
-bool OnlineASR::isReady() const {
- return audioBuffer_.size() >= constants::kMinChunkSamples;
+ // Automatic buffer cleanup.
+ //
+ // This prevents the audio buffer from growing indefinitely during continuous
+ // streaming. It is particularly useful when VAD (Voice Activity Detection)
+ // is used and samples are inserted but not processed for a long time.
+ // The condition is not expected to hold during normal streaming, i.e. when
+ // process() is called regularly at reasonable intervals.
+ if (audioBuffer_.size() > constants::kMaxSamples) {
+ // Note that the results are not reported here; they are stored in
+ // memory_.toCommit for a later call of process(). Append rather than assign
+ // so that two back-to-back buffer-cap hits (e.g. while VAD is muted) don't
+ // drop the first batch.
+ auto pending = commitAndClean(memory_.transcript);
+ std::ranges::move(pending, std::back_inserter(memory_.toCommit));
+ }
}
ProcessResult OnlineASR::process(const DecodingOptions &options) {
+ constexpr size_t kStreamSafeBufferMaxSamples = static_cast(
+ params::kStreamSafeBufferDuration * constants::kSamplingRate);
+
std::vector audioCopy;
// Copy the audio buffer to avoid keeping the lock during the entire
// transcription process.
{
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex);
audioCopy = audioBuffer_;
}
- std::vector transcriptions = asr_->transcribe(audioBuffer_, options);
+ // Obtain a transcription for current audio buffer state.
+ // It's very unlikely that buffer will exceed whisper's maximum capacity, but
+ // for absolute safety we can additionally clip the buffer.
+ std::span input(
+ audioCopy.begin(),
+ audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size()));
- if (transcriptions.empty()) {
- return {.committed = {}, .nonCommitted = {}};
- }
+ std::vector transcriptions = asr_->transcribe(input, options);
// Flatten segments into a single word sequence.
+ // This is our 'nonCommitted' part for now.
std::vector words;
- words.reserve(transcriptions.front().words.size());
-
for (auto &segment : transcriptions) {
- words.insert(words.end(), std::make_move_iterator(segment.words.begin()),
- std::make_move_iterator(segment.words.end()));
+ std::ranges::move(segment.words, std::back_inserter(words));
}
- hypothesisBuffer_.insert(words, bufferTimeOffset_);
-
- // Apply fix for timestamps.
- if (!hypothesisBuffer_.fresh_.empty()) {
- size_t noNewWords = hypothesisBuffer_.fresh_.size();
- float establishedEnd = hypothesisBuffer_.lastCommittedTime_;
- float newBegin = hypothesisBuffer_.fresh_.front().start;
- const float newEnd = hypothesisBuffer_.fresh_.back().end;
- float shift = 0.F;
- for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
- const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-
- if (i < hypothesisBuffer_.hypothesis_.size() &&
- utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
- hypothesisBuffer_.hypothesis_[i].content)) {
- hypothesisBuffer_.fresh_[i].start =
- hypothesisBuffer_.hypothesis_[i].start;
- hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end;
- shift = hypothesisBuffer_.fresh_[i].end - originalEnd;
-
- establishedEnd = hypothesisBuffer_.hypothesis_[i].end;
- newBegin = hypothesisBuffer_.fresh_[i].end;
- noNewWords--;
- continue;
- }
-
- // In case of a new word, we apply timestamp range scaling
- // based on timestamps established in previous iterations.
- const float freshDuration = newEnd - establishedEnd;
- const float epsilon = std::max(
- 0.F, 0.85F * (freshDuration -
- static_cast(noNewWords /
- params::kStreamWordsPerSecond)));
- float scale =
- (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F);
- hypothesisBuffer_.fresh_[i].start =
- shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
- hypothesisBuffer_.fresh_[i].end =
- shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd;
+ // Acquire lock for the rest of the method (extensive usage of audioBuffer_).
+ std::scoped_lock lock(streamingMutex);
+
+ // Step 1: examine all previously saved EOS points.
+ // The idea is to remove entries which have changed or no longer exist
+ // due to the model correcting its output.
+ for (auto it = memory_.eos.begin(); it != memory_.eos.end(); it++) {
+ if (it->position >= words.size() || !utils::isEos(words[it->position]) ||
+ (it->position > 0 &&
+ it->preceeding != words[it->position - 1].content)) {
+ memory_.eos.erase(it, memory_.eos.end());
+ break;
}
}
- auto committed = hypothesisBuffer_.commit();
- auto nonCommitted = hypothesisBuffer_.hypothesis_;
+ // Step 2: check if the newest EOS character from transcript should be
+ // saved to the memory_.eos vector.
+ auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos);
+ if (lastEosIt != words.rend()) {
+ size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1;
- // We want to save the most recent end of sentence word
- // to improve the audio cutting mechanism.
- for (const auto &word : committed) {
- if (!word.punctations.empty()) {
- lastSentenceEnd_ = word.end;
+ // Because of step 1, we know that if the last EOS exists in memory_.eos,
+ // then it must be the last entry.
+ if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
+ std::string preceeding =
+ lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
+ memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
}
}
- // Since Whisper does not accept waveforms longer than 30 seconds, we need
- // to cut the audio at some safe point.
- {
- std::scoped_lock lock(audioBufferMutex_);
-
- const float audioDuration =
- static_cast(audioBuffer_.size()) / constants::kSamplingRate;
- if (audioDuration > params::kStreamChunkThreshold) {
- // Leave some portion of audio in, to improve model behavior
- // in future iterations.
- const float erasePoint =
- hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
- ? audioDuration
- : std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
- const float minEraseDuration =
- audioDuration - params::kStreamAudioBufferMaxReserve;
- const float maxEraseDuration =
- audioDuration - params::kStreamAudioBufferMinReserve;
- const float eraseDuration = std::clamp(
- erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
- const size_t nSamplesToErase =
- static_cast(eraseDuration * constants::kSamplingRate);
+ std::vector committed;
- audioBuffer_.erase(audioBuffer_.begin(),
- audioBuffer_.begin() + nSamplesToErase);
- bufferTimeOffset_ += eraseDuration;
- }
+ // Step 3: collect all the words which could possibly have been committed
+ // in-between iterations (stored by insertAudioChunk()).
+ if (!memory_.toCommit.empty()) {
+ committed.insert(committed.end(),
+ std::make_move_iterator(memory_.toCommit.begin()),
+ std::make_move_iterator(memory_.toCommit.end()));
+ memory_.toCommit.clear();
}
- return {.committed = move_to_vector(committed),
- .nonCommitted = move_to_vector(nonCommitted)};
+ // Step 4: clear the buffer if it is getting too large.
+ // The idea is to use the saved EOS entries and try to cut the buffer
+ // in a 'good' spot - where it will remove a significant audio chunk, yet
+ // won't affect most recent, unfinished speech samples.
+ size_t bufferSize = audioBuffer_.size();
+ if (bufferSize > kStreamSafeBufferMaxSamples) {
+ auto newCommitted = commitAndClean(words);
+
+ committed.insert(committed.end(),
+ std::make_move_iterator(newCommitted.begin()),
+ std::make_move_iterator(newCommitted.end()));
+ }
+
+ // Save the uncommitted part to streamer's memory,
+ // because it might be necessary when committing inside insertAudioChunk().
+ memory_.transcript = words;
+
+ // Note that the uncommitted part represented by recent transcription (words)
+ // has already been shrunk if something has been committed during the cleanup
+ // phase.
+ return {.committed = std::move(committed), .nonCommitted = std::move(words)};
}
-std::vector OnlineASR::finish() {
- // We always push the last remaining hypothesis, even if it's not
- // confirmed in second iteration, to avoid ending up with broken sentences.
- std::deque remaining = hypothesisBuffer_.hypothesis_;
+std::vector OnlineASR::finish(const DecodingOptions &options) {
+ ProcessResult result = process(options);
- return move_to_vector(remaining);
+ // Words committed by this final pass, followed by the words that were
+ // still uncommitted; both are collected before the state is reset.
+ std::vector residual{std::move(result.committed)};
+ residual.insert(residual.end(),
+ std::make_move_iterator(result.nonCommitted.begin()),
+ std::make_move_iterator(result.nonCommitted.end()));
+
+ reset();
+
+ return residual;
}
void OnlineASR::reset() {
- std::scoped_lock lock(audioBufferMutex_);
-
- hypothesisBuffer_.reset();
- bufferTimeOffset_ = 0.f;
+ std::scoped_lock lock(streamingMutex);
audioBuffer_.clear();
+
+ // Drop the streaming memory: last transcript, EOS points and pending words.
+ memory_.transcript.clear();
+ memory_.eos.clear();
+ memory_.toCommit.clear();
+}
+
+// Trims the front of audioBuffer_ and returns the words committed by the cut,
+// moving them out of `transcript`; memory_.eos is updated to match.
+// Not synchronized by itself: the call sites in insertAudioChunk() and
+// process() hold streamingMutex.
+std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+  constexpr float kMidpointAnchorTime = params::kStreamMaxDuration / 2.0F;
+  constexpr size_t kMidpointAnchorSamples =
+      static_cast<size_t>(kMidpointAnchorTime * constants::kSamplingRate);
+  constexpr size_t kSafetyMarginSamples = static_cast<size_t>(
+      params::kStreamSafetyThreshold * constants::kSamplingRate);
+  constexpr float kMaxSafeEosTime =
+      params::kStreamSafeBufferDuration - params::kStreamSafetyThreshold;
+  constexpr float kMinDurationToCalculateDensity = 0.1F;
+
+  const size_t bufferSize = audioBuffer_.size();
+  // Samples to drop so that only the safety margin stays (0 if shorter).
+  const size_t blindCut =
+      bufferSize > kSafetyMarginSamples ? bufferSize - kSafetyMarginSamples : 0;
+  // Drops `count` leading samples; clamped, because a timestamp-derived count
+  // may point past the audio that is actually buffered.
+  const auto eraseFront = [this](size_t count) {
+    const size_t n = std::min(count, audioBuffer_.size());
+    audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + n);
+  };
+  // Converts a timestamp [s] into a sample count (negative values map to 0).
+  const auto toSamples = [](float seconds) {
+    return static_cast<size_t>(std::max(0.0F, seconds) *
+                               constants::kSamplingRate);
+  };
+
+  std::vector<Word> committed;
+
+  if (memory_.eos.empty()) {
+    // No EOS recorded: most likely no speech so far, so cut the maximum
+    // amount of audio and commit nothing.
+    eraseFront(blindCut);
+  } else if (memory_.eos.size() == 1) {
+    // Exactly one (most recent) EOS: normally that sentence is kept, unless
+    // it already covers a significant part of the buffer.
+    const float eosTimestamp = memory_.eos[0].tmstpend;
+    const float upperHalfDuration =
+        std::max(0.0F, eosTimestamp - kMidpointAnchorTime);
+    const float wordsPerSecond =
+        upperHalfDuration > kMinDurationToCalculateDensity
+            ? static_cast<float>(transcript.size()) / upperHalfDuration
+            : 0.0F;
+
+    // The EOS sits early enough that cutting up to the safety margin won't
+    // touch the ongoing (post-EOS) speech.
+    const bool eosSafe = eosTimestamp < kMaxSafeEosTime;
+
+    if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+      // Late EOS with a low word density: assume the speech is concentrated
+      // in the upper half, drop the lower half and shift the EOS timestamp.
+      // Nothing is committed on this path.
+      eraseFront(kMidpointAnchorSamples);
+      memory_.eos[0].tmstpend -= kMidpointAnchorTime;
+    } else {
+      // Cut everything up to and including the sentence - either down to the
+      // safety margin (early EOS) or, more aggressively, right at the EOS
+      // boundary - and commit the whole transcript.
+      eraseFront(eosSafe ? blindCut : toSamples(eosTimestamp));
+      committed.insert(committed.end(),
+                       std::make_move_iterator(transcript.begin()),
+                       std::make_move_iterator(transcript.end()));
+      transcript.clear();
+      memory_.eos.clear();
+    }
+  } else {
+    // Two or more sentences: commit everything up to the second-to-last EOS
+    // and keep the last sentence intact for stability.
+    //
+    // The fields are copied, not referenced: the erase() on memory_.eos below
+    // moves the last entry to the front, so a reference into the vector would
+    // alias the wrong element (or a destroyed one) afterwards.
+    const size_t anchorIndex = memory_.eos.size() - 2;
+    const float anchorTime = memory_.eos[anchorIndex].tmstpend;
+    const size_t committedCount = memory_.eos[anchorIndex].position + 1;
+    eraseFront(toSamples(anchorTime));
+
+    committed.insert(
+        committed.end(), std::make_move_iterator(transcript.begin()),
+        std::make_move_iterator(transcript.begin() + committedCount));
+    transcript.erase(transcript.begin(),
+                     transcript.begin() + committedCount);
+
+    // Retain only the most recent EOS entry, shifting both its timestamp
+    // and its position to match the new (truncated) transcript origin.
+    memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1);
+    memory_.eos[0].tmstpend -= anchorTime;
+    memory_.eos[0].position -= committedCount;
+  }
+
+  return committed;
}
} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
index df6d469e39..7547d16bd5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -1,13 +1,13 @@
#pragma once
+#include
+#include
+#include
+
#include "../common/schema/OnlineASR.h"
#include "../common/types/ProcessResult.h"
-#include "../common/types/Segment.h"
#include "../common/types/Word.h"
#include "ASR.h"
-#include "HypothesisBuffer.h"
-
-#include
namespace rnexecutorch::models::speech_to_text::whisper::stream {
@@ -21,60 +21,65 @@ class OnlineASR : public schema::OnlineASR {
OnlineASR(const ASR *asr);
/**
- * Appends new audio samples to the internal processing buffer.
- *
- * @param audio A span of PCM float samples (expected 16kHz).
+ * Checks if the buffer contains enough audio for the next processing step.
+ * @return True if ready, false otherwise.
*/
- void insertAudioChunk(std::span audio) override;
+ bool isReady() const override;
/**
- * Determines whether the model is ready to process the next iteration.
- *
- * @return True if audioBuffer has enough samples, False otherwise
+ * Appends audio samples to the internal buffer.
+ * @param audio Span containing the audio data.
*/
- bool isReady() const override;
+ void insertAudioChunk(std::span audio) override;
/**
- * Processes the current audio buffer and returns new transcription results.
- * Stability is managed by an internal HypothesisBuffer to ensure that
- * only confirmed (stable) text is returned as "committed".
- *
- * @param options Decoding configuration (language, etc.).
- * @return A ProcessResult containing newly committed and uncommitted
- * words.
+ * Processes the current buffered audio and returns transcription results.
+ * @param options Decoding options for the transcription.
+ * @return Transcription result containing committed and volatile tokens.
*/
ProcessResult process(const DecodingOptions &options) override;
/**
- * Finalizes the current streaming session.
- * Flushes any remaining words from the hypothesis buffer.
- *
- * @return A vector of remaining transcribed words.
+ * Runs a final process(options) pass, then resets the streaming state.
+ * @return Words committed by that pass, followed by the uncommitted rest.
*/
- std::vector finish() override;
+ std::vector finish(const DecodingOptions &options) override;
/**
- * Reset the streaming state by resetting the buffers
+ * Resets the internal state and clears buffers.
*/
void reset() override;
private:
+ // Cleans up the buffer and returns committed words based on given transcript.
+ std::vector commitAndClean(std::vector &transcript);
+
// ASR module connection for transcribing the audio
const ASR *asr_;
- // Helper buffers - audio buffer
- // Stores the increasing amounts of streamed audio.
- // Cleared from time to time after reaching a threshold size.
+ // Audio buffer (input) - accumulates obtained audio samples.
std::vector audioBuffer_ = {};
- mutable std::mutex audioBufferMutex_;
- float bufferTimeOffset_ = 0.F; // Audio buffer offset
+ mutable std::mutex streamingMutex; // Covers both buffer & memory
- // Helper buffers - hypothesis buffer
- // Manages the whisper streaming hypothesis mechanism.
- HypothesisBuffer hypothesisBuffer_;
+ // Streaming memory.
+ // In general, helps to navigate continuous streaming state and improve buffer
+ // handling algorithms.
+ struct Memory {
+ // State management helper.
+ struct EOSEntry {
+ size_t position; // Index of the EOS word in the current transcription
+ // (word sequence).
+ std::string preceeding; // Content of the word preceding the EOS word
+ float tmstpend; // Ending timestamp of the sentence.
+ };
- // State members to keep track of specyfic aspects of buffer state
- float lastSentenceEnd_ = 0.F;
+ std::vector
+ transcript; // The most recent transcription result (uncommitted only!).
+ std::vector
+ eos; // End of sentence points from the most recent transcription.
+ std::vector toCommit; // Words to be committed in the next iteration
+ // (next process() call).
+ } memory_;
};
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index 5eb74c06cc..847a22b1e0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -1,6 +1,9 @@
#pragma once
+#include "Constants.h"
+
#include
+#include
/**
* Hyperparameters
@@ -11,90 +14,50 @@
namespace rnexecutorch::models::speech_to_text::whisper::params {
/**
- * Determines the range of buffer left when skipping an audio chunk
- * of size lower than maximum allowed chunk size.
- *
- * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
- * then instead of moving to the last returned timestamp, we jump across the
- * entire 30 seconds chunk. This resolves the issue of multiple redundant
- * segments being produced by the transcription algorithm.
+ * Maximum duration of audio that the streaming buffer keeps before forcing
+ * a cleanup. Aligned with Whisper's maximum supported input length.
*/
-constexpr static int32_t kChunkBreakBuffer = 2; // [s]
+constexpr inline float kStreamMaxDuration =
+ static_cast(constants::kChunkSize);
/**
- * Determines the maximum timestamp difference available for a word to be
- * considered as fresh in streaming algorithm.
+ * The minimum amount of recent audio always kept in the buffer when a blind
+ * cut is performed. Acts as the lower bound on what survives a cleanup.
*/
-constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
+constexpr inline float kStreamSafetyThreshold = 3.F; // [s]
/**
- * The size of the most recent committed suffix searched in
- * fresh words string.
- *
- * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty"
- * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
- * ["very" "nasty" "thing."] suffix.
+ * Forced-cleanup threshold. Once the buffer grows past this duration we run
+ * the EOS-anchored cleanup routine.
*/
-constexpr static size_t kStreamCommitedSuffixSearchSize = 5;
+constexpr inline float kStreamSafeBufferDuration =
+ kStreamMaxDuration - kStreamSafetyThreshold; // [s]
/**
- * Determines the maximum expected size of overlapping fragments between
- * fresh words buffer and commited words buffer in streaming mode.
- *
- * It is a limit of maximum amount of erased repeated words from fresh buffer.
- * The bigger it gets, the less probable it is to commit the same phrase twice.
+ * An estimate of the number of words spoken per second.
+ * Used for estimating transcription progress and buffer management heuristics.
*/
-constexpr static size_t kStreamMaxOverlapSize =
- 12; // Number of overlaping words
+constexpr inline float kWordsPerSecondEstimation = 2.25F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the first, more strict threshold, used when searching for recently
- * committed entries.
+ * Upper bound for words per second estimate in fast speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s]
+constexpr inline float kWordsPerSecondHigh = 4.F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the second, more liberal threshold, used in overlap correction
- * algorithm.
+ * Lower bound for words per second estimate in slow speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s]
+constexpr inline float kWordsPerSecondLow = 1.5F;
/**
- * Number of words per 1 allowed mistake (error correction).
+ * Determines the range of buffer left when skipping an audio chunk
+ * of size lower than maximum allowed chunk size.
*
- * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake
- * in a 4 word string.
- */
-constexpr static size_t kStreamWordsPerErrorRate = 5;
-
-/**
- * A threshold which exceeded causes the main streaming audio buffer to be
- * cleared.
- */
-constexpr static float kStreamChunkThreshold = 20.F; // [s]
-
-/**
- * Decides how much of recent audio waveform is always kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s]
-
-/**
- * Decides how much of recent audio waveform can be kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s]
-
-/**
- * An estimate of number of words per second produced in a standard
- * human conversation speech.
+ * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
+ * then instead of moving to the last returned timestamp, we jump across the
+ * entire 30 seconds chunk. This resolves the issue of multiple redundant
+ * segments being produced by the transcription algorithm.
*/
-constexpr static float kStreamWordsPerSecond = 2.5F;
+constexpr inline int32_t kChunkBreakBuffer = 2; // [s]
-} // namespace rnexecutorch::models::speech_to_text::whisper::params
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::params
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
index 2e4e3b5076..ae461c27cf 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
@@ -1,6 +1,7 @@
#pragma once
#include "../common/types/Word.h"
+#include "Constants.h"
#include
#include
#include
@@ -8,70 +9,14 @@
namespace rnexecutorch::models::speech_to_text::whisper::utils {
-// Compares two strings without case-sensitivity.
-inline bool equalsIgnoreCase(const std::string &a, const std::string &b) {
- if (a.size() != b.size()) {
- return false;
- }
- return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) {
- return std::tolower(static_cast(c1)) ==
- std::tolower(static_cast(c2));
- });
-}
-
/**
- * Finds the largest (in number of words) overlaping fragment between word
- * vectors A (suffix) and B (prefix).
+ * Checks if the given word represents an End-of-Sentence (EOS) punctuation.
*
- * An overlaping fragment is any fragment C, which can be simultaneously a
- * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing
- * games', B = 'playing games and sleeping', the overlap fragment C = 'playing
- * games'.
- *
- * @param suffixVec An input vector, where only suffixes can overlap.
- * Typically the 'commited' buffer in streaming algorithm.
- * @param preffixVec An input vector, where only prefixes can overlap.
- * Typically the 'fresh' buffer in streaming algorithm.
- * @param maxCheckRange The maximum size of overlapping fragment. Determines the
- * range of search.
- * @param maxTimestampDiff The maximum allowed timestamp difference between
- * overlaping fragments. If exceeded, the fragment are not considered as
- * overlaping.
- * @return The size of the largest found overlaping fragment.
+ * @param word The word to check.
*/
-template
-inline size_t findLargestOverlapingFragment(const Container &suffixVec,
- const Container &prefixVec,
- size_t maxCheckRange = 10,
- float maxTimestampDiff = 100.f) {
- size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange});
-
- if (range == 0) {
- return 0;
- }
-
- // i starts at the index where the suffix of length 'range' begins.
- for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) {
- // We search for overlaps by searching for the first word of prefixVec
- if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) {
- size_t calculatedSize = suffixVec.size() - i;
-
- bool isEqual =
- std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
- [maxTimestampDiff](const Word &sWord, const Word &pWord) {
- return equalsIgnoreCase(sWord.content, pWord.content) &&
- std::max(std::fabs(sWord.start - pWord.start),
- std::fabs(sWord.end - pWord.end)) <=
- maxTimestampDiff;
- });
-
- if (isEqual) {
- return calculatedSize;
- }
- }
- }
-
- return 0;
+inline bool isEos(const Word &word) {
+  const auto &text = word.content; // EOS marks are single-character words.
+  return text.size() == 1 && constants::kEosPunctations.contains(text[0]);
}
} // namespace rnexecutorch::models::speech_to_text::whisper::utils
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 06a30a13f7..1fcad420cc 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -262,7 +262,6 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp
SOURCES
${RNEXECUTORCH_DIR}/models/speech_to_text/SpeechToText.cpp
${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/ASR.cpp
- ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/HypothesisBuffer.cpp
${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/OnlineASR.cpp
${RNEXECUTORCH_DIR}/data_processing/gzip.cpp
${TOKENIZER_SOURCES}
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 159396add8..aec9da1c0f 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -773,32 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
} as const;
// S2T
-const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
+const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
-const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`;
+const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`;
+const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`;
-
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
-
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
-
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
/**
* @category Models - Speech To Text
@@ -806,18 +803,18 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}
export const WHISPER_TINY_EN = {
modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_MODEL,
+ modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_TINY_EN_QUANTIZED = {
- modelName: 'whisper-tiny-en-quantized',
+export const WHISPER_TINY_EN_COREML = {
+ modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_TINY_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
/**
@@ -826,18 +820,18 @@ export const WHISPER_TINY_EN_QUANTIZED = {
export const WHISPER_BASE_EN = {
modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_MODEL,
+ modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_BASE_EN_QUANTIZED = {
- modelName: 'whisper-base-en-quantized',
+export const WHISPER_BASE_EN_COREML = {
+ modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_BASE_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
@@ -846,18 +840,18 @@ export const WHISPER_BASE_EN_QUANTIZED = {
export const WHISPER_SMALL_EN = {
modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_MODEL,
+ modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_SMALL_EN_QUANTIZED = {
- modelName: 'whisper-small-en-quantized',
+export const WHISPER_SMALL_EN_COREML = {
+ modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_SMALL_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
@@ -866,7 +860,17 @@ export const WHISPER_SMALL_EN_QUANTIZED = {
export const WHISPER_TINY = {
modelName: 'whisper-tiny',
isMultilingual: true,
- modelSource: WHISPER_TINY_MODEL,
+ modelSource: WHISPER_TINY_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_TINY_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_COREML = {
+ modelName: 'whisper-tiny',
+ isMultilingual: true,
+ modelSource: WHISPER_TINY_MODEL_COREML,
tokenizerSource: WHISPER_TINY_TOKENIZER,
} as const;
@@ -876,7 +880,17 @@ export const WHISPER_TINY = {
export const WHISPER_BASE = {
modelName: 'whisper-base',
isMultilingual: true,
- modelSource: WHISPER_BASE_MODEL,
+ modelSource: WHISPER_BASE_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_BASE_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_BASE_COREML = {
+ modelName: 'whisper-base',
+ isMultilingual: true,
+ modelSource: WHISPER_BASE_MODEL_COREML,
tokenizerSource: WHISPER_BASE_TOKENIZER,
} as const;
@@ -886,7 +900,17 @@ export const WHISPER_BASE = {
export const WHISPER_SMALL = {
modelName: 'whisper-small',
isMultilingual: true,
- modelSource: WHISPER_SMALL_MODEL,
+ modelSource: WHISPER_SMALL_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_SMALL_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_COREML = {
+ modelName: 'whisper-small',
+ isMultilingual: true,
+ modelSource: WHISPER_SMALL_MODEL_COREML,
tokenizerSource: WHISPER_SMALL_TOKENIZER,
} as const;
@@ -1351,14 +1375,17 @@ export const MODEL_REGISTRY = {
STYLE_TRANSFER_UDNIE,
STYLE_TRANSFER_UDNIE_QUANTIZED,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
- WHISPER_BASE_EN_QUANTIZED,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
- WHISPER_SMALL_EN_QUANTIZED,
+ WHISPER_SMALL_EN_COREML,
WHISPER_TINY,
+ WHISPER_TINY_COREML,
WHISPER_BASE,
+ WHISPER_BASE_COREML,
WHISPER_SMALL,
+ WHISPER_SMALL_COREML,
DEEPLAB_V3_RESNET50,
DEEPLAB_V3_RESNET101,
DEEPLAB_V3_MOBILENET_V3_LARGE,
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index c906851380..229bba73e3 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -5,6 +5,7 @@ import {
SpeechToTextType,
SpeechToTextProps,
TranscriptionResult,
+ StreamingOptions,
} from '../../types/stt';
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -101,7 +102,7 @@ export const useSpeechToText = ({
);
const stream = useCallback(
- async function* (options: DecodingOptions = {}): AsyncGenerator<
+ async function* (options: StreamingOptions = {}): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index 273264e0e2..3890c9ae50 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -2,6 +2,7 @@ import {
DecodingOptions,
SpeechToTextModelConfig,
SpeechToTextModelName,
+ StreamingOptions,
TranscriptionResult,
} from '../../types/stt';
import { ResourceFetcher } from '../../utils/ResourceFetcher';
@@ -174,7 +175,7 @@ export class SpeechToTextModule {
* @yields An object containing `committed` and `nonCommitted` transcription results.
* @returns An async generator yielding transcription updates.
*/
- public async *stream(options: DecodingOptions = {}): AsyncGenerator<{
+ public async *stream(options: StreamingOptions = {}): AsyncGenerator<{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
}> {
@@ -182,6 +183,7 @@ export class SpeechToTextModule {
const verbose = !!options.verbose;
const language = options.language || '';
+ const timeout = options.timeout ?? 100; // `??` so an explicit 0 is not replaced by the default
const queue: {
committed: TranscriptionResult;
@@ -216,7 +218,8 @@ export class SpeechToTextModule {
wake();
},
language,
- verbose
+ verbose,
+ timeout
);
finished = true;
diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts
index 0a6ed11f70..f9a2fb56d8 100644
--- a/packages/react-native-executorch/src/types/stt.ts
+++ b/packages/react-native-executorch/src/types/stt.ts
@@ -7,11 +7,8 @@ import { RnExecutorchError } from '../errors/errorUtils';
*/
export type SpeechToTextModelName =
| 'whisper-tiny-en'
- | 'whisper-tiny-en-quantized'
| 'whisper-base-en'
- | 'whisper-base-en-quantized'
| 'whisper-small-en'
- | 'whisper-small-en-quantized'
| 'whisper-tiny'
| 'whisper-base'
| 'whisper-small';
@@ -94,7 +91,7 @@ export interface SpeechToTextType {
* @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription.
* Both `committed` and `nonCommitted` are of type `TranscriptionResult`
*/
- stream(options?: DecodingOptions | undefined): AsyncGenerator<
+ stream(options?: StreamingOptions | undefined): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
@@ -208,6 +205,15 @@ export interface DecodingOptions {
verbose?: boolean;
}
+/**
+ * Configuration options for the speech-to-text streaming process.
+ * @category Types
+ * @property {number} [timeout] - Time, in milliseconds, that the streamer waits between consecutive model inferences. Defaults to 100 when omitted.
+ */
+export interface StreamingOptions extends DecodingOptions {
+ timeout?: number;
+}
+
/**
* Structure that represent single token with timestamp information.
* @category Types
diff --git a/skills/canary/react-native-executorch/references/reference-models.md b/skills/canary/react-native-executorch/references/reference-models.md
index f6010a7793..02134f4513 100644
--- a/skills/canary/react-native-executorch/references/reference-models.md
+++ b/skills/canary/react-native-executorch/references/reference-models.md
@@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c
### Whisper Models (English only)
-- **WHISPER_TINY_EN** - Whisper Tiny English-only
-- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized
-- **WHISPER_BASE_EN** - Whisper Base English-only
-- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized
-- **WHISPER_SMALL_EN** - Whisper Small English-only
-- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized
+- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK)
+- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML)
+- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK)
+- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML)
+- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK)
+- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML)
### Whisper Models (Multilingual)
-- **WHISPER_TINY** - Whisper Tiny multilingual
-- **WHISPER_BASE** - Whisper Base multilingual
-- **WHISPER_SMALL** - Whisper Small multilingual
+- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK)
+- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML)
+- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK)
+- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML)
+- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK)
+- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML)
---
diff --git a/skills/react-native-executorch/references/reference-models.md b/skills/react-native-executorch/references/reference-models.md
index f6010a7793..02134f4513 100644
--- a/skills/react-native-executorch/references/reference-models.md
+++ b/skills/react-native-executorch/references/reference-models.md
@@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c
### Whisper Models (English only)
-- **WHISPER_TINY_EN** - Whisper Tiny English-only
-- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized
-- **WHISPER_BASE_EN** - Whisper Base English-only
-- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized
-- **WHISPER_SMALL_EN** - Whisper Small English-only
-- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized
+- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK)
+- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML)
+- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK)
+- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML)
+- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK)
+- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML)
### Whisper Models (Multilingual)
-- **WHISPER_TINY** - Whisper Tiny multilingual
-- **WHISPER_BASE** - Whisper Base multilingual
-- **WHISPER_SMALL** - Whisper Small multilingual
+- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK)
+- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML)
+- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK)
+- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML)
+- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK)
+- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML)
---