diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx index 72358ae72c..b67b3fa7ce 100644 --- a/apps/llm/app/index.tsx +++ b/apps/llm/app/index.tsx @@ -29,12 +29,6 @@ export default function Home() { > LLM Structured Output - router.navigate('voice_chat/')} - > - Voice Chat - router.navigate('multimodal_llm/')} diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx deleted file mode 100644 index 23ab70bff4..0000000000 --- a/apps/llm/app/voice_chat/index.tsx +++ /dev/null @@ -1,311 +0,0 @@ -import { useContext, useEffect, useState } from 'react'; -import { - Keyboard, - KeyboardAvoidingView, - Platform, - StyleSheet, - Text, - TouchableOpacity, - TouchableWithoutFeedback, - View, -} from 'react-native'; -import SWMIcon from '../../assets/icons/swm_icon.svg'; -import Spinner from '../../components/Spinner'; -import ErrorBanner from '../../components/ErrorBanner'; -import { - useSpeechToText, - useLLM, - QWEN3_0_6B_QUANTIZED, - QWEN3_1_7B_QUANTIZED, - LLAMA3_2_1B_SPINQUANT, - WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, - WHISPER_BASE_EN, - WHISPER_SMALL_EN, - LLMProps, - SpeechToTextProps, -} from 'react-native-executorch'; -import { ModelPicker, ModelOption } from '../../components/ModelPicker'; -import PauseIcon from '../../assets/icons/pause_icon.svg'; -import MicIcon from '../../assets/icons/mic_icon.svg'; -import StopIcon from '../../assets/icons/stop_icon.svg'; -import ColorPalette from '../../colors'; -import Messages from '../../components/Messages'; -import { AudioManager, AudioRecorder } from 'react-native-audio-api'; -import DeviceInfo from 'react-native-device-info'; -import { useIsFocused } from '@react-navigation/native'; -import { useSafeAreaInsets } from 'react-native-safe-area-context'; -import { GeneratingContext } from '../../context'; - -type LLMModelSources = LLMProps['model']; -type STTModelSources = SpeechToTextProps['model']; - -const LLM_MODELS: ModelOption[] = [ - { label: 'Qwen3 0.6B', value: 
QWEN3_0_6B_QUANTIZED }, - { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED }, - { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT }, -]; - -const STT_MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, -]; - -export default function VoiceChatScreenWrapper() { - const isFocused = useIsFocused(); - - return isFocused ? : null; -} - -function VoiceChatScreen() { - const { bottom } = useSafeAreaInsets(); - const [isRecording, setIsRecording] = useState(false); - const [liveTranscription, setLiveTranscription] = useState(''); - const [selectedLLM, setSelectedLLM] = - useState(QWEN3_0_6B_QUANTIZED); - const [selectedSTT, setSelectedSTT] = - useState(WHISPER_TINY_EN); - const [error, setError] = useState(null); - - const [recorder] = useState(() => new AudioRecorder()); - - const { setGlobalGenerating } = useContext(GeneratingContext); - - const llm = useLLM({ model: selectedLLM }); - const speechToText = useSpeechToText({ - model: selectedSTT, - }); - - useEffect(() => { - setGlobalGenerating(llm.isGenerating || speechToText.isGenerating); - }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]); - - useEffect(() => { - AudioManager.setAudioSessionOptions({ - iosCategory: 'playAndRecord', - iosMode: 'spokenAudio', - iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], - }); - AudioManager.requestRecordingPermissions(); - }, []); - - const handleRecordPress = async () => { - if (isRecording) { - setIsRecording(false); - recorder.stop(); - speechToText.streamStop(); - } else { - setIsRecording(true); - setLiveTranscription(''); - - const sampleRate = 16000; - recorder.onAudioReady( - { - sampleRate, - bufferLength: 0.1 * sampleRate, - channelCount: 1, - }, - ({ buffer }) => { - speechToText.streamInsert(buffer.getChannelData(0)); - } - ); - 
recorder.start(); - - let finalResult = ''; - - try { - for await (const result of speechToText.stream()) { - const text = result.committed.text + result.nonCommitted.text; - setLiveTranscription(text); - finalResult = text; - } - } catch (e) { - setError(e instanceof Error ? e.message : String(e)); - } finally { - if (finalResult.trim().length > 0) { - await llm.sendMessage(finalResult); - setLiveTranscription(''); - } - } - } - }; - - useEffect(() => { - if (llm.error) setError(String(llm.error)); - }, [llm.error]); - - useEffect(() => { - if (speechToText.error) setError(String(speechToText.error)); - }, [speechToText.error]); - - return (!llm.isReady || !speechToText.isReady) && - !llm.error && - !speechToText.error ? ( - - ) : ( - - - - - Qwen 3 x Whisper - - setError(null)} /> - {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? ( - - 0 - ? [ - ...llm.messageHistory, - { - role: 'user', - content: liveTranscription, - }, - ] - : llm.messageHistory - } - llmResponse={llm.response} - isGenerating={llm.isGenerating} - deleteMessage={llm.deleteMessage} - /> - - ) : ( - - Hello! 👋 - - Tap the mic and speak to me. I'll transcribe your voice and - respond using a language model — all on-device. - - - )} - - setSelectedLLM(m)} - /> - setSelectedSTT(m)} - /> - - - {DeviceInfo.isEmulatorSync() ? ( - - - recording disabled on emulator - - - ) : ( - <> - {llm.isGenerating ? ( - - - - ) : ( - - {isRecording ? 
( - - ) : ( - - )} - - )} - - )} - - - - ); -} - -const styles = StyleSheet.create({ - keyboardAvoidingView: { - flex: 1, - }, - topContainer: { - height: 68, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - chatContainer: { - flex: 10, - width: '100%', - }, - textModelName: { - color: ColorPalette.primary, - }, - helloMessageContainer: { - flex: 10, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - helloText: { - fontFamily: 'medium', - fontSize: 30, - color: ColorPalette.primary, - }, - bottomHelloText: { - fontFamily: 'regular', - fontSize: 20, - lineHeight: 28, - textAlign: 'center', - color: ColorPalette.primary, - }, - bottomContainer: { - height: 100, - width: '100%', - justifyContent: 'center', - alignItems: 'center', - paddingHorizontal: 16, - }, - recordTouchable: { - height: '100%', - justifyContent: 'center', - alignItems: 'center', - }, - recordingInfo: { - width: '100%', - display: 'flex', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorBox: { - padding: 10, - margin: 10, - borderWidth: 1, - borderRadius: 8, - borderColor: 'gray', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorWarning: { - color: 'gray', - fontSize: 16, - }, -}); diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index dfd39c15b4..2942d5e718 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -14,9 +14,11 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, + WHISPER_TINY_EN_COREML, WHISPER_BASE_EN, + WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, + WHISPER_SMALL_EN_COREML, TranscriptionResult, SpeechToTextProps, } from 'react-native-executorch'; @@ -25,10 +27,12 @@ import { ModelPicker, ModelOption } from '../components/ModelPicker'; type STTModelSources = SpeechToTextProps['model']; const 
MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, + { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN }, + { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML }, + { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN }, + { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML }, + { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN }, + { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML }, ]; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { @@ -45,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner'; const isSimulator = DeviceInfo.isEmulatorSync(); +const DEFAULT_MODEL = + Platform.OS === 'ios' ? WHISPER_BASE_EN_COREML : WHISPER_TINY_EN; + export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const [selectedModel, setSelectedModel] = - useState(WHISPER_TINY_EN); + useState(DEFAULT_MODEL); const model = useSpeechToText({ model: selectedModel, @@ -148,7 +155,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.current.onAudioReady( { sampleRate, - bufferLength: 0.1 * sampleRate, + bufferLength: 0.1 * sampleRate, // 100 ms channelCount: 1, }, ({ buffer }) => { @@ -178,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const streamIter = model.stream({ verbose: enableTimestamps, + timeout: 100, }); for await (const { committed, nonCommitted } of streamIter) { diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md index 089b844eb0..dc9f88179c 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md +++ 
b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md @@ -17,20 +17,31 @@ keywords: description: "Learn how to use speech-to-text models in your React Native applications with React Native ExecuTorch's useSpeechToText hook." --- -Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. +Speech to text (STT) converts spoken audio into written text. This hook allows you to implement features like voice assistants, real-time transcription, and audio file processing directly on-device. :::info -It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +We recommend using our optimized models available on [Hugging Face](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use pre-defined [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) included in the library. ::: ## API Reference -- For detailed API Reference for `useSpeechToText` see: [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md). -- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text). 
+- [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md) +- [STT Models List](../../06-api-reference/index.md#models---speech-to-text) -## High Level Overview +## Basic Usage (File Transcription) -You can obtain waveform from audio in any way most suitable to you, however in the snippet below we utilize [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) library to process a `.mp3` file. +Use `transcribe` for processing pre-recorded audio or short clips. The input should be a `Float32Array` of audio samples at **16 kHz**. + +### Transcribe Options + +The `transcribe()` function accepts an optional configuration object: + +- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models. +- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps). + +In the example below, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to decode an audio file into the required format. + +### Example ```typescript import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; @@ -41,181 +52,46 @@ const model = useSpeechToText({ model: WHISPER_TINY_EN, }); +// 1. Get audio file const { uri } = await FileSystem.downloadAsync( 'https://some-audio-url.com/file.mp3', - FileSystem.cacheDirectory + 'audio_file' + `${FileSystem.cacheDirectory}audio_file` ); +// 2. Decode to 16 kHz PCM Float32Array const audioContext = new AudioContext({ sampleRate: 16000 }); const decodedAudioData = await audioContext.decodeAudioData(uri); const audioBuffer = decodedAudioData.getChannelData(0); +// 3. 
Transcribe try { - const transcription = await model.transcribe(audioBuffer); - console.log(transcription.text); + const result = await model.transcribe(audioBuffer); + console.log('Transcription:', result.text); } catch (error) { - console.error('Error during audio transcription', error); + console.error('Transcription failed:', error); } ``` -### Streaming - -Since speech-to-text models can only process audio segments up to 30 seconds long, we need to split longer inputs into chunks. However, simple chunking may cut speech mid-sentence, making it harder for the model to understand. To address this, we use the [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) algorithm. While this introduces some overhead, it enables accurate processing of audio inputs of arbitrary length. - -### Arguments - -`useSpeechToText` takes [`SpeechToTextProps`](../../06-api-reference/interfaces/SpeechToTextProps.md) that consists of: - -- `model` of type [`SpeechToTextConfig`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md), containing the [`isMultilingual` flag](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual), [tokenizer source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#tokenizersource) and [model source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource). -- An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model. - -You need more details? Check the following resources: +## Live Streaming Transcription -- For detailed information about `useSpeechToText` arguments check this section: [`useSpeechToText` arguments](../../06-api-reference/functions/useSpeechToText.md#parameters) -- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text). 
-- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. - -### Returns - -`useSpeechToText` returns an object called `SpeechToTextType` containing bunch of functions to interact with STT. - -Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) functions accept [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) type as an argument. It accepts language abbreviation, you can check them out in [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) property of this config of type [`SpeechToTextLanguage`](../../06-api-reference/type-aliases/SpeechToTextLanguage.md). - -To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md). - -## Running the model - -Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails. - -### Multilingual transcription - -If you want to transcribe speech in languages other than English, use the multilingual version of Whisper. To generate the output in your desired language, pass the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) option to the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method. - -```typescript -import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch'; +For real-time applications or audio streams of arbitrary length, use the **Streaming API**. 
This API is optimized for live input and automatically handles the 30-second window limitation of Whisper models, so context isn't lost between chunks. -const model = useSpeechToText({ - model: WHISPER_TINY, -}); +### How it works -const transcription = await model.transcribe(spanishAudio, { language: 'es' }); -``` +1. **Feed audio**: Use `streamInsert` to push small chunks of audio (e.g., 100 ms) as they arrive from the microphone. +2. **Get results**: The `stream` generator yields two types of text: + - `committed`: Finalized text that won't change. + - `nonCommitted`: Temporary text that might update as the model gets more context from the audio. -### Timestamps & Transcription Stat Data +### Streaming Options -You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References. +The `stream()` function accepts several optional parameters: -```typescript -const transcription = await model.transcribe(audioBuffer, { verbose: true }); -// Example result -// -// transcription: { -// task: "transcription", -// text: "Example text for a ...", -// duration: 9.05, -// language: "en", -// segments: [ -// { -// start: 0, -// end: 5.4, -// text: "Example text for", -// words: [ -// { -// word: "Example", -// start: 0, -// end: 1.4 -// }, -// ... -// ] -// tokens: [1, 32, 45, ...], -// temperature: 0.0, -// avgLogprob: -1.235, -// compressionRatio: 1.632 -// }, -// ... 
-// ] -// } -``` +- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models. +- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects. +- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`. -## Example - -```tsx -import React, { useState } from 'react'; -import { Button, Text, View } from 'react-native'; -import { - useSpeechToText, - WHISPER_TINY_EN, - TranscriptionResult, -} from 'react-native-executorch'; -import { AudioContext } from 'react-native-audio-api'; -import * as FileSystem from 'expo-file-system'; - -function App() { - const model = useSpeechToText({ - model: WHISPER_TINY_EN, - }); - - const [transcription, setTranscription] = useState(null); - - const loadAudio = async () => { - const { uri } = await FileSystem.downloadAsync( - 'https://some-audio-url.com/file.mp3', - FileSystem.cacheDirectory + 'audio_file' - ); - - const audioContext = new AudioContext({ sampleRate: 16000 }); - const decodedAudioData = await audioContext.decodeAudioDataSource(uri); - const audioBuffer = decodedAudioData.getChannelData(0); - - return audioBuffer; - }; - - const handleTranscribe = async () => { - const audio = await loadAudio(); - // Default text transcription - const result = await model.transcribe(audio); - setTranscription(result); - }; - - const handleTranscribeWithTimestamps = async () => { - const audio = await loadAudio(); - // Transcription with timestamps - const result = await model.transcribe(audio, { verbose: true }); - setTranscription(result); - }; - - // Custom logic for printing transcription - // e.g. 
- - const renderContent = () => { - if (!transcription) return Press a button to transcribe; - - if (transcription.segments && transcription.segments.length > 0) { - return ( - - {transcription.text + - '\n\nNum segments: ' + - transcription.segments.length.toString()} - - ); - } - return {transcription.text}; - }; - - return ( - - {renderContent()} -