From bc31cd5528a71612225e4e67f98f1db15ccd8282 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Thu, 7 May 2026 12:10:30 +0200 Subject: [PATCH 01/20] Optimal streaming algorithm --- .../host_objects/JsiConversions.h | 3 +- .../models/speech_to_text/SpeechToText.cpp | 4 +- .../speech_to_text/common/schema/OnlineASR.h | 2 +- .../models/speech_to_text/common/types/Word.h | 7 +- .../models/speech_to_text/whisper/ASR.cpp | 16 +- .../models/speech_to_text/whisper/Constants.h | 15 +- .../whisper/HypothesisBuffer.cpp | 199 -------------- .../speech_to_text/whisper/HypothesisBuffer.h | 82 ------ .../speech_to_text/whisper/OnlineASR.cpp | 258 +++++++++++------- .../models/speech_to_text/whisper/OnlineASR.h | 65 ++--- .../models/speech_to_text/whisper/Params.h | 95 ++----- .../models/speech_to_text/whisper/Utils.h | 67 +---- 12 files changed, 245 insertions(+), 568 deletions(-) delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 077d426c8f..c50410a4f7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -591,8 +591,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) { jsi::Object wordObj(runtime); wordObj.setProperty( runtime, "word", - jsi::String::createFromUtf8(runtime, seg.words[i].content + - seg.words[i].punctations)); + jsi::String::createFromUtf8(runtime, seg.words[i].content)); wordObj.setProperty(runtime, "start", static_cast(seg.words[i].start)); wordObj.setProperty(runtime, "end", static_cast(seg.words[i].end)); diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 4b58c5039b..3133c0bb29 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector &words, std::string fullText; for (const auto &w : words) { - fullText += w.content + w.punctations; + fullText += w.content; } res.text = fullText; @@ -161,7 +161,7 @@ void SpeechToText::stream(std::shared_ptr callback, std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - std::vector finalWords = streamer_->finish(); + std::vector finalWords = streamer_->finish(options); TranscriptionResult finalRes = wordsToResult(finalWords, languageOption, verbose); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h index 357309391d..efe6cc2819 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h @@ -36,7 +36,7 @@ class OnlineASR { virtual ProcessResult process(const DecodingOptions &options) = 0; - virtual std::vector finish() = 0; + virtual std::vector finish(const DecodingOptions &options) = 0; virtual void reset() = 0; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h index e7319f95b5..2343d1faab 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h @@ -4,13 +4,14 @@ namespace rnexecutorch::models::speech_to_text { +/** + * Basically a different representation of token, + * with timestamps calculated. + */ struct Word { std::string content; float start; float end; - - std::string - punctations; // Trailing punctations which appear after the main content }; } // namespace rnexecutorch::models::speech_to_text diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index d1debeb0f0..b0d08e419b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -138,8 +138,9 @@ executorch::aten::Tensor ASR::decode(std::span tokens, positionShape, cachePositions.data(), ScalarType::Long); const auto encoderOutputSize = static_cast(encoderOutput.size()); - std::vector encShape = {1, constants::kNumFrames, - encoderOutputSize / constants::kNumFrames}; + std::vector encShape = { + 1, static_cast(constants::kNumFrames), + encoderOutputSize / static_cast(constants::kNumFrames)}; auto encoderTensor = executorch::extension::make_tensor_ptr( std::move(encShape), const_cast(encoderOutput.data()), ScalarType::Float); @@ -437,7 +438,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, const float wEnd = wStart + timePerChar * wSize; prevCharCount += wSize; - // We store punctations separately to other characters. + // Detect and extract trailing punctuations. 
std::string puncts = ""; while (!w.empty() && constants::kPunctations.contains(w.back())) { puncts += w.back(); @@ -445,7 +446,14 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, } std::reverse(puncts.begin(), puncts.end()); - wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts)); + // Add the core word. + wordObjs.emplace_back(std::move(w), wStart, wEnd); + + // If punctuation was present, add it as a separate "word" with an + // instantaneous timestamp at the end of the original word. + if (!puncts.empty()) { + wordObjs.emplace_back(std::move(puncts), wEnd, wEnd); + } } return wordObjs; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h index 0b284345ec..30062a75ba 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h @@ -9,20 +9,22 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants { // Maximum duration of each audio chunk to process (in seconds) // It is intentionally set to 29 since otherwise only the last chunk would be // correctly transcribe due to the model's positional encoding limit -constexpr static int32_t kChunkSize = 29; +constexpr static size_t kChunkSize = 29; // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz) -constexpr static int32_t kSamplingRate = 16000; -constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000; +constexpr static size_t kSamplingRate = 16000; +constexpr static size_t kSamplesPerMilisecond = kSamplingRate / 1000; + +constexpr static size_t kMaxSamples = kChunkSize * kSamplingRate; // The maximum number of tokens the decoder can generate per chunk -constexpr static int32_t kMaxDecodeLength = 128; +constexpr static size_t kMaxDecodeLength = 128; 
// Minimum allowed chunk length before processing (in audio samples) -constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate; +constexpr static size_t kMinChunkSamples = 1 * kSamplingRate; // Number of mel frames output by the encoder (derived from input spectrogram) -constexpr static int32_t kNumFrames = 1500; +constexpr static size_t kNumFrames = 1500; // Time precision used by Whisper timestamps: each token spans 0.02 seconds constexpr static float kTimePrecision = 0.02f; @@ -30,6 +32,7 @@ constexpr static float kTimePrecision = 0.02f; // Special characters serving as pause / end of sentence static const std::unordered_set kPunctations = {',', '.', '?', '!', ':', ';'}; +static const std::unordered_set kEosPunctations = {'.', '?', '!', ';'}; // Special token constants namespace tokens { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp deleted file mode 100644 index ce365e4e44..0000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp +++ /dev/null @@ -1,199 +0,0 @@ -#include "HypothesisBuffer.h" -#include "Params.h" -#include "Utils.h" - -#include -#include - -namespace rnexecutorch::models::speech_to_text::whisper::stream { - -void HypothesisBuffer::insert(std::span words, float offset) { - // Step 1 - decide which words should be considered as fresh. - fresh_.clear(); - - // We try to find the last committed word in a transcription string. - // Everything beyond that word will be considered as fresh. - // To make the algorithm more resilient to repeated strings of words, - // we check also the preceeding words as well as timestamps (with liberal - // range). 
- size_t firstFreshWordIdx = 0; - if (!committed_.empty()) { - std::optional lastMatchingWordIdx = - findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize, - params::kStreamMaxOverlapTimestampDiff1, - params::kStreamWordsPerErrorRate); - firstFreshWordIdx = lastMatchingWordIdx.value_or(0); - } - - bool isCompletelyFresh = firstFreshWordIdx == 0; - for (size_t i = firstFreshWordIdx; i < words.size(); i++) { - const auto &word = words[i]; - - // Global start is a beginning timestamp relative only to the beginning of - // the current streaming process. - const float startGlobal = word.start + offset; - const float endGlobal = word.end + offset; - - if (!isCompletelyFresh || - startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) { - fresh_.emplace_back(word.content, startGlobal, endGlobal, - word.punctations); - } - } - - // Step 2 - we have already selected the fresh words. Now it's time to - // correct any mistakes and remove the words which overlap with already - // commited segments - to avoid duplicates. - if (!fresh_.empty() && !committed_.empty()) { - // Calculate the largest overlapping fragment size. - // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the - // algorithm, and timestamp difference limit - // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments - // which were just repeated after some time. - size_t overlapSize = utils::findLargestOverlapingFragment( - committed_, fresh_, params::kStreamMaxOverlapSize, - params::kStreamMaxOverlapTimestampDiff2); - - if (overlapSize > 0) { - fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize); - } - } -} - -std::deque HypothesisBuffer::commit() { - std::deque toCommit = {}; - - // Find a stable prefix: words that haven't changed between last and current - // iteration. 
- while (!fresh_.empty() && !hypothesis_.empty() && - fresh_.front().content == hypothesis_.front().content) { - // The last word from the fresh_ buffer must also match punctations with the - // hypothesis. This is done in order to ensure correct punctation marks in - // the resulting transcription. - if (fresh_.size() == 1 && - fresh_.front().punctations != hypothesis_.front().punctations) { - break; - } - - // Take timestamps from the hypothesis, but actual content from the fresh - // buffer. - toCommit.emplace_back(std::move(fresh_.front().content), - hypothesis_.front().start, hypothesis_.front().end, - std::move(fresh_.front().punctations)); - fresh_.pop_front(); - hypothesis_.pop_front(); - } - - // Save the last committed word timestamp. - // This will mark the end of the entire committed sequence. - if (!toCommit.empty()) { - lastCommittedTime_ = toCommit.back().end; - } - - // The remaining words from the fresh buffer (uncommitted phrase) - // become a hypothesis for the next iteration. - hypothesis_ = std::move(fresh_); - fresh_.clear(); - - // The last step is to commit the selected words. - committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend()); - - return toCommit; -} - -void HypothesisBuffer::releaseCommits(size_t wordsToKeep) { - if (committed_.size() > wordsToKeep) { - size_t nWordsToErase = committed_.size() - wordsToKeep; - committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase); - } -} - -void HypothesisBuffer::reset() { - fresh_.clear(); - hypothesis_.clear(); - committed_.clear(); - - lastCommittedTime_ = 0.f; -} - -std::optional HypothesisBuffer::findCommittedSuffix( - std::span words, size_t nCommitted, - float timestampDiffTolerance, size_t wordsPerMistake) { - if (words.empty() || committed_.empty() || nCommitted == 0) { - return std::nullopt; - } - - // Determine the subset size of committed words to check against. 
- size_t committedToMatchSize = std::min(nCommitted, committed_.size()); - - // Iterate backwards through 'words' to find the most recent occurrence of a - // suffix of 'committed_' (or the full 'committed_' sequence). - for (int32_t i = static_cast(words.size()) - 1; i >= 0; --i) { - bool match = true; - size_t matchedCount = 0; - size_t contentMistakeCount = 0; - - // Linearly interpolate tolerance if we are at the beginning and can't check - // all committed words. - float effectiveTolerance = timestampDiffTolerance; - if (i < static_cast(committedToMatchSize) - 1) { - effectiveTolerance *= - static_cast(i + 1) / static_cast(committedToMatchSize); - } - - // Try to match backwards from words[i] and committed_.back() - for (size_t j = 0; j < committedToMatchSize; ++j) { - int32_t wordsIdx = i - static_cast(j); - int32_t committedIdx = - static_cast(committed_.size()) - 1 - static_cast(j); - - if (wordsIdx < 0) { - // We reached the beginning of the words span. - // The algorithm allows matching a partial prefix if it's at the start. - break; - } - - const Word &w1 = words[wordsIdx]; - const Word &w2 = committed_[committedIdx]; - - // Check timestamps within tolerance - if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) > - effectiveTolerance) { - match = false; - break; - } - - // Allow sparse content mismatches while still treating the overall - // sequence as matching. - if (utils::equalsIgnoreCase(w1.content, w2.content)) { - matchedCount++; - } else { - contentMistakeCount++; - } - - // Early exit if mistake count already exceeds what we can recover from - // given the remaining words to check. - if (wordsPerMistake > 0) { - size_t remainingToMatch = committedToMatchSize - 1 - j; - size_t maxPossibleMatched = matchedCount + remainingToMatch; - if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) { - match = false; - break; - } - } - } - - // One content mistake is allowed per M matched words. 
- size_t maxAllowedMistakes = - (wordsPerMistake == 0) ? 0 : (matchedCount / wordsPerMistake); - - if (match && matchedCount > 0 && - contentMistakeCount <= maxAllowedMistakes) { - return static_cast(i); - } - } - - return std::nullopt; -} - -} // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h deleted file mode 100644 index 25833ec01b..0000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "../common/types/Word.h" - -namespace rnexecutorch::models::speech_to_text::whisper::stream { - -/** - * A buffer for managing streaming transcription hypotheses. - * This class handles stabilization of the transcription result by tracking - * "fresh" hypotheses and "committing" them once they are stable across updates. - */ -class HypothesisBuffer { -public: - /** - * Inserts new words into the fresh_ buffer. - * Words are filtered based on the last committed time and checked for - * overlaps with existing committed words to prevent duplicates. - * - * @param newWords A span of recently generated words. - * @param offset Time offset to adjust the word timestamps. - */ - void insert(std::span words, float offset); - - /** - * Attempts to commit words present in the fresh_ buffer. - * A phrase from fresh_ buffer can only be committed if it also appears - * in the hypothesis_ buffer (uncommitted words from previous iteration). - * - * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_ - * buffer. - * - * @return A sequence of words committed in the current iteration. 
- */ - std::deque commit(); - - /** - * Shrinks the committed_ buffer by erasing all words except N latest ones. - * - * Used primarily to relieve increasing memory usage during very - * long streaming sessions. - * - * @param wordsToKeep - number of trailing words to be kept in. - */ - void releaseCommits(size_t wordsToKeep); - - /** - * Resets all the stored buffers and state variables to the initial state - */ - void reset(); - - // Declare a friendship with OnlineASR to allow it to access the internal - // state of stored buffers. - friend class OnlineASR; - -private: - // Finds the most recent occurance of given committed string of words - // in a custom span of words. - // Returns the index of the last matching word (or nullopt if not present). - std::optional findCommittedSuffix(std::span words, - size_t nCommitted, - float timestampDiffTolerance = 1.F, - size_t wordsPerMistake = 4); - - // Stored buffers - // The lifecycle of a correct result word looks as following: - // fresh buffer -> hypothesis buffer -> commited - std::deque - fresh_; // 'New' words from current iterations, which require some checks - // before they go into hypothesis_ buffer. - std::deque - hypothesis_; // Words potentially to be commited, stored between - // iterations (obtained from fresh_ buffer). - std::deque committed_; // A history of already commited words. 
- - float lastCommittedTime_ = 0.0f; -}; - -} // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp index ded2183201..fb57fcb0f3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp @@ -1,26 +1,22 @@ +#include "OnlineASR.h" + #include #include -#include -#include +#include #include "Constants.h" -#include "OnlineASR.h" #include "Params.h" #include "Utils.h" namespace rnexecutorch::models::speech_to_text::whisper::stream { -namespace { -std::vector move_to_vector(std::deque &container) { - return std::vector(std::make_move_iterator(container.begin()), - std::make_move_iterator(container.end())); +OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) { + // Reserve an expected amount of memory for audio buffer. + audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate); } -} // namespace -OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) { - // Reserve a minimal expected amount of memory for audio buffer. 
- audioBuffer_.reserve(static_cast(2 * params::kStreamChunkThreshold * - constants::kSamplingRate)); +bool OnlineASR::isReady() const { + return audioBuffer_.size() >= constants::kMinChunkSamples; } void OnlineASR::insertAudioChunk(std::span audio) { @@ -28,10 +24,6 @@ void OnlineASR::insertAudioChunk(std::span audio) { audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end()); } -bool OnlineASR::isReady() const { - return audioBuffer_.size() >= constants::kMinChunkSamples; -} - ProcessResult OnlineASR::process(const DecodingOptions &options) { std::vector audioCopy; @@ -42,122 +34,176 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { audioCopy = audioBuffer_; } - std::vector transcriptions = asr_->transcribe(audioBuffer_, options); + // Obtain a transcription for current audio buffer state. + // It's very unlikely that buffer will exceed whisper's maximum capacity, but + // for absolute safety we can additionally clip the buffer. + std::span input( + audioCopy.begin(), + audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size())); - if (transcriptions.empty()) { - return {.committed = {}, .nonCommitted = {}}; - } + std::vector transcriptions = asr_->transcribe(input, options); // Flatten segments into a single word sequence. + // This is basically our 'nonCommitted' part for now. std::vector words; - words.reserve(transcriptions.front().words.size()); - for (auto &segment : transcriptions) { - words.insert(words.end(), std::make_move_iterator(segment.words.begin()), - std::make_move_iterator(segment.words.end())); + std::move(segment.words.begin(), segment.words.end(), + std::back_inserter(words)); } - hypothesisBuffer_.insert(words, bufferTimeOffset_); - - // Apply fix for timestamps. 
- if (!hypothesisBuffer_.fresh_.empty()) { - size_t noNewWords = hypothesisBuffer_.fresh_.size(); - float establishedEnd = hypothesisBuffer_.lastCommittedTime_; - float newBegin = hypothesisBuffer_.fresh_.front().start; - const float newEnd = hypothesisBuffer_.fresh_.back().end; - float shift = 0.F; - for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) { - const float originalEnd = hypothesisBuffer_.fresh_[i].end; - - if (i < hypothesisBuffer_.hypothesis_.size() && - utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content, - hypothesisBuffer_.hypothesis_[i].content)) { - hypothesisBuffer_.fresh_[i].start = - hypothesisBuffer_.hypothesis_[i].start; - hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end; - shift = hypothesisBuffer_.fresh_[i].end - originalEnd; - - establishedEnd = hypothesisBuffer_.hypothesis_[i].end; - newBegin = hypothesisBuffer_.fresh_[i].end; - noNewWords--; - continue; - } - - // In case of a new word, we apply timestamp range scaling - // based on timestamps established in previous iterations. - const float freshDuration = newEnd - establishedEnd; - const float epsilon = std::max( - 0.F, 0.85F * (freshDuration - - static_cast(noNewWords / - params::kStreamWordsPerSecond))); - float scale = - (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F); - hypothesisBuffer_.fresh_[i].start = - shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd; - hypothesisBuffer_.fresh_[i].end = - shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd; + std::vector committed; + + // Step 1: examine all previously saved EOS points. + // The idea is to remove entries which have changed or no longer exist + // due to model correcting it's output. 
+ for (size_t i = 0; i < eos_.size(); i++) { + const auto &eos = eos_[i]; + if (eos.position >= words.size() || !utils::isEos(words[eos.position]) || + (eos.position > 0 && + eos.preceeding != words[eos.position - 1].content)) { + eos_.erase(eos_.begin() + i, eos_.end()); + break; } } - auto committed = hypothesisBuffer_.commit(); - auto nonCommitted = hypothesisBuffer_.hypothesis_; - - // We want to save the most recent end of sentence word - // to improve the audio cutting mechanism. - for (const auto &word : committed) { - if (!word.punctations.empty()) { - lastSentenceEnd_ = word.end; + // Step 2: check if the newest EOS character from transcript should be + // saved to eos_ vector. + auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos); + if (lastEosIt != words.rend()) { + size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1; + + // Because of step 1, we know that if the last EOS exist in eos_, + // then it must be the last entry. + if (eos_.empty() || eos_.back().position != lastEosIndex) { + // Register last EOS entry + std::string preceeding = + lastEosIndex > 0 ? words[lastEosIndex - 1].content : ""; + eos_.emplace_back(lastEosIndex, preceeding, lastEosIt->end); } } - // Since Whisper does not accept waveforms longer than 30 seconds, we need - // to cut the audio at some safe point. - { + // Step 3: clear the buffer if it is getting too large. + // The idea is to use the saved EOS entries and try to cut the buffer + // in a 'good' spot - where it will remove a significant audio chunk, yet + // won't affect most recent, unfinished speech samples. + size_t bufferSize = audioBuffer_.size(); + if (bufferSize > static_cast(params::kStreamSafeBufferDuration * + constants::kSamplingRate)) { + // Setup the lock for the entire cleanup section. 
std::scoped_lock lock(audioBufferMutex_); - const float audioDuration = - static_cast(audioBuffer_.size()) / constants::kSamplingRate; - if (audioDuration > params::kStreamChunkThreshold) { - // Leave some portion of audio in, to improve model behavior - // in future iterations. - const float erasePoint = - hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_ - ? audioDuration - : std::min(lastSentenceEnd_, params::kStreamChunkThreshold); - const float minEraseDuration = - audioDuration - params::kStreamAudioBufferMaxReserve; - const float maxEraseDuration = - audioDuration - params::kStreamAudioBufferMinReserve; - const float eraseDuration = std::clamp( - erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration); - const size_t nSamplesToErase = - static_cast(eraseDuration * constants::kSamplingRate); - - audioBuffer_.erase(audioBuffer_.begin(), - audioBuffer_.begin() + nSamplesToErase); - bufferTimeOffset_ += eraseDuration; + const float midBufferThreshold = params::kStreamMaxDuration / 2.0F; + + // If we don't have any EOS entries, then we most likely have not + // recorded any speech. In this case we can safely cut the maximum amount of + // audio data. + if (eos_.empty()) { + size_t cut = bufferSize - params::kStreamSafetyThreshold; + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + } + + // If we have exactly one (most recent) EOS entry in the eos_, then + // we need to be more careful. + // Normally we want to keep at least one sentence in, but if the sentence + // covers a significant amount of buffer, we have no choice. + else if (eos_.size() == 1) { + const float eosTimestamp = eos_[0].tmstpend; + + const float upperHalfDuration = + std::max(0.0F, eosTimestamp - midBufferThreshold); + const float wordsPerSecond = + upperHalfDuration > 0.1F + ? static_cast(words.size()) / upperHalfDuration + : 0.0F; + + // The EOS sits early enough that cutting up to the safety margin won't + // touch the ongoing (post-EOS) speech. 
+ const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration - + params::kStreamSafetyThreshold; + + if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) { + // EOS lies past the midpoint, but a low word density implies the spoken + // audio is concentrated in the upper half. Drop the lower half and + // shift the EOS accordingly. + audioBuffer_.erase(audioBuffer_.begin(), + audioBuffer_.begin() + + static_cast(midBufferThreshold * + constants::kSamplingRate)); + eos_[0].tmstpend -= midBufferThreshold; + } else { + // Cut everything up to and including the sentence — either by the + // safety margin (when EOS is early) or (more aggresively) right at the + // EOS boundary — and commit its words. + const size_t cut = + eosSafe + ? bufferSize - + static_cast(params::kStreamSafetyThreshold * + constants::kSamplingRate) + : static_cast(eosTimestamp * constants::kSamplingRate); + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + + committed.insert(committed.end(), + std::make_move_iterator(words.begin()), + std::make_move_iterator(words.end())); + + words.clear(); + eos_.clear(); + } + } + + // In case of 2 or more sentences, we generally want to keep the last one + // intact. This would provide a bit of stability to the algorithm. + else { + const auto &secondTolastEntry = eos_[eos_.size() - 2]; + + const size_t cut = static_cast(secondTolastEntry.tmstpend * + constants::kSamplingRate); + const size_t lastCommittedPos = secondTolastEntry.position; + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + + // Move all words up to the last committed position (inclusive) to the + // committed buffer. + committed.insert( + committed.end(), std::make_move_iterator(words.begin()), + std::make_move_iterator(words.begin() + lastCommittedPos + 1)); + words.erase(words.begin(), words.begin() + lastCommittedPos + 1); + + // Retain only the most recent EOS entry. 
+ eos_.erase(eos_.begin(), eos_.end() - 1); + eos_[0].tmstpend -= secondTolastEntry.tmstpend; } } - return {.committed = move_to_vector(committed), - .nonCommitted = move_to_vector(nonCommitted)}; + // Return the results + // Note that uncommitted part represented by recent transcription (words) + // is already shrinked if something has been committed during the cleanup + // phase. + return {.committed = std::move(committed), .nonCommitted = std::move(words)}; } -std::vector OnlineASR::finish() { - // We always push the last remaining hypothesis, even if it's not - // confirmed in second iteration, to avoid ending up with broken sentences. - std::deque remaining = hypothesisBuffer_.hypothesis_; +std::vector OnlineASR::finish(const DecodingOptions &options) { + ProcessResult result = process(options); + + // Last-tick committed delta + whatever never made it past the commit + // threshold. + std::vector residual = std::move(result.committed); + residual.insert(residual.end(), + std::make_move_iterator(result.nonCommitted.begin()), + std::make_move_iterator(result.nonCommitted.end())); - return move_to_vector(remaining); + reset(); + + return residual; } void OnlineASR::reset() { std::scoped_lock lock(audioBufferMutex_); - - hypothesisBuffer_.reset(); - bufferTimeOffset_ = 0.f; - audioBuffer_.clear(); + + eos_.clear(); } } // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h index df6d469e39..0c0b65f40e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h @@ -1,13 +1,13 @@ #pragma once +#include +#include +#include + #include "../common/schema/OnlineASR.h" #include "../common/types/ProcessResult.h" 
-#include "../common/types/Segment.h" #include "../common/types/Word.h" #include "ASR.h" -#include "HypothesisBuffer.h" - -#include namespace rnexecutorch::models::speech_to_text::whisper::stream { @@ -21,40 +21,32 @@ class OnlineASR : public schema::OnlineASR { OnlineASR(const ASR *asr); /** - * Appends new audio samples to the internal processing buffer. - * - * @param audio A span of PCM float samples (expected 16kHz). + * Checks if the buffer contains enough audio for the next processing step. + * @return True if ready, false otherwise. */ - void insertAudioChunk(std::span audio) override; + bool isReady() const override; /** - * Determines whether the model is ready to process the next iteration. - * - * @return True if audioBuffer has enough samples, False otherwise + * Appends audio samples to the internal buffer. + * @param audio Span containing the audio data. */ - bool isReady() const override; + void insertAudioChunk(std::span audio) override; /** - * Processes the current audio buffer and returns new transcription results. - * Stability is managed by an internal HypothesisBuffer to ensure that - * only confirmed (stable) text is returned as "committed". - * - * @param options Decoding configuration (language, etc.). - * @return A ProcessResult containing newly committed and uncommitted - * words. + * Processes the current buffered audio and returns transcription results. + * @param options Decoding options for the transcription. + * @return Transcription result containing committed and volatile tokens. */ ProcessResult process(const DecodingOptions &options) override; /** - * Finalizes the current streaming session. - * Flushes any remaining words from the hypothesis buffer. - * - * @return A vector of remaining transcribed words. + * Finalizes the current stream and returns all words. + * @return Vector of detected words. 
*/ - std::vector finish() override; + std::vector finish(const DecodingOptions &options) override; /** - * Reset the streaming state by resetting the buffers + * Resets the internal state and clears buffers. */ void reset() override; @@ -62,19 +54,20 @@ class OnlineASR : public schema::OnlineASR { // ASR module connection for transcribing the audio const ASR *asr_; - // Helper buffers - audio buffer - // Stores the increasing amounts of streamed audio. - // Cleared from time to time after reaching a threshold size. + // Audio buffer (input) - accumulates obtained audio samples. std::vector audioBuffer_ = {}; mutable std::mutex audioBufferMutex_; - float bufferTimeOffset_ = 0.F; // Audio buffer offset - - // Helper buffers - hypothesis buffer - // Manages the whisper streaming hypothesis mechanism. - HypothesisBuffer hypothesisBuffer_; - // State members to keep track of specyfic aspects of buffer state - float lastSentenceEnd_ = 0.F; + // State management helper. + struct EOSEntry { + size_t position; // An absolute position (index) in the transcription (word + // sequence). + std::string preceeding; // A preceeding word in the transcription + float tmstpend; // Ending timestamp of the sentence. + }; + // Stores saved EOS entries in most recent transcription + // and allows to clear the buffer in a smart, non invasive way. 
+ std::vector eos_; }; -} // namespace rnexecutorch::models::speech_to_text::whisper::stream \ No newline at end of file +} // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h index 5eb74c06cc..a99067f411 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h @@ -1,6 +1,9 @@ #pragma once +#include "Constants.h" + #include +#include /** * Hyperparameters @@ -11,90 +14,50 @@ namespace rnexecutorch::models::speech_to_text::whisper::params { /** - * Determines the range of buffer left when skipping an audio chunk - * of size lower than maximum allowed chunk size. - * - * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer], - * then instead of moving to the last returned timestamp, we jump across the - * entire 30 seconds chunk. This resolves the issue of multiple redundant - * segments being produced by the transcription algorithm. + * Maximum duration of audio that the streaming buffer keeps before forcing + * a cleanup. Aligned with Whisper's maximum supported input length. */ -constexpr static int32_t kChunkBreakBuffer = 2; // [s] +constexpr inline float kStreamMaxDuration = + static_cast(constants::kChunkSize); /** - * Determines the maximum timestamp difference available for a word to be - * considered as fresh in streaming algorithm. + * The minimum amount of recent audio always kept in the buffer when a blind + * cut is performed. Acts as the lower bound on what survives a cleanup. 
*/ -constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5 +constexpr inline float kStreamSafetyThreshold = 2.F; // [s] /** - * The size of the most recent committed suffix searched in - * fresh words string. - * - * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty" - * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for - * ["very" "nasty" "thing."] suffix. + * Forced-cleanup threshold. Once the buffer grows past this duration we run + * the EOS-anchored cleanup routine. */ -constexpr static size_t kStreamCommitedSuffixSearchSize = 5; +constexpr inline float kStreamSafeBufferDuration = + kStreamMaxDuration - kStreamSafetyThreshold; // [s] /** - * Determines the maximum expected size of overlapping fragments between - * fresh words buffer and commited words buffer in streaming mode. - * - * It is a limit of maximum amount of erased repeated words from fresh buffer. - * The bigger it gets, the less probable it is to commit the same phrase twice. + * An estimate of the number of words spoken per second. + * Used for estimating transcription progress and buffer management heuristics. */ -constexpr static size_t kStreamMaxOverlapSize = - 12; // Number of overlaping words +constexpr inline float kWordsPerSecondEstimation = 2.25F; /** - * Similar to kMaxStreamOverlapSize, but this one determines - * the maximum allowed timestamp difference between the overlaping fragments. - * - * It's the first, more strict threshold, used when searching for recently - * committed entries. + * Upper bound for words per second estimate in fast speech. */ -constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s] +constexpr inline float kWordsPerSecondHigh = 4.F; /** - * Similar to kMaxStreamOverlapSize, but this one determines - * the maximum allowed timestamp difference between the overlaping fragments. - * - * It's the second, more liberal threshold, used in overlap correction - * algorithm. 
+ * Lower bound for words per second estimate in slow speech. */ -constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s] +constexpr inline float kWordsPerSecondLow = 1.5F; /** - * Number of words per 1 allowed mistake (error correction). + * Determines the range of buffer left when skipping an audio chunk + * of size lower than maximum allowed chunk size. * - * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake - * in a 4 word string. - */ -constexpr static size_t kStreamWordsPerErrorRate = 5; - -/** - * A threshold which exceeded causes the main streaming audio buffer to be - * cleared. - */ -constexpr static float kStreamChunkThreshold = 20.F; // [s] - -/** - * Decides how much of recent audio waveform is always kept in when - * clearing the audio buffer in streaming algorithm. - */ -constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s] - -/** - * Decides how much of recent audio waveform can be kept in when - * clearing the audio buffer in streaming algorithm. - */ -constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s] - -/** - * An estimate of number of words per second produced in a standard - * human conversation speech. + * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer], + * then instead of moving to the last returned timestamp, we jump across the + * entire 30 seconds chunk. This resolves the issue of multiple redundant + * segments being produced by the transcription algorithm. 
*/ -constexpr static float kStreamWordsPerSecond = 2.5F; +constexpr inline int32_t kChunkBreakBuffer = 2; // [s] -} // namespace rnexecutorch::models::speech_to_text::whisper::params \ No newline at end of file +} // namespace rnexecutorch::models::speech_to_text::whisper::params diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h index 2e4e3b5076..48c84a84b7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h @@ -1,6 +1,7 @@ #pragma once #include "../common/types/Word.h" +#include "Constants.h" #include #include #include @@ -8,70 +9,14 @@ namespace rnexecutorch::models::speech_to_text::whisper::utils { -// Compares two strings without case-sensitivity. -inline bool equalsIgnoreCase(const std::string &a, const std::string &b) { - if (a.size() != b.size()) { - return false; - } - return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) { - return std::tolower(static_cast(c1)) == - std::tolower(static_cast(c2)); - }); -} - /** - * Finds the largest (in number of words) overlaping fragment between word - * vectors A (suffix) and B (prefix). + * Checks if the given word represents an End-of-Sentence (EOS) punctuation. * - * An overlaping fragment is any fragment C, which can be simultaneously a - * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing - * games', B = 'playing games and sleeping', the overlap fragment C = 'playing - * games'. - * - * @param suffixVec An input vector, where only suffixes can overlap. - * Typically the 'commited' buffer in streaming algorithm. - * @param preffixVec An input vector, where only prefixes can overlap. - * Typically the 'fresh' buffer in streaming algorithm. 
- * @param maxCheckRange The maximum size of overlapping fragment. Determines the - * range of search. - * @param maxTimestampDiff The maximum allowed timestamp difference between - * overlaping fragments. If exceeded, the fragment are not considered as - * overlaping. - * @return The size of the largest found overlaping fragment. + * @param word The word to check. */ -template -inline size_t findLargestOverlapingFragment(const Container &suffixVec, - const Container &prefixVec, - size_t maxCheckRange = 10, - float maxTimestampDiff = 100.f) { - size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange}); - - if (range == 0) { - return 0; - } - - // i starts at the index where the suffix of length 'range' begins. - for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) { - // We search for overlaps by searching for the first word of prefixVec - if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) { - size_t calculatedSize = suffixVec.size() - i; - - bool isEqual = - std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(), - [maxTimestampDiff](const Word &sWord, const Word &pWord) { - return equalsIgnoreCase(sWord.content, pWord.content) && - std::max(std::fabs(sWord.start - pWord.start), - std::fabs(sWord.end - pWord.end)) <= - maxTimestampDiff; - }); - - if (isEqual) { - return calculatedSize; - } - } - } - - return 0; +constexpr inline bool isEos(const Word &word) { + return word.content.size() == 1 && + constants::kEosPunctations.contains(word.content[0]); } } // namespace rnexecutorch::models::speech_to_text::whisper::utils \ No newline at end of file From 92b3f29baa286d99907c529cc3366c90bfec0a4c Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Thu, 7 May 2026 12:39:02 +0200 Subject: [PATCH 02/20] Revert back to 100ms refresh rate --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx 
b/apps/speech/screens/SpeechToTextScreen.tsx index dfd39c15b4..4e5e19ae48 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -148,7 +148,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.current.onAudioReady( { sampleRate, - bufferLength: 0.1 * sampleRate, + bufferLength: 0.1 * sampleRate, // 100 ms channelCount: 1, }, ({ buffer }) => { From 35290db7403af508a45ac511212b197a8211f2e3 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Thu, 7 May 2026 17:02:21 +0200 Subject: [PATCH 03/20] Add CoreML whisper models --- apps/llm/app/index.tsx | 6 - apps/llm/app/voice_chat/index.tsx | 311 ------------------ apps/speech/screens/SpeechToTextScreen.tsx | 19 +- .../models/speech_to_text/whisper/ASR.cpp | 25 +- .../src/constants/modelUrls.ts | 78 ++--- yarn.lock | 14 + 6 files changed, 76 insertions(+), 377 deletions(-) delete mode 100644 apps/llm/app/voice_chat/index.tsx diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx index 72358ae72c..b67b3fa7ce 100644 --- a/apps/llm/app/index.tsx +++ b/apps/llm/app/index.tsx @@ -29,12 +29,6 @@ export default function Home() { > LLM Structured Output - router.navigate('voice_chat/')} - > - Voice Chat - router.navigate('multimodal_llm/')} diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx deleted file mode 100644 index 23ab70bff4..0000000000 --- a/apps/llm/app/voice_chat/index.tsx +++ /dev/null @@ -1,311 +0,0 @@ -import { useContext, useEffect, useState } from 'react'; -import { - Keyboard, - KeyboardAvoidingView, - Platform, - StyleSheet, - Text, - TouchableOpacity, - TouchableWithoutFeedback, - View, -} from 'react-native'; -import SWMIcon from '../../assets/icons/swm_icon.svg'; -import Spinner from '../../components/Spinner'; -import ErrorBanner from '../../components/ErrorBanner'; -import { - useSpeechToText, - useLLM, - QWEN3_0_6B_QUANTIZED, - QWEN3_1_7B_QUANTIZED, - LLAMA3_2_1B_SPINQUANT, - 
WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, - WHISPER_BASE_EN, - WHISPER_SMALL_EN, - LLMProps, - SpeechToTextProps, -} from 'react-native-executorch'; -import { ModelPicker, ModelOption } from '../../components/ModelPicker'; -import PauseIcon from '../../assets/icons/pause_icon.svg'; -import MicIcon from '../../assets/icons/mic_icon.svg'; -import StopIcon from '../../assets/icons/stop_icon.svg'; -import ColorPalette from '../../colors'; -import Messages from '../../components/Messages'; -import { AudioManager, AudioRecorder } from 'react-native-audio-api'; -import DeviceInfo from 'react-native-device-info'; -import { useIsFocused } from '@react-navigation/native'; -import { useSafeAreaInsets } from 'react-native-safe-area-context'; -import { GeneratingContext } from '../../context'; - -type LLMModelSources = LLMProps['model']; -type STTModelSources = SpeechToTextProps['model']; - -const LLM_MODELS: ModelOption[] = [ - { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED }, - { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED }, - { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT }, -]; - -const STT_MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, -]; - -export default function VoiceChatScreenWrapper() { - const isFocused = useIsFocused(); - - return isFocused ? 
: null; -} - -function VoiceChatScreen() { - const { bottom } = useSafeAreaInsets(); - const [isRecording, setIsRecording] = useState(false); - const [liveTranscription, setLiveTranscription] = useState(''); - const [selectedLLM, setSelectedLLM] = - useState(QWEN3_0_6B_QUANTIZED); - const [selectedSTT, setSelectedSTT] = - useState(WHISPER_TINY_EN); - const [error, setError] = useState(null); - - const [recorder] = useState(() => new AudioRecorder()); - - const { setGlobalGenerating } = useContext(GeneratingContext); - - const llm = useLLM({ model: selectedLLM }); - const speechToText = useSpeechToText({ - model: selectedSTT, - }); - - useEffect(() => { - setGlobalGenerating(llm.isGenerating || speechToText.isGenerating); - }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]); - - useEffect(() => { - AudioManager.setAudioSessionOptions({ - iosCategory: 'playAndRecord', - iosMode: 'spokenAudio', - iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], - }); - AudioManager.requestRecordingPermissions(); - }, []); - - const handleRecordPress = async () => { - if (isRecording) { - setIsRecording(false); - recorder.stop(); - speechToText.streamStop(); - } else { - setIsRecording(true); - setLiveTranscription(''); - - const sampleRate = 16000; - recorder.onAudioReady( - { - sampleRate, - bufferLength: 0.1 * sampleRate, - channelCount: 1, - }, - ({ buffer }) => { - speechToText.streamInsert(buffer.getChannelData(0)); - } - ); - recorder.start(); - - let finalResult = ''; - - try { - for await (const result of speechToText.stream()) { - const text = result.committed.text + result.nonCommitted.text; - setLiveTranscription(text); - finalResult = text; - } - } catch (e) { - setError(e instanceof Error ? 
e.message : String(e)); - } finally { - if (finalResult.trim().length > 0) { - await llm.sendMessage(finalResult); - setLiveTranscription(''); - } - } - } - }; - - useEffect(() => { - if (llm.error) setError(String(llm.error)); - }, [llm.error]); - - useEffect(() => { - if (speechToText.error) setError(String(speechToText.error)); - }, [speechToText.error]); - - return (!llm.isReady || !speechToText.isReady) && - !llm.error && - !speechToText.error ? ( - - ) : ( - - - - - Qwen 3 x Whisper - - setError(null)} /> - {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? ( - - 0 - ? [ - ...llm.messageHistory, - { - role: 'user', - content: liveTranscription, - }, - ] - : llm.messageHistory - } - llmResponse={llm.response} - isGenerating={llm.isGenerating} - deleteMessage={llm.deleteMessage} - /> - - ) : ( - - Hello! 👋 - - Tap the mic and speak to me. I'll transcribe your voice and - respond using a language model — all on-device. - - - )} - - setSelectedLLM(m)} - /> - setSelectedSTT(m)} - /> - - - {DeviceInfo.isEmulatorSync() ? ( - - - recording disabled on emulator - - - ) : ( - <> - {llm.isGenerating ? ( - - - - ) : ( - - {isRecording ? 
( - - ) : ( - - )} - - )} - - )} - - - - ); -} - -const styles = StyleSheet.create({ - keyboardAvoidingView: { - flex: 1, - }, - topContainer: { - height: 68, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - chatContainer: { - flex: 10, - width: '100%', - }, - textModelName: { - color: ColorPalette.primary, - }, - helloMessageContainer: { - flex: 10, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - helloText: { - fontFamily: 'medium', - fontSize: 30, - color: ColorPalette.primary, - }, - bottomHelloText: { - fontFamily: 'regular', - fontSize: 20, - lineHeight: 28, - textAlign: 'center', - color: ColorPalette.primary, - }, - bottomContainer: { - height: 100, - width: '100%', - justifyContent: 'center', - alignItems: 'center', - paddingHorizontal: 16, - }, - recordTouchable: { - height: '100%', - justifyContent: 'center', - alignItems: 'center', - }, - recordingInfo: { - width: '100%', - display: 'flex', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorBox: { - padding: 10, - margin: 10, - borderWidth: 1, - borderRadius: 8, - borderColor: 'gray', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorWarning: { - color: 'gray', - fontSize: 16, - }, -}); diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 4e5e19ae48..1c8859d224 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -13,9 +13,10 @@ import { import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, - WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, - WHISPER_BASE_EN, + WHISPER_TINY_EN_XNNPACK, + WHISPER_TINY_EN_COREML, + WHISPER_BASE_EN_XNNPACK, + WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, TranscriptionResult, SpeechToTextProps, @@ -25,9 +26,10 @@ import { ModelPicker, ModelOption } from '../components/ModelPicker'; type STTModelSources = SpeechToTextProps['model']; 
const MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, + { label: 'Whisper Tiny', value: WHISPER_TINY_EN_XNNPACK }, + { label: 'Whisper Tiny CoreML', value: WHISPER_TINY_EN_COREML }, + { label: 'Whisper Base', value: WHISPER_BASE_EN_XNNPACK }, + { label: 'Whisper Base CoreML', value: WHISPER_BASE_EN_COREML }, { label: 'Whisper Small', value: WHISPER_SMALL_EN }, ]; import FontAwesome from '@expo/vector-icons/FontAwesome'; @@ -46,8 +48,9 @@ import ErrorBanner from '../components/ErrorBanner'; const isSimulator = DeviceInfo.isEmulatorSync(); export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { - const [selectedModel, setSelectedModel] = - useState(WHISPER_TINY_EN); + const [selectedModel, setSelectedModel] = useState( + WHISPER_TINY_EN_XNNPACK + ); const model = useSpeechToText({ model: selectedModel, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index b0d08e419b..d2555a79fa 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -263,11 +263,21 @@ ASR::generate(std::span waveform, const DecodingOptions &options, std::vector scores; uint64_t startPos = 0; - while (std::cmp_less_equal(startPos + sequenceIds.size(), - constants::kMaxDecodeLength)) { - executorch::aten::Tensor logitsTensor = - this->decode(sequenceIds, encoderFeatures, startPos); + // Prefill: feed each initial token individually so decode() always sees 1 + // token + std::span firstToken(sequenceIds.data(), 1); + executorch::aten::Tensor logitsTensor = + this->decode(firstToken, encoderFeatures, startPos); + ++startPos; + for (size_t i = 1; i 
< sequenceIds.size(); ++i) { + std::span single(sequenceIds.data() + i, 1); + logitsTensor = this->decode(single, encoderFeatures, startPos); + ++startPos; + } + + // Autoregressive decoding: always 1 token at a time + while (std::cmp_less(startPos, constants::kMaxDecodeLength)) { const size_t logitsInnerDim = logitsTensor.size(1); const size_t logitsDictSize = logitsTensor.size(2); const float *logitsData = logitsTensor.const_data_ptr() + @@ -303,15 +313,16 @@ ASR::generate(std::span waveform, const DecodingOptions &options, nextProb = probs[nextId]; } - // Move the startPos pointer by the amount of tokens we processed - startPos += sequenceIds.size(); - sequenceIds = {nextId}; cachedTokens.push_back(nextId); scores.push_back(nextProb); if (nextId == endOfTranscriptionToken_) { break; } + + std::span single(&cachedTokens.back(), 1); + logitsTensor = this->decode(single, encoderFeatures, startPos); + ++startPos; } return {.tokens = std::vector(cachedTokens.cbegin() + diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 159396add8..0fdf1bad54 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -773,23 +773,17 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = { } as const; // S2T -const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`; +const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`; +const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`; +const WHISPER_TINY_EN_MODEL_VULKAN = 
`${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/vulkan/whisper_tiny_en_vulkan.pte`; -const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`; +const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`; +const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`; -const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`; - -const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`; - -const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`; - -const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`; const 
WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/tokenizer.json`; const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`; @@ -803,41 +797,45 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG} /** * @category Models - Speech To Text */ -export const WHISPER_TINY_EN = { +export const WHISPER_TINY_EN_XNNPACK = { modelName: 'whisper-tiny-en', isMultilingual: false, - modelSource: WHISPER_TINY_EN_MODEL, + modelSource: WHISPER_TINY_EN_MODEL_XNNPACK, tokenizerSource: WHISPER_TINY_EN_TOKENIZER, } as const; -/** - * @category Models - Speech To Text - */ -export const WHISPER_TINY_EN_QUANTIZED = { - modelName: 'whisper-tiny-en-quantized', +export const WHISPER_TINY_EN_COREML = { + modelName: 'whisper-tiny-en', isMultilingual: false, - modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL, - tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER, + modelSource: WHISPER_TINY_EN_MODEL_COREML, + tokenizerSource: WHISPER_TINY_EN_TOKENIZER, +} as const; + +export const WHISPER_TINY_EN_VULKAN = { + modelName: 'whisper-tiny-en', + isMultilingual: false, + modelSource: WHISPER_TINY_EN_MODEL_VULKAN, + tokenizerSource: WHISPER_TINY_EN_TOKENIZER, } as const; /** * @category Models - Speech To Text */ -export const WHISPER_BASE_EN = { +export const WHISPER_BASE_EN_XNNPACK = { modelName: 'whisper-base-en', isMultilingual: false, - modelSource: WHISPER_BASE_EN_MODEL, + modelSource: WHISPER_BASE_EN_MODEL_XNNPACK, tokenizerSource: WHISPER_BASE_EN_TOKENIZER, } as const; /** * @category Models - Speech To Text */ -export const WHISPER_BASE_EN_QUANTIZED = { - modelName: 'whisper-base-en-quantized', +export const WHISPER_BASE_EN_COREML = { + modelName: 'whisper-base-en', isMultilingual: false, - modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL, - tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER, + modelSource: WHISPER_BASE_EN_MODEL_COREML, + tokenizerSource: WHISPER_BASE_EN_TOKENIZER, } 
as const; /** @@ -850,16 +848,6 @@ export const WHISPER_SMALL_EN = { tokenizerSource: WHISPER_SMALL_EN_TOKENIZER, } as const; -/** - * @category Models - Speech To Text - */ -export const WHISPER_SMALL_EN_QUANTIZED = { - modelName: 'whisper-small-en-quantized', - isMultilingual: false, - modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL, - tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER, -} as const; - /** * @category Models - Speech To Text */ @@ -1350,12 +1338,12 @@ export const MODEL_REGISTRY = { STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED, STYLE_TRANSFER_UDNIE, STYLE_TRANSFER_UDNIE_QUANTIZED, - WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, - WHISPER_BASE_EN, - WHISPER_BASE_EN_QUANTIZED, + WHISPER_TINY_EN_XNNPACK, + WHISPER_TINY_EN_COREML, + WHISPER_TINY_EN_VULKAN, + WHISPER_BASE_EN_XNNPACK, + WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, - WHISPER_SMALL_EN_QUANTIZED, WHISPER_TINY, WHISPER_BASE, WHISPER_SMALL, diff --git a/yarn.lock b/yarn.lock index 7f335abe71..a4439a0f33 100644 --- a/yarn.lock +++ b/yarn.lock @@ -15283,6 +15283,20 @@ __metadata: languageName: node linkType: hard +"react-native-audio-api@npm:0.11.5": + version: 0.11.5 + resolution: "react-native-audio-api@npm:0.11.5" + dependencies: + semver: "npm:^7.7.3" + peerDependencies: + react: "*" + react-native: "*" + bin: + setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js + checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d + languageName: node + linkType: hard + "react-native-audio-api@npm:0.12.0": version: 0.12.0 resolution: "react-native-audio-api@npm:0.12.0" From 7473f018c396abb8ed00df29a587747531dd4c4b Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Fri, 8 May 2026 09:53:01 +0200 Subject: [PATCH 04/20] Update model urls --- apps/speech/screens/SpeechToTextScreen.tsx | 21 ++--- .../src/constants/modelUrls.ts | 89 +++++++++++++------ 2 files changed, 75 insertions(+), 35 deletions(-) diff --git 
a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1c8859d224..e7ee776034 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -13,24 +13,26 @@ import { import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, - WHISPER_TINY_EN_XNNPACK, + WHISPER_TINY_EN, WHISPER_TINY_EN_COREML, - WHISPER_BASE_EN_XNNPACK, + WHISPER_BASE_EN, WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, TranscriptionResult, SpeechToTextProps, + WHISPER_SMALL_EN_COREML, } from 'react-native-executorch'; import { ModelPicker, ModelOption } from '../components/ModelPicker'; type STTModelSources = SpeechToTextProps['model']; const MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN_XNNPACK }, - { label: 'Whisper Tiny CoreML', value: WHISPER_TINY_EN_COREML }, - { label: 'Whisper Base', value: WHISPER_BASE_EN_XNNPACK }, - { label: 'Whisper Base CoreML', value: WHISPER_BASE_EN_COREML }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, + { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN }, + { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML }, + { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN }, + { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML }, + { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN }, + { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML }, ]; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { @@ -48,9 +50,8 @@ import ErrorBanner from '../components/ErrorBanner'; const isSimulator = DeviceInfo.isEmulatorSync(); export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { - const [selectedModel, setSelectedModel] = useState( - WHISPER_TINY_EN_XNNPACK - ); + const [selectedModel, setSelectedModel] = + useState(WHISPER_TINY_EN); const model = useSpeechToText({ model: selectedModel, diff --git 
a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 0fdf1bad54..a87117d863 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -776,28 +776,31 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = { const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`; const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`; -const WHISPER_TINY_EN_MODEL_VULKAN = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/vulkan/whisper_tiny_en_vulkan.pte`; const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`; const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`; -const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`; +const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml.pte`; -const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`; +const 
WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`; +const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml.pte`; -const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`; +const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`; +const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml.pte`; -const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`; +const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`; +const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml.pte`; /** * @category Models - Speech To Text */ -export const WHISPER_TINY_EN_XNNPACK = { +export const WHISPER_TINY_EN = { modelName: 'whisper-tiny-en', isMultilingual: false, modelSource: WHISPER_TINY_EN_MODEL_XNNPACK, @@ -811,17 +814,10 @@ export const WHISPER_TINY_EN_COREML = { tokenizerSource: WHISPER_TINY_EN_TOKENIZER, } as const; -export const WHISPER_TINY_EN_VULKAN = { - modelName: 'whisper-tiny-en', - isMultilingual: false, - modelSource: WHISPER_TINY_EN_MODEL_VULKAN, - tokenizerSource: WHISPER_TINY_EN_TOKENIZER, -} as const; - /** * @category Models - 
Speech To Text */ -export const WHISPER_BASE_EN_XNNPACK = { +export const WHISPER_BASE_EN = { modelName: 'whisper-base-en', isMultilingual: false, modelSource: WHISPER_BASE_EN_MODEL_XNNPACK, @@ -844,7 +840,17 @@ export const WHISPER_BASE_EN_COREML = { export const WHISPER_SMALL_EN = { modelName: 'whisper-small-en', isMultilingual: false, - modelSource: WHISPER_SMALL_EN_MODEL, + modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK, + tokenizerSource: WHISPER_SMALL_EN_TOKENIZER, +} as const; + +/** + * @category Models - Speech To Text + */ +export const WHISPER_SMALL_EN_COREML = { + modelName: 'whisper-small-en', + isMultilingual: false, + modelSource: WHISPER_SMALL_EN_MODEL_COREML, tokenizerSource: WHISPER_SMALL_EN_TOKENIZER, } as const; @@ -854,7 +860,17 @@ export const WHISPER_SMALL_EN = { export const WHISPER_TINY = { modelName: 'whisper-tiny', isMultilingual: true, - modelSource: WHISPER_TINY_MODEL, + modelSource: WHISPER_TINY_MODEL_XNNPACK, + tokenizerSource: WHISPER_TINY_TOKENIZER, +} as const; + +/** + * @category Models - Speech To Text + */ +export const WHISPER_TINY_COREML = { + modelName: 'whisper-tiny', + isMultilingual: true, + modelSource: WHISPER_TINY_MODEL_COREML, tokenizerSource: WHISPER_TINY_TOKENIZER, } as const; @@ -864,7 +880,17 @@ export const WHISPER_TINY = { export const WHISPER_BASE = { modelName: 'whisper-base', isMultilingual: true, - modelSource: WHISPER_BASE_MODEL, + modelSource: WHISPER_BASE_MODEL_XNNPACK, + tokenizerSource: WHISPER_BASE_TOKENIZER, +} as const; + +/** + * @category Models - Speech To Text + */ +export const WHISPER_BASE_COREML = { + modelName: 'whisper-base', + isMultilingual: true, + modelSource: WHISPER_BASE_MODEL_COREML, tokenizerSource: WHISPER_BASE_TOKENIZER, } as const; @@ -874,7 +900,17 @@ export const WHISPER_BASE = { export const WHISPER_SMALL = { modelName: 'whisper-small', isMultilingual: true, - modelSource: WHISPER_SMALL_MODEL, + modelSource: WHISPER_SMALL_MODEL_XNNPACK, + tokenizerSource: WHISPER_SMALL_TOKENIZER, 
+} as const; + +/** + * @category Models - Speech To Text + */ +export const WHISPER_SMALL_COREML = { + modelName: 'whisper-small', + isMultilingual: true, + modelSource: WHISPER_SMALL_MODEL_COREML, tokenizerSource: WHISPER_SMALL_TOKENIZER, } as const; @@ -1338,15 +1374,18 @@ export const MODEL_REGISTRY = { STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED, STYLE_TRANSFER_UDNIE, STYLE_TRANSFER_UDNIE_QUANTIZED, - WHISPER_TINY_EN_XNNPACK, + WHISPER_TINY_EN, WHISPER_TINY_EN_COREML, - WHISPER_TINY_EN_VULKAN, - WHISPER_BASE_EN_XNNPACK, + WHISPER_BASE_EN, WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, + WHISPER_SMALL_EN_COREML, WHISPER_TINY, + WHISPER_TINY_COREML, WHISPER_BASE, + WHISPER_BASE_COREML, WHISPER_SMALL, + WHISPER_SMALL_COREML, DEEPLAB_V3_RESNET50, DEEPLAB_V3_RESNET101, DEEPLAB_V3_MOBILENET_V3_LARGE, From 9b90ea32901a9994b43132995d30c228aba83a2f Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Fri, 8 May 2026 10:12:56 +0200 Subject: [PATCH 05/20] Change default model for iOS devices --- apps/speech/screens/SpeechToTextScreen.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index e7ee776034..94d5930fea 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -49,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner'; const isSimulator = DeviceInfo.isEmulatorSync(); +const DEFAULT_MODEL = + Platform.OS === 'ios' ? 
WHISPER_BASE_EN_COREML : WHISPER_TINY_EN; + export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const [selectedModel, setSelectedModel] = - useState(WHISPER_TINY_EN); + useState(DEFAULT_MODEL); const model = useSpeechToText({ model: selectedModel, From 9af81240dda1ef98701fc905ef5c5d3ce7343271 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Sun, 10 May 2026 10:42:06 +0200 Subject: [PATCH 06/20] Add explicit timeout parameter --- apps/speech/screens/SpeechToTextScreen.tsx | 1 + .../models/speech_to_text/SpeechToText.cpp | 5 ++-- .../models/speech_to_text/SpeechToText.h | 3 +- .../models/speech_to_text/whisper/Constants.h | 28 +++++++++---------- .../models/speech_to_text/whisper/Params.h | 2 +- .../useSpeechToText.ts | 3 +- .../SpeechToTextModule.ts | 7 +++-- .../react-native-executorch/src/types/stt.ts | 11 +++++++- 8 files changed, 38 insertions(+), 22 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 94d5930fea..ad4f6505c8 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -185,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const streamIter = model.stream({ verbose: enableTimestamps, + timeout: 100, }); for await (const { committed, nonCommitted } of streamIter) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 3133c0bb29..9537642d58 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector &words, } // namespace void SpeechToText::stream(std::shared_ptr callback, - std::string languageOption, bool 
verbose) { + std::string languageOption, bool verbose, + uint32_t timeout) { if (isStreaming_) { throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, "Streaming is already in progress!"); @@ -158,7 +159,7 @@ void SpeechToText::stream(std::shared_ptr callback, // running transcriptions too rapidly (before the audio buffer is filled // with significant amount of new data) can cause streamer to commit wrong // phrases. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(timeout)); } std::vector finalWords = streamer_->finish(options); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index ade835869c..ec51862793 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -42,7 +42,8 @@ class SpeechToText { // Stream void stream(std::shared_ptr callback, - std::string languageOption, bool enableTimestamps); + std::string languageOption, bool enableTimestamps, + uint32_t timeout); void streamStop(); void streamInsert(std::span waveform); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h index 30062a75ba..62a9f968f7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h @@ -9,37 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants { // Maximum duration of each audio chunk to process (in seconds) // It is intentionally set to 29 since otherwise only the last chunk would be // 
correctly transcribe due to the model's positional encoding limit -constexpr static size_t kChunkSize = 29; +inline constexpr size_t kChunkSize = 29; // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz) -constexpr static size_t kSamplingRate = 16000; -constexpr static size_t kSamplesPerMilisecond = kSamplingRate / 1000; +inline constexpr size_t kSamplingRate = 16000; +inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000; -constexpr static size_t kMaxSamples = kChunkSize * kSamplingRate; +inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate; // The maximum number of tokens the decoder can generate per chunk -constexpr static size_t kMaxDecodeLength = 128; +inline constexpr size_t kMaxDecodeLength = 128; // Minimum allowed chunk length before processing (in audio samples) -constexpr static size_t kMinChunkSamples = 1 * kSamplingRate; +inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate; // Number of mel frames output by the encoder (derived from input spectrogram) -constexpr static size_t kNumFrames = 1500; +inline constexpr size_t kNumFrames = 1500; // Time precision used by Whisper timestamps: each token spans 0.02 seconds -constexpr static float kTimePrecision = 0.02f; +inline constexpr float kTimePrecision = 0.02f; // Special characters serving as pause / end of sentence -static const std::unordered_set kPunctations = {',', '.', '?', +inline const std::unordered_set kPunctations = {',', '.', '?', '!', ':', ';'}; -static const std::unordered_set kEosPunctations = {'.', '?', '!', ';'}; +inline const std::unordered_set kEosPunctations = {'.', '?', '!', ';'}; // Special token constants namespace tokens { -static const std::string kStartOfTranscript = "<|startoftranscript|>"; -static const std::string kEndOfTranscript = "<|endoftext|>"; -static const std::string kBeginTimestamp = "<|0.00|>"; -static const std::string kBlankAudio = "[BLANK_AUDIO]"; +inline const std::string kStartOfTranscript = 
"<|startoftranscript|>"; +inline const std::string kEndOfTranscript = "<|endoftext|>"; +inline const std::string kBeginTimestamp = "<|0.00|>"; +inline const std::string kBlankAudio = "[BLANK_AUDIO]"; } // namespace tokens } // namespace rnexecutorch::models::speech_to_text::whisper::constants \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h index a99067f411..847a22b1e0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h @@ -24,7 +24,7 @@ constexpr inline float kStreamMaxDuration = * The minimum amount of recent audio always kept in the buffer when a blind * cut is performed. Acts as the lower bound on what survives a cleanup. */ -constexpr inline float kStreamSafetyThreshold = 2.F; // [s] +constexpr inline float kStreamSafetyThreshold = 3.F; // [s] /** * Forced-cleanup threshold. 
Once the buffer grows past this duration we run diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index c906851380..229bba73e3 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -5,6 +5,7 @@ import { SpeechToTextType, SpeechToTextProps, TranscriptionResult, + StreamingOptions, } from '../../types/stt'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -101,7 +102,7 @@ export const useSpeechToText = ({ ); const stream = useCallback( - async function* (options: DecodingOptions = {}): AsyncGenerator< + async function* (options: StreamingOptions = {}): AsyncGenerator< { committed: TranscriptionResult; nonCommitted: TranscriptionResult; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 273264e0e2..3890c9ae50 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -2,6 +2,7 @@ import { DecodingOptions, SpeechToTextModelConfig, SpeechToTextModelName, + StreamingOptions, TranscriptionResult, } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; @@ -174,7 +175,7 @@ export class SpeechToTextModule { * @yields An object containing `committed` and `nonCommitted` transcription results. * @returns An async generator yielding transcription updates. 
*/ - public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ + public async *stream(options: StreamingOptions = {}): AsyncGenerator<{ committed: TranscriptionResult; nonCommitted: TranscriptionResult; }> { @@ -182,6 +183,7 @@ export class SpeechToTextModule { const verbose = !!options.verbose; const language = options.language || ''; + const timeout = options.timeout || 100; const queue: { committed: TranscriptionResult; @@ -216,7 +218,8 @@ export class SpeechToTextModule { wake(); }, language, - verbose + verbose, + timeout ); finished = true; diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index 0a6ed11f70..20f1013ef0 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -94,7 +94,7 @@ export interface SpeechToTextType { * @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription. * Both `committed` and `nonCommitted` are of type `TranscriptionResult` */ - stream(options?: DecodingOptions | undefined): AsyncGenerator< + stream(options?: StreamingOptions | undefined): AsyncGenerator< { committed: TranscriptionResult; nonCommitted: TranscriptionResult; @@ -208,6 +208,15 @@ export interface DecodingOptions { verbose?: boolean; } +/** + * Configuration options for the speech-to-text streaming process. + * @category Types + * @property {number} [timeout] - Specifies (in milliseconds) how much does streamer wait between model inferences. + */ +export interface StreamingOptions extends DecodingOptions { + timeout?: number; +} + /** * Structure that represent single token with timestamp information. 
* @category Types From f7849fc933a3e6dc4aeefa765b288409da4fdd2e Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Sun, 10 May 2026 14:21:23 +0200 Subject: [PATCH 07/20] Concurrency fixes & automatic cleaunp --- .../speech_to_text/whisper/OnlineASR.cpp | 243 +++++++++++------- .../models/speech_to_text/whisper/OnlineASR.h | 34 ++- 2 files changed, 169 insertions(+), 108 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp index fb57fcb0f3..188c77d80d 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp @@ -16,12 +16,28 @@ OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) { } bool OnlineASR::isReady() const { + std::scoped_lock lock(streamingMutex); + return audioBuffer_.size() >= constants::kMinChunkSamples; } void OnlineASR::insertAudioChunk(std::span audio) { - std::scoped_lock lock(audioBufferMutex_); + std::scoped_lock lock(streamingMutex); + audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end()); + + // Automatic buffer cleanup. + // + // This prevents the audio buffer from growing indefinitely during continuous + // streaming. It is particularly useful when VAD (Voice Activity Detection) + // is used and elements are inserted but not processed for a long time. + // It should not pass the condition in a normal streaming, that is when + // process() method is called regularly within reasonable steps of time. + if (audioBuffer_.size() > constants::kMaxSamples) { + // Note that results are not actually committed now, but saved for + // a later call of process(). 
+ memory_.toCommit = commitAndClean(memory_.transcript); + } } ProcessResult OnlineASR::process(const DecodingOptions &options) { @@ -30,7 +46,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { // Copy the audio buffer to avoid keeping the lock during the entire // transcription process. { - std::scoped_lock lock(audioBufferMutex_); + std::scoped_lock lock(streamingMutex); audioCopy = audioBuffer_; } @@ -51,17 +67,18 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { std::back_inserter(words)); } - std::vector committed; + // Aquire lock for the rest of the method (extensive usage of audioBuffer_). + std::scoped_lock lock(streamingMutex); // Step 1: examine all previously saved EOS points. // The idea is to remove entries which have changed or no longer exist // due to model correcting it's output. - for (size_t i = 0; i < eos_.size(); i++) { - const auto &eos = eos_[i]; + for (size_t i = 0; i < memory_.eos.size(); i++) { + const auto &eos = memory_.eos[i]; if (eos.position >= words.size() || !utils::isEos(words[eos.position]) || (eos.position > 0 && eos.preceeding != words[eos.position - 1].content)) { - eos_.erase(eos_.begin() + i, eos_.end()); + memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end()); break; } } @@ -74,110 +91,43 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { // Because of step 1, we know that if the last EOS exist in eos_, // then it must be the last entry. - if (eos_.empty() || eos_.back().position != lastEosIndex) { + if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) { // Register last EOS entry std::string preceeding = lastEosIndex > 0 ? words[lastEosIndex - 1].content : ""; - eos_.emplace_back(lastEosIndex, preceeding, lastEosIt->end); + memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end); } } - // Step 3: clear the buffer if it is getting too large. 
+ std::vector committed; + + // Step 3: collect all the words which could possible get committed + // in-between iterations. + if (!memory_.toCommit.empty()) { + committed.insert(committed.end(), + std::make_move_iterator(memory_.toCommit.begin()), + std::make_move_iterator(memory_.toCommit.end())); + memory_.toCommit.clear(); + } + + // Step 4: clear the buffer if it is getting too large. // The idea is to use the saved EOS entries and try to cut the buffer // in a 'good' spot - where it will remove a significant audio chunk, yet // won't affect most recent, unfinished speech samples. size_t bufferSize = audioBuffer_.size(); if (bufferSize > static_cast(params::kStreamSafeBufferDuration * constants::kSamplingRate)) { - // Setup the lock for the entire cleanup section. - std::scoped_lock lock(audioBufferMutex_); - - const float midBufferThreshold = params::kStreamMaxDuration / 2.0F; - - // If we don't have any EOS entries, then we most likely have not - // recorded any speech. In this case we can safely cut the maximum amount of - // audio data. - if (eos_.empty()) { - size_t cut = bufferSize - params::kStreamSafetyThreshold; - - audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); - } - - // If we have exactly one (most recent) EOS entry in the eos_, then - // we need to be more careful. - // Normally we want to keep at least one sentence in, but if the sentence - // covers a significant amount of buffer, we have no choice. - else if (eos_.size() == 1) { - const float eosTimestamp = eos_[0].tmstpend; - - const float upperHalfDuration = - std::max(0.0F, eosTimestamp - midBufferThreshold); - const float wordsPerSecond = - upperHalfDuration > 0.1F - ? static_cast(words.size()) / upperHalfDuration - : 0.0F; - - // The EOS sits early enough that cutting up to the safety margin won't - // touch the ongoing (post-EOS) speech. 
- const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration - - params::kStreamSafetyThreshold; - - if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) { - // EOS lies past the midpoint, but a low word density implies the spoken - // audio is concentrated in the upper half. Drop the lower half and - // shift the EOS accordingly. - audioBuffer_.erase(audioBuffer_.begin(), - audioBuffer_.begin() + - static_cast(midBufferThreshold * - constants::kSamplingRate)); - eos_[0].tmstpend -= midBufferThreshold; - } else { - // Cut everything up to and including the sentence — either by the - // safety margin (when EOS is early) or (more aggresively) right at the - // EOS boundary — and commit its words. - const size_t cut = - eosSafe - ? bufferSize - - static_cast(params::kStreamSafetyThreshold * - constants::kSamplingRate) - : static_cast(eosTimestamp * constants::kSamplingRate); - - audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); - - committed.insert(committed.end(), - std::make_move_iterator(words.begin()), - std::make_move_iterator(words.end())); - - words.clear(); - eos_.clear(); - } - } - - // In case of 2 or more sentences, we generally want to keep the last one - // intact. This would provide a bit of stability to the algorithm. - else { - const auto &secondTolastEntry = eos_[eos_.size() - 2]; - - const size_t cut = static_cast(secondTolastEntry.tmstpend * - constants::kSamplingRate); - const size_t lastCommittedPos = secondTolastEntry.position; - - audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); - - // Move all words up to the last committed position (inclusive) to the - // committed buffer. - committed.insert( - committed.end(), std::make_move_iterator(words.begin()), - std::make_move_iterator(words.begin() + lastCommittedPos + 1)); - words.erase(words.begin(), words.begin() + lastCommittedPos + 1); + auto newCommitted = commitAndClean(words); - // Retain only the most recent EOS entry. 
- eos_.erase(eos_.begin(), eos_.end() - 1); - eos_[0].tmstpend -= secondTolastEntry.tmstpend; - } + committed.insert(committed.end(), + std::make_move_iterator(newCommitted.begin()), + std::make_move_iterator(newCommitted.end())); } - // Return the results + // Save the uncommitted part to streamer's memory, + // cause it might be necessary when committing inside streamInsert(). + memory_.transcript = words; + // Note that uncommitted part represented by recent transcription (words) // is already shrinked if something has been committed during the cleanup // phase. @@ -200,10 +150,109 @@ std::vector OnlineASR::finish(const DecodingOptions &options) { } void OnlineASR::reset() { - std::scoped_lock lock(audioBufferMutex_); + std::scoped_lock lock(streamingMutex); + audioBuffer_.clear(); - eos_.clear(); + // Reset memory. + memory_.transcript.clear(); + memory_.eos.clear(); + memory_.toCommit.clear(); +} + +std::vector OnlineASR::commitAndClean(std::vector &transcript) { + const size_t bufferSize = audioBuffer_.size(); + const float midBufferThreshold = params::kStreamMaxDuration / 2.0F; + + std::vector committed; + + // If we don't have any EOS entries, then we most likely have not + // recorded any speech. In this case we can safely cut the maximum amount of + // audio data. + if (memory_.eos.empty()) { + size_t cut = + bufferSize - params::kStreamSafetyThreshold * constants::kSamplingRate; + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + } + + // If we have exactly one (most recent) EOS entry in the eos_, then + // we need to be more careful. + // Normally we want to keep at least one sentence in, but if the sentence + // covers a significant amount of buffer, we have no choice. + else if (memory_.eos.size() == 1) { + const float eosTimestamp = memory_.eos[0].tmstpend; + + const float upperHalfDuration = + std::max(0.0F, eosTimestamp - midBufferThreshold); + const float wordsPerSecond = + upperHalfDuration > 0.1F + ? 
static_cast(transcript.size()) / upperHalfDuration + : 0.0F; + + // The EOS sits early enough that cutting up to the safety margin won't + // touch the ongoing (post-EOS) speech. + const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration - + params::kStreamSafetyThreshold; + + if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) { + // EOS lies past the midpoint, but a low word density implies the spoken + // audio is concentrated in the upper half. Drop the lower half and + // shift the EOS accordingly. + audioBuffer_.erase(audioBuffer_.begin(), + audioBuffer_.begin() + + static_cast(midBufferThreshold * + constants::kSamplingRate)); + memory_.eos[0].tmstpend -= midBufferThreshold; + } else { + // Cut everything up to and including the sentence — either by the + // safety margin (when EOS is early) or (more aggresively) right at the + // EOS boundary — and commit its words. + const size_t cut = + eosSafe + ? bufferSize - + static_cast(params::kStreamSafetyThreshold * + constants::kSamplingRate) + : static_cast(eosTimestamp * constants::kSamplingRate); + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + + committed.insert(committed.end(), + std::make_move_iterator(transcript.begin()), + std::make_move_iterator(transcript.end())); + + transcript.clear(); + memory_.eos.clear(); + } + } + + // In case of 2 or more sentences, we generally want to keep the last one + // intact. This would provide a bit of stability to the algorithm. + else { + const auto &secondTolastEntry = memory_.eos[memory_.eos.size() - 2]; + + const size_t cut = static_cast(secondTolastEntry.tmstpend * + constants::kSamplingRate); + const size_t lastCommittedPos = secondTolastEntry.position; + + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); + + // Move all words up to the last committed position (inclusive) to the + // committed buffer. 
+ committed.insert( + committed.end(), std::make_move_iterator(transcript.begin()), + std::make_move_iterator(transcript.begin() + lastCommittedPos + 1)); + transcript.erase(transcript.begin(), + transcript.begin() + lastCommittedPos + 1); + + // Retain only the most recent EOS entry, shifting both its timestamp + // and its position to match the new (truncated) transcript origin. + memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1); + memory_.eos[0].tmstpend -= secondTolastEntry.tmstpend; + memory_.eos[0].position -= lastCommittedPos + 1; + } + + return committed; } } // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h index 0c0b65f40e..7547d16bd5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h @@ -51,23 +51,35 @@ class OnlineASR : public schema::OnlineASR { void reset() override; private: + // Cleans up the buffer and returns committed words based on given transcript. + std::vector commitAndClean(std::vector &transcript); + // ASR module connection for transcribing the audio const ASR *asr_; // Audio buffer (input) - accumulates obtained audio samples. std::vector audioBuffer_ = {}; - mutable std::mutex audioBufferMutex_; + mutable std::mutex streamingMutex; // Covers both buffer & memory + + // Streaming memory. + // In general, helps to navigate continous streaming state and improve buffer + // handling algorithms. + struct Memory { + // State management helper. + struct EOSEntry { + size_t position; // An absolute position (index) in the transcription + // (word sequence). 
+ std::string preceeding; // A preceeding word in the transcription + float tmstpend; // Ending timestamp of the sentence. + }; - // State management helper. - struct EOSEntry { - size_t position; // An absolute position (index) in the transcription (word - // sequence). - std::string preceeding; // A preceeding word in the transcription - float tmstpend; // Ending timestamp of the sentence. - }; - // Stores saved EOS entries in most recent transcription - // and allows to clear the buffer in a smart, non invasive way. - std::vector eos_; + std::vector + transcript; // The most recent transcription result (uncommitted only!). + std::vector + eos; // End of sentence points from the most recent transcription. + std::vector toCommit; // Words to be committed in the next iteration + // (next process() call). + } memory_; }; } // namespace rnexecutorch::models::speech_to_text::whisper::stream From 3bf68bf5b38bab76d904503b1a9ad01ae1937f12 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Thu, 14 May 2026 10:52:47 +0200 Subject: [PATCH 08/20] Update urls & audio-api --- .../src/constants/modelUrls.ts | 24 +++++++++---------- yarn.lock | 14 ----------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index a87117d863..2a15be25dd 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -774,28 +774,28 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = { // S2T const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`; -const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`; +const WHISPER_TINY_EN_MODEL_XNNPACK = 
`${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`; +const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`; const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`; -const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`; +const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`; +const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`; const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`; -const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml.pte`; +const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`; +const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`; const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`; -const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml.pte`; +const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`; +const WHISPER_TINY_MODEL_COREML = 
`${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`; const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`; -const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml.pte`; +const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`; +const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`; const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`; -const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml.pte`; +const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`; +const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`; /** * @category Models - Speech To Text diff --git a/yarn.lock b/yarn.lock index a4439a0f33..7f335abe71 100644 --- a/yarn.lock +++ b/yarn.lock @@ -15283,20 +15283,6 @@ __metadata: languageName: node linkType: hard -"react-native-audio-api@npm:0.11.5": - version: 0.11.5 - resolution: "react-native-audio-api@npm:0.11.5" - dependencies: - semver: "npm:^7.7.3" - peerDependencies: - react: "*" - react-native: "*" - bin: - setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js - checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d - languageName: node - linkType: hard - "react-native-audio-api@npm:0.12.0": version: 0.12.0 resolution: 
"react-native-audio-api@npm:0.12.0" From 27769d4a73292f9f01c4e8acaebd3664a067e97e Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Tue, 19 May 2026 13:05:03 +0200 Subject: [PATCH 09/20] Apply review suggestions --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- .../models/speech_to_text/common/types/Word.h | 2 +- .../models/speech_to_text/whisper/ASR.cpp | 10 +-- .../speech_to_text/whisper/OnlineASR.cpp | 67 ++++++++++--------- .../models/speech_to_text/whisper/Utils.h | 2 +- 5 files changed, 40 insertions(+), 43 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index ad4f6505c8..2942d5e718 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -18,9 +18,9 @@ import { WHISPER_BASE_EN, WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, + WHISPER_SMALL_EN_COREML, TranscriptionResult, SpeechToTextProps, - WHISPER_SMALL_EN_COREML, } from 'react-native-executorch'; import { ModelPicker, ModelOption } from '../components/ModelPicker'; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h index 2343d1faab..fcf7759b24 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h @@ -5,7 +5,7 @@ namespace rnexecutorch::models::speech_to_text { /** - * Basically a different representation of token, + * Different representation of a token, * with timestamps calculated. 
*/ struct Word { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index d2555a79fa..5a925e6eba 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -265,15 +265,11 @@ ASR::generate(std::span waveform, const DecodingOptions &options, uint64_t startPos = 0; // Prefill: feed each initial token individually so decode() always sees 1 - // token - std::span firstToken(sequenceIds.data(), 1); - executorch::aten::Tensor logitsTensor = - this->decode(firstToken, encoderFeatures, startPos); - ++startPos; - for (size_t i = 1; i < sequenceIds.size(); ++i) { + // token. + executorch::aten::Tensor logitsTensor{nullptr}; + for (size_t i = 0; i < sequenceIds.size(); i++, startPos++) { std::span single(sequenceIds.data() + i, 1); logitsTensor = this->decode(single, encoderFeatures, startPos); - ++startPos; } // Autoregressive decoding: always 1 token at a time diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp index 188c77d80d..ced3193531 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include "Constants.h" @@ -11,18 +12,17 @@ namespace rnexecutorch::models::speech_to_text::whisper::stream { OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) { - // Reserve an expected amount of memory for audio buffer. 
audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate); } bool OnlineASR::isReady() const { - std::scoped_lock lock(streamingMutex); + std::scoped_lock lock(streamingMutex); return audioBuffer_.size() >= constants::kMinChunkSamples; } void OnlineASR::insertAudioChunk(std::span audio) { - std::scoped_lock lock(streamingMutex); + std::scoped_lock lock(streamingMutex); audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end()); @@ -41,12 +41,15 @@ void OnlineASR::insertAudioChunk(std::span audio) { } ProcessResult OnlineASR::process(const DecodingOptions &options) { + constexpr float kStreamSafeBufferMaxSamples = + params::kStreamSafeBufferDuration * constants::kSamplingRate; + std::vector audioCopy; // Copy the audio buffer to avoid keeping the lock during the entire // transcription process. { - std::scoped_lock lock(streamingMutex); + std::scoped_lock lock(streamingMutex); audioCopy = audioBuffer_; } @@ -60,25 +63,23 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { std::vector transcriptions = asr_->transcribe(input, options); // Flatten segments into a single word sequence. - // This is basically our 'nonCommitted' part for now. + // This is our 'nonCommitted' part for now. std::vector words; for (auto &segment : transcriptions) { - std::move(segment.words.begin(), segment.words.end(), - std::back_inserter(words)); + std::ranges::move(segment.words, std::back_inserter(words)); } // Aquire lock for the rest of the method (extensive usage of audioBuffer_). - std::scoped_lock lock(streamingMutex); + std::scoped_lock lock(streamingMutex); // Step 1: examine all previously saved EOS points. // The idea is to remove entries which have changed or no longer exist // due to model correcting it's output. 
- for (size_t i = 0; i < memory_.eos.size(); i++) { - const auto &eos = memory_.eos[i]; - if (eos.position >= words.size() || !utils::isEos(words[eos.position]) || - (eos.position > 0 && - eos.preceeding != words[eos.position - 1].content)) { - memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end()); + for (auto it = memory_.eos.begin(); it != memory_.eos.end(); it++) { + if (it->position >= words.size() || !utils::isEos(words[it->position]) || + (it->position > 0 && + it->preceeding != words[it->position - 1].content)) { + memory_.eos.erase(it, memory_.eos.end()); break; } } @@ -92,7 +93,6 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { // Because of step 1, we know that if the last EOS exist in eos_, // then it must be the last entry. if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) { - // Register last EOS entry std::string preceeding = lastEosIndex > 0 ? words[lastEosIndex - 1].content : ""; memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end); @@ -115,8 +115,8 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { // in a 'good' spot - where it will remove a significant audio chunk, yet // won't affect most recent, unfinished speech samples. size_t bufferSize = audioBuffer_.size(); - if (bufferSize > static_cast(params::kStreamSafeBufferDuration * - constants::kSamplingRate)) { + if (std::cmp_greater(bufferSize, + kStreamSafeBufferMaxSamples)) { auto newCommitted = commitAndClean(words); committed.insert(committed.end(), @@ -139,7 +139,7 @@ std::vector OnlineASR::finish(const DecodingOptions &options) { // Last-tick committed delta + whatever never made it past the commit // threshold. 
- std::vector residual = std::move(result.committed); + std::vector residual{std::move(result.committed)}; residual.insert(residual.end(), std::make_move_iterator(result.nonCommitted.begin()), std::make_move_iterator(result.nonCommitted.end())); @@ -150,7 +150,7 @@ std::vector OnlineASR::finish(const DecodingOptions &options) { } void OnlineASR::reset() { - std::scoped_lock lock(streamingMutex); + std::scoped_lock lock(streamingMutex); audioBuffer_.clear(); @@ -161,8 +161,16 @@ void OnlineASR::reset() { } std::vector OnlineASR::commitAndClean(std::vector &transcript) { + constexpr float kMidpointAnchorTime = params::kStreamMaxDuration / 2.0F; + constexpr size_t kMidpointAnchorSamples = + static_cast(kMidpointAnchorTime * constants::kSamplingRate); + constexpr size_t kSafetyMarginSamples = static_cast( + params::kStreamSafetyThreshold * constants::kSamplingRate); + constexpr float kMaxSafeEosTime = + params::kStreamSafeBufferDuration - params::kStreamSafetyThreshold; + constexpr float kMinDurationToCalculateDensity = 0.1F; + const size_t bufferSize = audioBuffer_.size(); - const float midBufferThreshold = params::kStreamMaxDuration / 2.0F; std::vector committed; @@ -184,35 +192,30 @@ std::vector OnlineASR::commitAndClean(std::vector &transcript) { const float eosTimestamp = memory_.eos[0].tmstpend; const float upperHalfDuration = - std::max(0.0F, eosTimestamp - midBufferThreshold); + std::max(0.0F, eosTimestamp - kMidpointAnchorTime); const float wordsPerSecond = - upperHalfDuration > 0.1F + upperHalfDuration > kMinDurationToCalculateDensity ? static_cast(transcript.size()) / upperHalfDuration : 0.0F; // The EOS sits early enough that cutting up to the safety margin won't // touch the ongoing (post-EOS) speech. 
- const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration - - params::kStreamSafetyThreshold; + const bool eosSafe = eosTimestamp < kMaxSafeEosTime; if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) { // EOS lies past the midpoint, but a low word density implies the spoken // audio is concentrated in the upper half. Drop the lower half and // shift the EOS accordingly. audioBuffer_.erase(audioBuffer_.begin(), - audioBuffer_.begin() + - static_cast(midBufferThreshold * - constants::kSamplingRate)); - memory_.eos[0].tmstpend -= midBufferThreshold; + audioBuffer_.begin() + kMidpointAnchorSamples); + memory_.eos[0].tmstpend -= kMidpointAnchorTime; } else { // Cut everything up to and including the sentence — either by the // safety margin (when EOS is early) or (more aggresively) right at the // EOS boundary — and commit its words. const size_t cut = eosSafe - ? bufferSize - - static_cast(params::kStreamSafetyThreshold * - constants::kSamplingRate) + ? bufferSize - kSafetyMarginSamples : static_cast(eosTimestamp * constants::kSamplingRate); audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); @@ -237,8 +240,6 @@ std::vector OnlineASR::commitAndClean(std::vector &transcript) { audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut); - // Move all words up to the last committed position (inclusive) to the - // committed buffer. 
committed.insert( committed.end(), std::make_move_iterator(transcript.begin()), std::make_move_iterator(transcript.begin() + lastCommittedPos + 1)); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h index 48c84a84b7..ae461c27cf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h @@ -14,7 +14,7 @@ namespace rnexecutorch::models::speech_to_text::whisper::utils { * * @param word The word to check. */ -constexpr inline bool isEos(const Word &word) { +inline bool isEos(const Word &word) { return word.content.size() == 1 && constants::kEosPunctations.contains(word.content[0]); } From c5b142d9a946dc04ba8dc0da9c977cb860b72894 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Tue, 19 May 2026 13:17:24 +0200 Subject: [PATCH 10/20] Rebase with main --- .../src/constants/modelUrls.ts | 36 +++++++++---------- .../references/reference-models.md | 21 ++++++----- .../references/reference-models.md | 21 ++++++----- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 2a15be25dd..aec9da1c0f 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -773,29 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = { } as const; // S2T -const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`; -const WHISPER_TINY_EN_MODEL_COREML = 
`${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`; +const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`; +const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`; -const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`; -const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`; +const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`; +const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`; -const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`; -const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`; +const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`; -const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; -const 
WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`; -const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`; +const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`; +const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`; -const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`; -const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`; +const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`; +const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`; -const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`; -const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`; -const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`; +const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`; +const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`; /** * @category Models - Speech To Text diff --git 
a/skills/canary/react-native-executorch/references/reference-models.md b/skills/canary/react-native-executorch/references/reference-models.md index f6010a7793..02134f4513 100644 --- a/skills/canary/react-native-executorch/references/reference-models.md +++ b/skills/canary/react-native-executorch/references/reference-models.md @@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c ### Whisper Models (English only) -- **WHISPER_TINY_EN** - Whisper Tiny English-only -- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized -- **WHISPER_BASE_EN** - Whisper Base English-only -- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized -- **WHISPER_SMALL_EN** - Whisper Small English-only -- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized +- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK) +- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML) +- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK) +- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML) +- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK) +- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML) ### Whisper Models (Multilingual) -- **WHISPER_TINY** - Whisper Tiny multilingual -- **WHISPER_BASE** - Whisper Base multilingual -- **WHISPER_SMALL** - Whisper Small multilingual +- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK) +- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML) +- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK) +- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML) +- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK) +- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML) --- diff --git a/skills/react-native-executorch/references/reference-models.md b/skills/react-native-executorch/references/reference-models.md index f6010a7793..02134f4513 100644 --- 
a/skills/react-native-executorch/references/reference-models.md +++ b/skills/react-native-executorch/references/reference-models.md @@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c ### Whisper Models (English only) -- **WHISPER_TINY_EN** - Whisper Tiny English-only -- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized -- **WHISPER_BASE_EN** - Whisper Base English-only -- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized -- **WHISPER_SMALL_EN** - Whisper Small English-only -- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized +- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK) +- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML) +- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK) +- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML) +- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK) +- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML) ### Whisper Models (Multilingual) -- **WHISPER_TINY** - Whisper Tiny multilingual -- **WHISPER_BASE** - Whisper Base multilingual -- **WHISPER_SMALL** - Whisper Small multilingual +- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK) +- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML) +- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK) +- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML) +- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK) +- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML) --- From 6bba141d959abb030c1bace7dbbc214b6573f4c9 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Wed, 20 May 2026 13:39:27 +0200 Subject: [PATCH 11/20] Minor fixes --- .../models/speech_to_text/whisper/ASR.cpp | 13 +++++++------ .../models/speech_to_text/whisper/OnlineASR.cpp | 14 ++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index 5a925e6eba..dbf7155d23 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -1,8 +1,3 @@ -#include -#include -#include -#include - #include "ASR.h" #include "Constants.h" #include "Params.h" @@ -11,6 +6,12 @@ #include #include +#include +#include +#include +#include +#include + namespace rnexecutorch::models::speech_to_text::whisper { using executorch::runtime::etensor::ScalarType; @@ -451,7 +452,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, puncts += w.back(); w.pop_back(); } - std::reverse(puncts.begin(), puncts.end()); + std::ranges::reverse(puncts); // Add the core word. wordObjs.emplace_back(std::move(w), wStart, wEnd); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp index ced3193531..0567716bbe 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp @@ -1,14 +1,13 @@ #include "OnlineASR.h" +#include "Constants.h" +#include "Params.h" +#include "Utils.h" #include #include #include #include -#include "Constants.h" -#include "Params.h" -#include "Utils.h" - namespace rnexecutorch::models::speech_to_text::whisper::stream { OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) { @@ -41,8 +40,8 @@ void OnlineASR::insertAudioChunk(std::span audio) { } ProcessResult OnlineASR::process(const DecodingOptions &options) { - constexpr float kStreamSafeBufferMaxSamples = - params::kStreamSafeBufferDuration * 
constants::kSamplingRate; + constexpr size_t kStreamSafeBufferMaxSamples = static_cast( + params::kStreamSafeBufferDuration * constants::kSamplingRate); std::vector audioCopy; @@ -115,8 +114,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) { // in a 'good' spot - where it will remove a significant audio chunk, yet // won't affect most recent, unfinished speech samples. size_t bufferSize = audioBuffer_.size(); - if (std::cmp_greater(bufferSize, - kStreamSafeBufferMaxSamples)) { + if (bufferSize > kStreamSafeBufferMaxSamples) { auto newCommitted = commitAndClean(words); committed.insert(committed.end(), From 1aebae62ef64abe920a4324f95f39812a3d20087 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Wed, 20 May 2026 17:59:52 +0200 Subject: [PATCH 12/20] Fix broken test build --- .../common/rnexecutorch/tests/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 06a30a13f7..1fcad420cc 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -262,7 +262,6 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/speech_to_text/SpeechToText.cpp ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/ASR.cpp - ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/HypothesisBuffer.cpp ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/OnlineASR.cpp ${RNEXECUTORCH_DIR}/data_processing/gzip.cpp ${TOKENIZER_SOURCES} From 88185d5dcfbb3b659348d6dc609437b50c460667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:22:13 +0200 Subject: [PATCH 13/20] chore(stt): drop unused transcribeStringOnly declaration The method was declared in SpeechToText.h but never defined or referenced anywhere in the package. 
Removing it cleans up the public API surface. --- .../common/rnexecutorch/models/speech_to_text/SpeechToText.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index ec51862793..ae053008cd 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -34,10 +34,6 @@ class SpeechToText { std::string languageOption, bool verbose) const; - [[nodiscard("Registered non-void function")]] - std::vector transcribeStringOnly(std::span waveform, - std::string languageOption) const; - size_t getMemoryLowerBound() const noexcept; // Stream From 44f69310636a0b90327c87d0ba4c54bc7f377bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:23:20 +0200 Subject: [PATCH 14/20] fix(stt): preserve pending committed words in OnlineASR insertAudioChunk's overflow path was overwriting memory_.toCommit on each cap-hit. Two cap-hits before the next process() call silently dropped the first batch. Append instead of assign. --- .../models/speech_to_text/whisper/OnlineASR.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp index 0567716bbe..e663c5bfab 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp @@ -34,8 +34,11 @@ void OnlineASR::insertAudioChunk(std::span audio) { // process() method is called regularly within reasonable steps of time. 
if (audioBuffer_.size() > constants::kMaxSamples) { // Note that results are not actually committed now, but saved for - // a later call of process(). - memory_.toCommit = commitAndClean(memory_.transcript); + // a later call of process(). Append rather than assign so that two + // back-to-back buffer-cap hits (e.g. while VAD is muted) don't drop the + // first batch. + auto pending = commitAndClean(memory_.transcript); + std::ranges::move(pending, std::back_inserter(memory_.toCommit)); } } From aaeac64ba01284525a53d92a99dc7c716ada9c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:23:44 +0200 Subject: [PATCH 15/20] fix(stt): use scores.size() as avgLogProb denominator The previous tokens.size() + 1 matched neither a literal mean (would be scores.size()) nor OpenAI Whisper's formula (len(full_seq) + 1, where full_seq includes the SOT prefix and EOT). Align with whisper.cpp, which divides by the number of summed log-probs. --- .../common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index dbf7155d23..d57a06cd6c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -214,7 +214,9 @@ std::vector ASR::generate(std::span waveform, scores.begin(), scores.end(), 0.0f, std::plus<>(), [](float s) { return std::log(std::max(s, 1e-9f)); }); - const float avgLogProb = cumLogProb / static_cast(tokens.size() + 1); + // Match whisper.cpp: divide by the number of summed log-probs. 
+ const float avgLogProb = + cumLogProb / static_cast(std::max(1, scores.size())); const std::string text = tokenizer_->decode(tokens, true); const float compressionRatio = this->calculateCompressionRatio(text); From 7f465402f208b89c2bcb15a18ab74a1ae0824df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:24:21 +0200 Subject: [PATCH 16/20] perf(stt): hoist mt19937 out of the sampling loop random_device was consulted and a fresh Mersenne state constructed for every sampled token. Seed once per generate() call instead. --- .../common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index d57a06cd6c..a9f2b152b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -275,6 +275,9 @@ ASR::generate(std::span waveform, const DecodingOptions &options, logitsTensor = this->decode(single, encoderFeatures, startPos); } + // Seed once per generate() call rather than per sampled token. 
+ std::mt19937 gen(std::random_device{}()); + // Autoregressive decoding: always 1 token at a time while (std::cmp_less(startPos, constants::kMaxDecodeLength)) { const size_t logitsInnerDim = logitsTensor.size(1); @@ -307,7 +310,6 @@ ASR::generate(std::span waveform, const DecodingOptions &options, nextProb = *maxIt; } else { std::discrete_distribution<> dist(probs.begin(), probs.end()); - std::mt19937 gen((std::random_device{}())); nextId = dist(gen); nextProb = probs[nextId]; } From 72912ade9f72c6c49f6bc3c54d0aacdbb1e9b615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:26:30 +0200 Subject: [PATCH 17/20] fix(stt)!: drop quantized variants from SpeechToTextModelName MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The whisper-*-en-quantized constants are removed in this PR, but the SpeechToTextModelName union still accepted those literals — type-safe to pass, runtime-failing to use. Drop them from the union as part of the same breaking-change. 
--- packages/react-native-executorch/src/types/stt.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index 20f1013ef0..f9a2fb56d8 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -7,11 +7,8 @@ import { RnExecutorchError } from '../errors/errorUtils'; */ export type SpeechToTextModelName = | 'whisper-tiny-en' - | 'whisper-tiny-en-quantized' | 'whisper-base-en' - | 'whisper-base-en-quantized' | 'whisper-small-en' - | 'whisper-small-en-quantized' | 'whisper-tiny' | 'whisper-base' | 'whisper-small'; From 06059503eb1907986b0b8e5d30fb1564761762bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:26:46 +0200 Subject: [PATCH 18/20] chore(stt): align stream() declaration with definition The header had bool enableTimestamps; the .cpp uses bool verbose (which matches the JS-side DecodingOptions.verbose). Rename here for consistency. 
--- .../common/rnexecutorch/models/speech_to_text/SpeechToText.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index ae053008cd..16e94ef88b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -38,8 +38,7 @@ class SpeechToText { // Stream void stream(std::shared_ptr callback, - std::string languageOption, bool enableTimestamps, - uint32_t timeout); + std::string languageOption, bool verbose, uint32_t timeout); void streamStop(); void streamInsert(std::span waveform); From a5c366273a8a7b3dbd0cf12b15d19e2f0bf1cb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 20 May 2026 19:27:45 +0200 Subject: [PATCH 19/20] fix(stt): break streamStop() out of the timeout pause immediately The streaming loop slept sleep_for(timeout) ms unconditionally between inferences, so streamStop() couldn't take effect until the next pause expired (final flush delayed by the full timeout). Replace with a condition_variable wait that streamStop() signals; inserts intentionally do not wake the loop, preserving the throttle. 
--- .../models/speech_to_text/SpeechToText.cpp | 16 +++++++++++++--- .../models/speech_to_text/SpeechToText.h | 7 +++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 9537642d58..3acd076779 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -158,8 +158,12 @@ void SpeechToText::stream(std::shared_ptr callback, // The reasoning is very simple: with the current liberal threshold values, // running transcriptions too rapidly (before the audio buffer is filled // with significant amount of new data) can cause streamer to commit wrong - // phrases. - std::this_thread::sleep_for(std::chrono::milliseconds(timeout)); + // phrases. We wait on a condition_variable so streamStop() can break the + // pause immediately — inserts intentionally do not wake us, to preserve + // the throttle. 
+ std::unique_lock lock(streamCvMutex_); + streamCv_.wait_for(lock, std::chrono::milliseconds(timeout), + [this] { return !isStreaming_.load(); }); } std::vector finalWords = streamer_->finish(options); @@ -170,7 +174,13 @@ void SpeechToText::stream(std::shared_ptr callback, resetStreamState(); } -void SpeechToText::streamStop() { isStreaming_ = false; } +void SpeechToText::streamStop() { + // Flip the flag under streamCvMutex_: an unlocked store + notify can slip in + // between the waiter's predicate check and its block, losing the wake-up. + std::scoped_lock lock(streamCvMutex_); + isStreaming_ = false; + streamCv_.notify_all(); +} void SpeechToText::streamInsert(std::span waveform) { streamer_->insertAudioChunk(waveform); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index 16e94ef88b..adcfd8ae99 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include #include @@ -54,6 +56,11 @@ class SpeechToText { std::unique_ptr streamer_ = nullptr; std::atomic isStreaming_ = false; std::atomic readyToProcess_ = false; + + // Lets streamStop() wake the streaming loop immediately instead of + // waiting for the next throttling interval to expire.
+ std::mutex streamCvMutex_; + std::condition_variable streamCv_; }; } // namespace models::speech_to_text From ef92351be9e3c32327d8a31bb4a8068c7cc3b45e Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Thu, 21 May 2026 09:59:47 +0200 Subject: [PATCH 20/20] docs: simplify & update STT docs --- .../useSpeechToText.md | 321 ++++++------------ .../SpeechToTextModule.md | 215 +++++------- 2 files changed, 176 insertions(+), 360 deletions(-) diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md index 089b844eb0..dc9f88179c 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md @@ -17,20 +17,31 @@ keywords: description: "Learn how to use speech-to-text models in your React Native applications with React Native ExecuTorch's useSpeechToText hook." --- -Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. +Speech to text (STT) converts spoken audio into written text. This hook allows you to implement features like voice assistants, real-time transcription, and audio file processing directly on-device. :::info -It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +We recommend using our optimized models available on [Hugging Face](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). 
You can also use pre-defined [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) included in the library. ::: ## API Reference -- For detailed API Reference for `useSpeechToText` see: [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md). -- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text). +- [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md) +- [STT Models List](../../06-api-reference/index.md#models---speech-to-text) -## High Level Overview +## Basic Usage (File Transcription) -You can obtain waveform from audio in any way most suitable to you, however in the snippet below we utilize [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) library to process a `.mp3` file. +Use `transcribe` for processing pre-recorded audio or short clips. The input should be a `Float32Array` of audio samples at **16 kHz**. + +### Transcribe Options + +The `transcribe()` function accepts an optional configuration object: + +- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models. +- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps). + +In this example, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to decode an audio file into the required format. + +### Example ```typescript import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; @@ -41,181 +52,46 @@ const model = useSpeechToText({ model: WHISPER_TINY_EN, }); +// 1. 
Get audio file const { uri } = await FileSystem.downloadAsync( 'https://some-audio-url.com/file.mp3', - FileSystem.cacheDirectory + 'audio_file' + `${FileSystem.cacheDirectory}audio_file` ); +// 2. Decode to 16kHz PCM Float32Array const audioContext = new AudioContext({ sampleRate: 16000 }); const decodedAudioData = await audioContext.decodeAudioData(uri); const audioBuffer = decodedAudioData.getChannelData(0); +// 3. Transcribe try { - const transcription = await model.transcribe(audioBuffer); - console.log(transcription.text); + const result = await model.transcribe(audioBuffer); + console.log('Transcription:', result.text); } catch (error) { - console.error('Error during audio transcription', error); + console.error('Transcription failed:', error); } ``` -### Streaming - -Since speech-to-text models can only process audio segments up to 30 seconds long, we need to split longer inputs into chunks. However, simple chunking may cut speech mid-sentence, making it harder for the model to understand. To address this, we use the [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) algorithm. While this introduces some overhead, it enables accurate processing of audio inputs of arbitrary length. - -### Arguments - -`useSpeechToText` takes [`SpeechToTextProps`](../../06-api-reference/interfaces/SpeechToTextProps.md) that consists of: - -- `model` of type [`SpeechToTextConfig`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md), containing the [`isMultilingual` flag](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual), [tokenizer source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#tokenizersource) and [model source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource). -- An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model. - -You need more details? 
Check the following resources: +## Live Streaming Transcription -- For detailed information about `useSpeechToText` arguments check this section: [`useSpeechToText` arguments](../../06-api-reference/functions/useSpeechToText.md#parameters) -- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text). -- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. - -### Returns - -`useSpeechToText` returns an object called `SpeechToTextType` containing bunch of functions to interact with STT. - -Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) functions accept [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) type as an argument. It accepts language abbreviation, you can check them out in [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) property of this config of type [`SpeechToTextLanguage`](../../06-api-reference/type-aliases/SpeechToTextLanguage.md). - -To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md). - -## Running the model - -Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails. - -### Multilingual transcription - -If you want to transcribe speech in languages other than English, use the multilingual version of Whisper. 
To generate the output in your desired language, pass the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) option to the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method. - -```typescript -import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch'; +For real-time applications or audio streams of arbitrary length, use the **Streaming API**. This is optimized for live input, handling the 30-second window limitation of Whisper models automatically to ensure context isn't lost between chunks. -const model = useSpeechToText({ - model: WHISPER_TINY, -}); +### How it works: -const transcription = await model.transcribe(spanishAudio, { language: 'es' }); -``` +1. **Feed audio**: Use `streamInsert` to push small chunks of audio (e.g., 100ms) as they arrive from the microphone. +2. **Get results**: The `stream` generator yields two types of text: + - `committed`: Finalized text that won't change. + - `nonCommitted`: Temporary text that might update as the model gets more context from the audio. -### Timestamps & Transcription Stat Data +### Streaming Options -You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References. 
+The `stream()` function accepts several optional parameters: -```typescript -const transcription = await model.transcribe(audioBuffer, { verbose: true }); -// Example result -// -// transcription: { -// task: "transcription", -// text: "Example text for a ...", -// duration: 9.05, -// language: "en", -// segments: [ -// { -// start: 0, -// end: 5.4, -// text: "Example text for", -// words: [ -// { -// word: "Example", -// start: 0, -// end: 1.4 -// }, -// ... -// ] -// tokens: [1, 32, 45, ...], -// temperature: 0.0, -// avgLogprob: -1.235, -// compressionRatio: 1.632 -// }, -// ... -// ] -// } -``` +- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models. +- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects. +- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`. 
-## Example - -```tsx -import React, { useState } from 'react'; -import { Button, Text, View } from 'react-native'; -import { - useSpeechToText, - WHISPER_TINY_EN, - TranscriptionResult, -} from 'react-native-executorch'; -import { AudioContext } from 'react-native-audio-api'; -import * as FileSystem from 'expo-file-system'; - -function App() { - const model = useSpeechToText({ - model: WHISPER_TINY_EN, - }); - - const [transcription, setTranscription] = useState(null); - - const loadAudio = async () => { - const { uri } = await FileSystem.downloadAsync( - 'https://some-audio-url.com/file.mp3', - FileSystem.cacheDirectory + 'audio_file' - ); - - const audioContext = new AudioContext({ sampleRate: 16000 }); - const decodedAudioData = await audioContext.decodeAudioDataSource(uri); - const audioBuffer = decodedAudioData.getChannelData(0); - - return audioBuffer; - }; - - const handleTranscribe = async () => { - const audio = await loadAudio(); - // Default text transcription - const result = await model.transcribe(audio); - setTranscription(result); - }; - - const handleTranscribeWithTimestamps = async () => { - const audio = await loadAudio(); - // Transcription with timestamps - const result = await model.transcribe(audio, { verbose: true }); - setTranscription(result); - }; - - // Custom logic for printing transcription - // e.g. - - const renderContent = () => { - if (!transcription) return Press a button to transcribe; - - if (transcription.segments && transcription.segments.length > 0) { - return ( - - {transcription.text + - '\n\nNum segments: ' + - transcription.segments.length.toString()} - - ); - } - return {transcription.text}; - }; - - return ( - - {renderContent()} -