From bc31cd5528a71612225e4e67f98f1db15ccd8282 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Thu, 7 May 2026 12:10:30 +0200
Subject: [PATCH 01/20] Optimal streaming algorithm

---
 .../host_objects/JsiConversions.h             |   3 +-
 .../models/speech_to_text/SpeechToText.cpp    |   4 +-
 .../speech_to_text/common/schema/OnlineASR.h  |   2 +-
 .../models/speech_to_text/common/types/Word.h |   7 +-
 .../models/speech_to_text/whisper/ASR.cpp     |  16 +-
 .../models/speech_to_text/whisper/Constants.h |  15 +-
 .../whisper/HypothesisBuffer.cpp              | 199 --------------
 .../speech_to_text/whisper/HypothesisBuffer.h |  82 ------
 .../speech_to_text/whisper/OnlineASR.cpp      | 258 +++++++++++-------
 .../models/speech_to_text/whisper/OnlineASR.h |  65 ++---
 .../models/speech_to_text/whisper/Params.h    |  95 ++-----
 .../models/speech_to_text/whisper/Utils.h     |  67 +----
 12 files changed, 245 insertions(+), 568 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 077d426c8f..c50410a4f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -591,8 +591,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) {
     jsi::Object wordObj(runtime);
     wordObj.setProperty(
         runtime, "word",
-        jsi::String::createFromUtf8(runtime, seg.words[i].content +
-                                                 seg.words[i].punctations));
+        jsi::String::createFromUtf8(runtime, seg.words[i].content));
     wordObj.setProperty(runtime, "start",
                         static_cast<double>(seg.words[i].start));
     wordObj.setProperty(runtime, "end", static_cast<double>(seg.words[i].end));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 4b58c5039b..3133c0bb29 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
 
   std::string fullText;
   for (const auto &w : words) {
-    fullText += w.content + w.punctations;
+    fullText += w.content;
   }
   res.text = fullText;
 
@@ -161,7 +161,7 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     std::this_thread::sleep_for(std::chrono::milliseconds(100));
   }
 
-  std::vector<Word> finalWords = streamer_->finish();
+  std::vector<Word> finalWords = streamer_->finish(options);
   TranscriptionResult finalRes =
       wordsToResult(finalWords, languageOption, verbose);
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
index 357309391d..efe6cc2819 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
@@ -36,7 +36,7 @@ class OnlineASR {
 
   virtual ProcessResult process(const DecodingOptions &options) = 0;
 
-  virtual std::vector<Word> finish() = 0;
+  virtual std::vector<Word> finish(const DecodingOptions &options) = 0;
 
   virtual void reset() = 0;
 };
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index e7319f95b5..2343d1faab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -4,13 +4,14 @@
 
 namespace rnexecutorch::models::speech_to_text {
 
+/**
+ * Basically a different representation of token,
+ * with timestamps calculated.
+ */
 struct Word {
   std::string content;
   float start;
   float end;
-
-  std::string
-      punctations; // Trailing punctations which appear after the main content
 };
 
 } // namespace rnexecutorch::models::speech_to_text
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d1debeb0f0..b0d08e419b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -138,8 +138,9 @@ executorch::aten::Tensor ASR::decode(std::span<uint64_t> tokens,
       positionShape, cachePositions.data(), ScalarType::Long);
 
   const auto encoderOutputSize = static_cast<int32_t>(encoderOutput.size());
-  std::vector<int32_t> encShape = {1, constants::kNumFrames,
-                                   encoderOutputSize / constants::kNumFrames};
+  std::vector<int32_t> encShape = {
+      1, static_cast<int32_t>(constants::kNumFrames),
+      encoderOutputSize / static_cast<int32_t>(constants::kNumFrames)};
   auto encoderTensor = executorch::extension::make_tensor_ptr(
       std::move(encShape), const_cast<float *>(encoderOutput.data()),
       ScalarType::Float);
@@ -437,7 +438,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
     const float wEnd = wStart + timePerChar * wSize;
     prevCharCount += wSize;
 
-    // We store punctations separately to other characters.
+    // Detect and extract trailing punctuations.
     std::string puncts = "";
     while (!w.empty() && constants::kPunctations.contains(w.back())) {
       puncts += w.back();
@@ -445,7 +446,14 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
     }
     std::reverse(puncts.begin(), puncts.end());
 
-    wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts));
+    // Add the core word.
+    wordObjs.emplace_back(std::move(w), wStart, wEnd);
+
+    // If punctuation was present, add it as a separate "word" with an
+    // instantaneous timestamp at the end of the original word.
+    if (!puncts.empty()) {
+      wordObjs.emplace_back(std::move(puncts), wEnd, wEnd);
+    }
   }
 
   return wordObjs;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
index 0b284345ec..30062a75ba 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
@@ -9,20 +9,22 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants {
 // Maximum duration of each audio chunk to process (in seconds)
 // It is intentionally set to 29 since otherwise only the last chunk would be
 // correctly transcribe due to the model's positional encoding limit
-constexpr static int32_t kChunkSize = 29;
+constexpr static size_t kChunkSize = 29;
 
 // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
-constexpr static int32_t kSamplingRate = 16000;
-constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+constexpr static size_t kSamplingRate = 16000;
+constexpr static size_t kSamplesPerMilisecond = kSamplingRate / 1000;
+
+constexpr static size_t kMaxSamples = kChunkSize * kSamplingRate;
 
 // The maximum number of tokens the decoder can generate per chunk
-constexpr static int32_t kMaxDecodeLength = 128;
+constexpr static size_t kMaxDecodeLength = 128;
 
 // Minimum allowed chunk length before processing (in audio samples)
-constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate;
+constexpr static size_t kMinChunkSamples = 1 * kSamplingRate;
 
 // Number of mel frames output by the encoder (derived from input spectrogram)
-constexpr static int32_t kNumFrames = 1500;
+constexpr static size_t kNumFrames = 1500;
 
 // Time precision used by Whisper timestamps: each token spans 0.02 seconds
 constexpr static float kTimePrecision = 0.02f;
@@ -30,6 +32,7 @@ constexpr static float kTimePrecision = 0.02f;
 // Special characters serving as pause / end of sentence
 static const std::unordered_set<char> kPunctations = {',', '.', '?',
                                                       '!', ':', ';'};
+static const std::unordered_set<char> kEosPunctations = {'.', '?', '!', ';'};
 
 // Special token constants
 namespace tokens {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
deleted file mode 100644
index ce365e4e44..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#include "HypothesisBuffer.h"
-#include "Params.h"
-#include "Utils.h"
-
-#include <algorithm>
-#include <cmath>
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
-  // Step 1 - decide which words should be considered as fresh.
-  fresh_.clear();
-
-  // We try to find the last committed word in a transcription string.
-  // Everything beyond that word will be considered as fresh.
-  // To make the algorithm more resilient to repeated strings of words,
-  // we check also the preceeding words as well as timestamps (with liberal
-  // range).
-  size_t firstFreshWordIdx = 0;
-  if (!committed_.empty()) {
-    std::optional<size_t> lastMatchingWordIdx =
-        findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize,
-                            params::kStreamMaxOverlapTimestampDiff1,
-                            params::kStreamWordsPerErrorRate);
-    firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
-  }
-
-  bool isCompletelyFresh = firstFreshWordIdx == 0;
-  for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
-    const auto &word = words[i];
-
-    // Global start is a beginning timestamp relative only to the beginning of
-    // the current streaming process.
-    const float startGlobal = word.start + offset;
-    const float endGlobal = word.end + offset;
-
-    if (!isCompletelyFresh ||
-        startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
-      fresh_.emplace_back(word.content, startGlobal, endGlobal,
-                          word.punctations);
-    }
-  }
-
-  // Step 2 - we have already selected the fresh words. Now it's time to
-  // correct any mistakes and remove the words which overlap with already
-  // commited segments - to avoid duplicates.
-  if (!fresh_.empty() && !committed_.empty()) {
-    // Calculate the largest overlapping fragment size.
-    // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the
-    // algorithm, and timestamp difference limit
-    // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments
-    // which were just repeated after some time.
-    size_t overlapSize = utils::findLargestOverlapingFragment(
-        committed_, fresh_, params::kStreamMaxOverlapSize,
-        params::kStreamMaxOverlapTimestampDiff2);
-
-    if (overlapSize > 0) {
-      fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
-    }
-  }
-}
-
-std::deque<Word> HypothesisBuffer::commit() {
-  std::deque<Word> toCommit = {};
-
-  // Find a stable prefix: words that haven't changed between last and current
-  // iteration.
-  while (!fresh_.empty() && !hypothesis_.empty() &&
-         fresh_.front().content == hypothesis_.front().content) {
-    // The last word from the fresh_ buffer must also match punctations with the
-    // hypothesis. This is done in order to ensure correct punctation marks in
-    // the resulting transcription.
-    if (fresh_.size() == 1 &&
-        fresh_.front().punctations != hypothesis_.front().punctations) {
-      break;
-    }
-
-    // Take timestamps from the hypothesis, but actual content from the fresh
-    // buffer.
-    toCommit.emplace_back(std::move(fresh_.front().content),
-                          hypothesis_.front().start, hypothesis_.front().end,
-                          std::move(fresh_.front().punctations));
-    fresh_.pop_front();
-    hypothesis_.pop_front();
-  }
-
-  // Save the last committed word timestamp.
-  // This will mark the end of the entire committed sequence.
-  if (!toCommit.empty()) {
-    lastCommittedTime_ = toCommit.back().end;
-  }
-
-  // The remaining words from the fresh buffer (uncommitted phrase)
-  // become a hypothesis for the next iteration.
-  hypothesis_ = std::move(fresh_);
-  fresh_.clear();
-
-  // The last step is to commit the selected words.
-  committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend());
-
-  return toCommit;
-}
-
-void HypothesisBuffer::releaseCommits(size_t wordsToKeep) {
-  if (committed_.size() > wordsToKeep) {
-    size_t nWordsToErase = committed_.size() - wordsToKeep;
-    committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase);
-  }
-}
-
-void HypothesisBuffer::reset() {
-  fresh_.clear();
-  hypothesis_.clear();
-  committed_.clear();
-
-  lastCommittedTime_ = 0.f;
-}
-
-std::optional<size_t> HypothesisBuffer::findCommittedSuffix(
-    std::span<const Word> words, size_t nCommitted,
-    float timestampDiffTolerance, size_t wordsPerMistake) {
-  if (words.empty() || committed_.empty() || nCommitted == 0) {
-    return std::nullopt;
-  }
-
-  // Determine the subset size of committed words to check against.
-  size_t committedToMatchSize = std::min(nCommitted, committed_.size());
-
-  // Iterate backwards through 'words' to find the most recent occurrence of a
-  // suffix of 'committed_' (or the full 'committed_' sequence).
-  for (int32_t i = static_cast<int32_t>(words.size()) - 1; i >= 0; --i) {
-    bool match = true;
-    size_t matchedCount = 0;
-    size_t contentMistakeCount = 0;
-
-    // Linearly interpolate tolerance if we are at the beginning and can't check
-    // all committed words.
-    float effectiveTolerance = timestampDiffTolerance;
-    if (i < static_cast<int32_t>(committedToMatchSize) - 1) {
-      effectiveTolerance *=
-          static_cast<float>(i + 1) / static_cast<float>(committedToMatchSize);
-    }
-
-    // Try to match backwards from words[i] and committed_.back()
-    for (size_t j = 0; j < committedToMatchSize; ++j) {
-      int32_t wordsIdx = i - static_cast<int32_t>(j);
-      int32_t committedIdx =
-          static_cast<int32_t>(committed_.size()) - 1 - static_cast<int32_t>(j);
-
-      if (wordsIdx < 0) {
-        // We reached the beginning of the words span.
-        // The algorithm allows matching a partial prefix if it's at the start.
-        break;
-      }
-
-      const Word &w1 = words[wordsIdx];
-      const Word &w2 = committed_[committedIdx];
-
-      // Check timestamps within tolerance
-      if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) >
-          effectiveTolerance) {
-        match = false;
-        break;
-      }
-
-      // Allow sparse content mismatches while still treating the overall
-      // sequence as matching.
-      if (utils::equalsIgnoreCase(w1.content, w2.content)) {
-        matchedCount++;
-      } else {
-        contentMistakeCount++;
-      }
-
-      // Early exit if mistake count already exceeds what we can recover from
-      // given the remaining words to check.
-      if (wordsPerMistake > 0) {
-        size_t remainingToMatch = committedToMatchSize - 1 - j;
-        size_t maxPossibleMatched = matchedCount + remainingToMatch;
-        if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) {
-          match = false;
-          break;
-        }
-      }
-    }
-
-    // One content mistake is allowed per M matched words.
-    size_t maxAllowedMistakes =
-        (wordsPerMistake == 0) ? 0 : (matchedCount / wordsPerMistake);
-
-    if (match && matchedCount > 0 &&
-        contentMistakeCount <= maxAllowedMistakes) {
-      return static_cast<size_t>(i);
-    }
-  }
-
-  return std::nullopt;
-}
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
deleted file mode 100644
index 25833ec01b..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-#include <deque>
-#include <optional>
-#include <span>
-
-#include "../common/types/Word.h"
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-/**
- * A buffer for managing streaming transcription hypotheses.
- * This class handles stabilization of the transcription result by tracking
- * "fresh" hypotheses and "committing" them once they are stable across updates.
- */
-class HypothesisBuffer {
-public:
-  /**
-   * Inserts new words into the fresh_ buffer.
-   * Words are filtered based on the last committed time and checked for
-   * overlaps with existing committed words to prevent duplicates.
-   *
-   * @param newWords A span of recently generated words.
-   * @param offset   Time offset to adjust the word timestamps.
-   */
-  void insert(std::span<const Word> words, float offset);
-
-  /**
-   * Attempts to commit words present in the fresh_ buffer.
-   * A phrase from fresh_ buffer can only be committed if it also appears
-   * in the hypothesis_ buffer (uncommitted words from previous iteration).
-   *
-   * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_
-   * buffer.
-   *
-   * @return A sequence of words committed in the current iteration.
-   */
-  std::deque<Word> commit();
-
-  /**
-   * Shrinks the committed_ buffer by erasing all words except N latest ones.
-   *
-   * Used primarily to relieve increasing memory usage during very
-   * long streaming sessions.
-   *
-   * @param wordsToKeep - number of trailing words to be kept in.
-   */
-  void releaseCommits(size_t wordsToKeep);
-
-  /**
-   * Resets all the stored buffers and state variables to the initial state
-   */
-  void reset();
-
-  // Declare a friendship with OnlineASR to allow it to access the internal
-  // state of stored buffers.
-  friend class OnlineASR;
-
-private:
-  // Finds the most recent occurance of given committed string of words
-  // in a custom span of words.
-  // Returns the index of the last matching word (or nullopt if not present).
-  std::optional<size_t> findCommittedSuffix(std::span<const Word> words,
-                                            size_t nCommitted,
-                                            float timestampDiffTolerance = 1.F,
-                                            size_t wordsPerMistake = 4);
-
-  // Stored buffers
-  // The lifecycle of a correct result word looks as following:
-  // fresh buffer -> hypothesis buffer -> commited
-  std::deque<Word>
-      fresh_; // 'New' words from current iterations, which require some checks
-              // before they go into hypothesis_ buffer.
-  std::deque<Word>
-      hypothesis_; // Words potentially to be commited, stored between
-                   // iterations (obtained from fresh_ buffer).
-  std::deque<Word> committed_; // A history of already commited words.
-
-  float lastCommittedTime_ = 0.0f;
-};
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ded2183201..fb57fcb0f3 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,26 +1,22 @@
+#include "OnlineASR.h"
+
 #include <algorithm>
 #include <iterator>
-#include <numeric>
-#include <sstream>
+#include <utility>
 
 #include "Constants.h"
-#include "OnlineASR.h"
 #include "Params.h"
 #include "Utils.h"
 
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
-namespace {
-std::vector<Word> move_to_vector(std::deque<Word> &container) {
-  return std::vector<Word>(std::make_move_iterator(container.begin()),
-                           std::make_move_iterator(container.end()));
+OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
+  // Reserve an expected amount of memory for audio buffer.
+  audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
 }
-} // namespace
 
-OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
-  // Reserve a minimal expected amount of memory for audio buffer.
-  audioBuffer_.reserve(static_cast<size_t>(2 * params::kStreamChunkThreshold *
-                                           constants::kSamplingRate));
+bool OnlineASR::isReady() const {
+  return audioBuffer_.size() >= constants::kMinChunkSamples;
 }
 
 void OnlineASR::insertAudioChunk(std::span<const float> audio) {
@@ -28,10 +24,6 @@ void OnlineASR::insertAudioChunk(std::span<const float> audio) {
   audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
 }
 
-bool OnlineASR::isReady() const {
-  return audioBuffer_.size() >= constants::kMinChunkSamples;
-}
-
 ProcessResult OnlineASR::process(const DecodingOptions &options) {
   std::vector<float> audioCopy;
 
@@ -42,122 +34,176 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
     audioCopy = audioBuffer_;
   }
 
-  std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
+  // Obtain a transcription for current audio buffer state.
+  // It's very unlikely that buffer will exceed whisper's maximum capacity, but
+  // for absolute safety we can additionally clip the buffer.
+  std::span<const float> input(
+      audioCopy.begin(),
+      audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size()));
 
-  if (transcriptions.empty()) {
-    return {.committed = {}, .nonCommitted = {}};
-  }
+  std::vector<Segment> transcriptions = asr_->transcribe(input, options);
 
   // Flatten segments into a single word sequence.
+  // This is basically our 'nonCommitted' part for now.
   std::vector<Word> words;
-  words.reserve(transcriptions.front().words.size());
-
   for (auto &segment : transcriptions) {
-    words.insert(words.end(), std::make_move_iterator(segment.words.begin()),
-                 std::make_move_iterator(segment.words.end()));
+    std::move(segment.words.begin(), segment.words.end(),
+              std::back_inserter(words));
   }
 
-  hypothesisBuffer_.insert(words, bufferTimeOffset_);
-
-  // Apply fix for timestamps.
-  if (!hypothesisBuffer_.fresh_.empty()) {
-    size_t noNewWords = hypothesisBuffer_.fresh_.size();
-    float establishedEnd = hypothesisBuffer_.lastCommittedTime_;
-    float newBegin = hypothesisBuffer_.fresh_.front().start;
-    const float newEnd = hypothesisBuffer_.fresh_.back().end;
-    float shift = 0.F;
-    for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
-      const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-
-      if (i < hypothesisBuffer_.hypothesis_.size() &&
-          utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
-                                  hypothesisBuffer_.hypothesis_[i].content)) {
-        hypothesisBuffer_.fresh_[i].start =
-            hypothesisBuffer_.hypothesis_[i].start;
-        hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end;
-        shift = hypothesisBuffer_.fresh_[i].end - originalEnd;
-
-        establishedEnd = hypothesisBuffer_.hypothesis_[i].end;
-        newBegin = hypothesisBuffer_.fresh_[i].end;
-        noNewWords--;
-        continue;
-      }
-
-      // In case of a new word, we apply timestamp range scaling
-      // based on timestamps established in previous iterations.
-      const float freshDuration = newEnd - establishedEnd;
-      const float epsilon = std::max(
-          0.F, 0.85F * (freshDuration -
-                        static_cast<float>(noNewWords /
-                                           params::kStreamWordsPerSecond)));
-      float scale =
-          (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F);
-      hypothesisBuffer_.fresh_[i].start =
-          shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
-      hypothesisBuffer_.fresh_[i].end =
-          shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd;
+  std::vector<Word> committed;
+
+  // Step 1: examine all previously saved EOS points.
+  // The idea is to remove entries which have changed or no longer exist
+  // due to model correcting its output.
+  for (size_t i = 0; i < eos_.size(); i++) {
+    const auto &eos = eos_[i];
+    if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
+        (eos.position > 0 &&
+         eos.preceeding != words[eos.position - 1].content)) {
+      eos_.erase(eos_.begin() + i, eos_.end());
+      break;
     }
   }
 
-  auto committed = hypothesisBuffer_.commit();
-  auto nonCommitted = hypothesisBuffer_.hypothesis_;
-
-  // We want to save the most recent end of sentence word
-  // to improve the audio cutting mechanism.
-  for (const auto &word : committed) {
-    if (!word.punctations.empty()) {
-      lastSentenceEnd_ = word.end;
+  // Step 2: check if the newest EOS character from transcript should be
+  // saved to eos_ vector.
+  auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos);
+  if (lastEosIt != words.rend()) {
+    size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1;
+
+    // Because of step 1, we know that if the last EOS exists in eos_,
+    // then it must be the last entry.
+    if (eos_.empty() || eos_.back().position != lastEosIndex) {
+      // Register last EOS entry
+      std::string preceeding =
+          lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
+      eos_.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
     }
   }
 
-  // Since Whisper does not accept waveforms longer than 30 seconds, we need
-  // to cut the audio at some safe point.
-  {
+  // Step 3: clear the buffer if it is getting too large.
+  // The idea is to use the saved EOS entries and try to cut the buffer
+  // in a 'good' spot - where it will remove a significant audio chunk, yet
+  // won't affect most recent, unfinished speech samples.
+  size_t bufferSize = audioBuffer_.size();
+  if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
+                                       constants::kSamplingRate)) {
+    // Setup the lock for the entire cleanup section.
     std::scoped_lock<std::mutex> lock(audioBufferMutex_);
 
-    const float audioDuration =
-        static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
-    if (audioDuration > params::kStreamChunkThreshold) {
-      // Leave some portion of audio in, to improve model behavior
-      // in future iterations.
-      const float erasePoint =
-          hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
-              ? audioDuration
-              : std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
-      const float minEraseDuration =
-          audioDuration - params::kStreamAudioBufferMaxReserve;
-      const float maxEraseDuration =
-          audioDuration - params::kStreamAudioBufferMinReserve;
-      const float eraseDuration = std::clamp(
-          erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
-      const size_t nSamplesToErase =
-          static_cast<size_t>(eraseDuration * constants::kSamplingRate);
-
-      audioBuffer_.erase(audioBuffer_.begin(),
-                         audioBuffer_.begin() + nSamplesToErase);
-      bufferTimeOffset_ += eraseDuration;
+    const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
+
+    // No EOS entries most likely means no speech was recorded: drop everything
+    // except the trailing safety margin (threshold is in seconds, not samples).
+    if (eos_.empty()) {
+      const size_t cut =
+          bufferSize - static_cast<size_t>(params::kStreamSafetyThreshold *
+                                           constants::kSamplingRate);
+      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+    }
+
+    // If we have exactly one (most recent) EOS entry in the eos_, then
+    // we need to be more careful.
+    // Normally we want to keep at least one sentence in, but if the sentence
+    // covers a significant amount of buffer, we have no choice.
+    else if (eos_.size() == 1) {
+      const float eosTimestamp = eos_[0].tmstpend;
+
+      const float upperHalfDuration =
+          std::max(0.0F, eosTimestamp - midBufferThreshold);
+      const float wordsPerSecond =
+          upperHalfDuration > 0.1F
+              ? static_cast<float>(words.size()) / upperHalfDuration
+              : 0.0F;
+
+      // The EOS sits early enough that cutting up to the safety margin won't
+      // touch the ongoing (post-EOS) speech.
+      const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
+                                              params::kStreamSafetyThreshold;
+
+      if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+        // EOS lies past the midpoint, but a low word density implies the spoken
+        // audio is concentrated in the upper half. Drop the lower half and
+        // shift the EOS accordingly.
+        audioBuffer_.erase(audioBuffer_.begin(),
+                           audioBuffer_.begin() +
+                               static_cast<size_t>(midBufferThreshold *
+                                                   constants::kSamplingRate));
+        eos_[0].tmstpend -= midBufferThreshold;
+      } else {
+        // Cut everything up to and including the sentence — either by the
+        // safety margin (when EOS is early) or (more aggressively) right at the
+        // EOS boundary — and commit its words.
+        const size_t cut =
+            eosSafe
+                ? bufferSize -
+                      static_cast<size_t>(params::kStreamSafetyThreshold *
+                                          constants::kSamplingRate)
+                : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
+
+        audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+        committed.insert(committed.end(),
+                         std::make_move_iterator(words.begin()),
+                         std::make_move_iterator(words.end()));
+
+        words.clear();
+        eos_.clear();
+      }
+    }
+
+    // In case of 2 or more sentences, we generally want to keep the last one
+    // intact. This would provide a bit of stability to the algorithm.
+    else {
+      // Copy the needed fields by value: a reference into eos_ would be
+      // invalidated by the erase below.
+      const auto cutTimestamp = eos_[eos_.size() - 2].tmstpend;
+      const size_t lastCommittedPos = eos_[eos_.size() - 2].position;
+      const size_t cut =
+          static_cast<size_t>(cutTimestamp * constants::kSamplingRate);
+
+      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+      // Move all words up to the last committed position (inclusive) to the
+      // committed buffer.
+      committed.insert(
+          committed.end(), std::make_move_iterator(words.begin()),
+          std::make_move_iterator(words.begin() + lastCommittedPos + 1));
+      words.erase(words.begin(), words.begin() + lastCommittedPos + 1);
+
+      // Retain only the most recent EOS entry and rebase its timestamp.
+      eos_.erase(eos_.begin(), eos_.end() - 1);
+      eos_[0].tmstpend -= cutTimestamp;
     }
   }
 
-  return {.committed = move_to_vector(committed),
-          .nonCommitted = move_to_vector(nonCommitted)};
+  // Return the results
+  // Note that uncommitted part represented by recent transcription (words)
+  // is already shrunk if something has been committed during the cleanup
+  // phase.
+  return {.committed = std::move(committed), .nonCommitted = std::move(words)};
 }
 
-std::vector<Word> OnlineASR::finish() {
-  // We always push the last remaining hypothesis, even if it's not
-  // confirmed in second iteration, to avoid ending up with broken sentences.
-  std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
+std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
+  ProcessResult result = process(options);
+
+  // Last-tick committed delta + whatever never made it past the commit
+  // threshold.
+  std::vector<Word> residual = std::move(result.committed);
+  residual.insert(residual.end(),
+                  std::make_move_iterator(result.nonCommitted.begin()),
+                  std::make_move_iterator(result.nonCommitted.end()));
 
-  return move_to_vector(remaining);
+  reset();
+
+  return residual;
 }
 
 void OnlineASR::reset() {
   std::scoped_lock<std::mutex> lock(audioBufferMutex_);
-
-  hypothesisBuffer_.reset();
-  bufferTimeOffset_ = 0.f;
-
   audioBuffer_.clear();
+
+  eos_.clear();
 }
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
index df6d469e39..0c0b65f40e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -1,13 +1,13 @@
 #pragma once
 
+#include <mutex>
+#include <span>
+#include <vector>
+
 #include "../common/schema/OnlineASR.h"
 #include "../common/types/ProcessResult.h"
-#include "../common/types/Segment.h"
 #include "../common/types/Word.h"
 #include "ASR.h"
-#include "HypothesisBuffer.h"
-
-#include <mutex>
 
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
@@ -21,40 +21,32 @@ class OnlineASR : public schema::OnlineASR {
   OnlineASR(const ASR *asr);
 
   /**
-   * Appends new audio samples to the internal processing buffer.
-   *
-   * @param audio A span of PCM float samples (expected 16kHz).
+   * Checks if the buffer contains enough audio for the next processing step.
+   * @return True if ready, false otherwise.
    */
-  void insertAudioChunk(std::span<const float> audio) override;
+  bool isReady() const override;
 
   /**
-   * Determines whether the model is ready to process the next iteration.
-   *
-   * @return True if audioBuffer has enough samples, False otherwise
+   * Appends audio samples to the internal buffer.
+   * @param audio Span containing the audio data.
    */
-  bool isReady() const override;
+  void insertAudioChunk(std::span<const float> audio) override;
 
   /**
-   * Processes the current audio buffer and returns new transcription results.
-   * Stability is managed by an internal HypothesisBuffer to ensure that
-   * only confirmed (stable) text is returned as "committed".
-   *
-   * @param options Decoding configuration (language, etc.).
-   * @return        A ProcessResult containing newly committed and uncommitted
-   * words.
+   * Processes the current buffered audio and returns transcription results.
+   * @param options Decoding options for the transcription.
+   * @return Transcription result containing committed and volatile tokens.
    */
   ProcessResult process(const DecodingOptions &options) override;
 
   /**
-   * Finalizes the current streaming session.
-   * Flushes any remaining words from the hypothesis buffer.
-   *
-   * @return A vector of remaining transcribed words.
+   * Runs a final process() pass with the given options, then resets the state.
+   * @return Last committed words followed by the still-uncommitted words.
    */
-  std::vector<Word> finish() override;
+  std::vector<Word> finish(const DecodingOptions &options) override;
 
   /**
-   * Reset the streaming state by resetting the buffers
+   * Resets the internal state and clears buffers.
    */
   void reset() override;
 
@@ -62,19 +54,20 @@ class OnlineASR : public schema::OnlineASR {
   // ASR module connection for transcribing the audio
   const ASR *asr_;
 
-  // Helper buffers - audio buffer
-  // Stores the increasing amounts of streamed audio.
-  // Cleared from time to time after reaching a threshold size.
+  // Audio buffer (input) - accumulates obtained audio samples.
   std::vector<float> audioBuffer_ = {};
   mutable std::mutex audioBufferMutex_;
-  float bufferTimeOffset_ = 0.F; // Audio buffer offset
-
-  // Helper buffers - hypothesis buffer
-  // Manages the whisper streaming hypothesis mechanism.
-  HypothesisBuffer hypothesisBuffer_;
 
-  // State members to keep track of specyfic aspects of buffer state
-  float lastSentenceEnd_ = 0.F;
+  // State management helper: describes a single end-of-sentence (EOS) entry.
+  struct EOSEntry {
+    size_t position; // An absolute position (index) in the transcription (word
+                     // sequence).
+    std::string preceeding; // The preceding word in the transcription.
+    float tmstpend;         // Ending timestamp of the sentence.
+  };
+  // Stores the EOS entries saved for the most recent transcription
+  // and allows clearing the buffer in a smart, non-invasive way.
+  std::vector<EOSEntry> eos_;
 };
 
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index 5eb74c06cc..a99067f411 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -1,6 +1,9 @@
 #pragma once
 
+#include "Constants.h"
+
 #include <cinttypes>
+#include <cstdlib>
 
 /**
  * Hyperparameters
@@ -11,90 +14,50 @@
 namespace rnexecutorch::models::speech_to_text::whisper::params {
 
 /**
- * Determines the range of buffer left when skipping an audio chunk
- * of size lower than maximum allowed chunk size.
- *
- * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
- * then instead of moving to the last returned timestamp, we jump across the
- * entire 30 seconds chunk. This resolves the issue of multiple redundant
- * segments being produced by the transcription algorithm.
+ * Maximum duration of audio that the streaming buffer keeps before forcing
+ * a cleanup. Aligned with Whisper's maximum supported input length.
  */
-constexpr static int32_t kChunkBreakBuffer = 2; // [s]
+constexpr inline float kStreamMaxDuration =
+    static_cast<float>(constants::kChunkSize);
 
 /**
- * Determines the maximum timestamp difference available for a word to be
- * considered as fresh in streaming algorithm.
+ * The minimum amount of recent audio always kept in the buffer when a blind
+ * cut is performed. Acts as the lower bound on what survives a cleanup.
  */
-constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
+constexpr inline float kStreamSafetyThreshold = 2.F; // [s]
 
 /**
- * The size of the most recent committed suffix searched in
- * fresh words string.
- *
- * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty"
- * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
- * ["very" "nasty" "thing."] suffix.
+ * Forced-cleanup threshold. Once the buffer grows past this duration we run
+ * the EOS-anchored cleanup routine.
  */
-constexpr static size_t kStreamCommitedSuffixSearchSize = 5;
+constexpr inline float kStreamSafeBufferDuration =
+    kStreamMaxDuration - kStreamSafetyThreshold; // [s]
 
 /**
- * Determines the maximum expected size of overlapping fragments between
- * fresh words buffer and commited words buffer in streaming mode.
- *
- * It is a limit of maximum amount of erased repeated words from fresh buffer.
- * The bigger it gets, the less probable it is to commit the same phrase twice.
+ * An estimate of the number of words spoken per second.
+ * Used for estimating transcription progress and buffer management heuristics.
  */
-constexpr static size_t kStreamMaxOverlapSize =
-    12; // Number of overlaping words
+constexpr inline float kWordsPerSecondEstimation = 2.25F;
 
 /**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the first, more strict threshold, used when searching for recently
- * committed entries.
+ * Upper bound for words per second estimate in fast speech.
  */
-constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s]
+constexpr inline float kWordsPerSecondHigh = 4.F;
 
 /**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the second, more liberal threshold, used in overlap correction
- * algorithm.
+ * Lower bound for words per second estimate in slow speech.
  */
-constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s]
+constexpr inline float kWordsPerSecondLow = 1.5F;
 
 /**
- * Number of words per 1 allowed mistake (error correction).
+ * Determines the range of buffer left when skipping an audio chunk
+ * of size lower than maximum allowed chunk size.
  *
- * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake
- * in a 4 word string.
- */
-constexpr static size_t kStreamWordsPerErrorRate = 5;
-
-/**
- * A threshold which exceeded causes the main streaming audio buffer to be
- * cleared.
- */
-constexpr static float kStreamChunkThreshold = 20.F; // [s]
-
-/**
- * Decides how much of recent audio waveform is always kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s]
-
-/**
- * Decides how much of recent audio waveform can be kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s]
-
-/**
- * An estimate of number of words per second produced in a standard
- * human conversation speech.
+ * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
+ * then instead of moving to the last returned timestamp, we jump across the
+ * entire 30 seconds chunk. This resolves the issue of multiple redundant
+ * segments being produced by the transcription algorithm.
  */
-constexpr static float kStreamWordsPerSecond = 2.5F;
+constexpr inline int32_t kChunkBreakBuffer = 2; // [s]
 
-} // namespace rnexecutorch::models::speech_to_text::whisper::params
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::params
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
index 2e4e3b5076..48c84a84b7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "../common/types/Word.h"
+#include "Constants.h"
 #include <algorithm>
 #include <cmath>
 #include <span>
@@ -8,70 +9,14 @@
 
 namespace rnexecutorch::models::speech_to_text::whisper::utils {
 
-// Compares two strings without case-sensitivity.
-inline bool equalsIgnoreCase(const std::string &a, const std::string &b) {
-  if (a.size() != b.size()) {
-    return false;
-  }
-  return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) {
-    return std::tolower(static_cast<unsigned char>(c1)) ==
-           std::tolower(static_cast<unsigned char>(c2));
-  });
-}
-
 /**
- * Finds the largest (in number of words) overlaping fragment between word
- * vectors A (suffix) and B (prefix).
+ * Checks if the word is a single End-of-Sentence (EOS) punctuation character.
  *
- * An overlaping fragment is any fragment C, which can be simultaneously a
- * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing
- * games', B = 'playing games and sleeping', the overlap fragment C = 'playing
- * games'.
- *
- * @param suffixVec An input vector, where only suffixes can overlap.
- *                  Typically the 'commited' buffer in streaming algorithm.
- * @param preffixVec An input vector, where only prefixes can overlap.
- *                   Typically the 'fresh' buffer in streaming algorithm.
- * @param maxCheckRange The maximum size of overlapping fragment. Determines the
- * range of search.
- * @param maxTimestampDiff The maximum allowed timestamp difference between
- * overlaping fragments. If exceeded, the fragment are not considered as
- * overlaping.
- * @return The size of the largest found overlaping fragment.
+ * @param word The word to check.
  */
-template <typename Container>
-inline size_t findLargestOverlapingFragment(const Container &suffixVec,
-                                            const Container &prefixVec,
-                                            size_t maxCheckRange = 10,
-                                            float maxTimestampDiff = 100.f) {
-  size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange});
-
-  if (range == 0) {
-    return 0;
-  }
-
-  // i starts at the index where the suffix of length 'range' begins.
-  for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) {
-    // We search for overlaps by searching for the first word of prefixVec
-    if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) {
-      size_t calculatedSize = suffixVec.size() - i;
-
-      bool isEqual =
-          std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
-                     [maxTimestampDiff](const Word &sWord, const Word &pWord) {
-                       return equalsIgnoreCase(sWord.content, pWord.content) &&
-                              std::max(std::fabs(sWord.start - pWord.start),
-                                       std::fabs(sWord.end - pWord.end)) <=
-                                  maxTimestampDiff;
-                     });
-
-      if (isEqual) {
-        return calculatedSize;
-      }
-    }
-  }
-
-  return 0;
+constexpr inline bool isEos(const Word &word) {
+  return word.content.size() == 1 &&
+         constants::kEosPunctations.contains(word.content[0]);
 }
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::utils
\ No newline at end of file

From 92b3f29baa286d99907c529cc3366c90bfec0a4c Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Thu, 7 May 2026 12:39:02 +0200
Subject: [PATCH 02/20] Revert back to 100ms refresh rate

---
 apps/speech/screens/SpeechToTextScreen.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index dfd39c15b4..4e5e19ae48 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -148,7 +148,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     recorder.current.onAudioReady(
       {
         sampleRate,
-        bufferLength: 0.1 * sampleRate,
+        bufferLength: 0.1 * sampleRate, // 100 ms
         channelCount: 1,
       },
       ({ buffer }) => {

From 35290db7403af508a45ac511212b197a8211f2e3 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Thu, 7 May 2026 17:02:21 +0200
Subject: [PATCH 03/20] Add CoreML whisper models

---
 apps/llm/app/index.tsx                        |   6 -
 apps/llm/app/voice_chat/index.tsx             | 311 ------------------
 apps/speech/screens/SpeechToTextScreen.tsx    |  19 +-
 .../models/speech_to_text/whisper/ASR.cpp     |  25 +-
 .../src/constants/modelUrls.ts                |  78 ++---
 yarn.lock                                     |  14 +
 6 files changed, 76 insertions(+), 377 deletions(-)
 delete mode 100644 apps/llm/app/voice_chat/index.tsx

diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 72358ae72c..b67b3fa7ce 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -29,12 +29,6 @@ export default function Home() {
         >
           <Text style={styles.buttonText}>LLM Structured Output</Text>
         </TouchableOpacity>
-        <TouchableOpacity
-          style={styles.button}
-          onPress={() => router.navigate('voice_chat/')}
-        >
-          <Text style={styles.buttonText}>Voice Chat</Text>
-        </TouchableOpacity>
         <TouchableOpacity
           style={styles.button}
           onPress={() => router.navigate('multimodal_llm/')}
diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx
deleted file mode 100644
index 23ab70bff4..0000000000
--- a/apps/llm/app/voice_chat/index.tsx
+++ /dev/null
@@ -1,311 +0,0 @@
-import { useContext, useEffect, useState } from 'react';
-import {
-  Keyboard,
-  KeyboardAvoidingView,
-  Platform,
-  StyleSheet,
-  Text,
-  TouchableOpacity,
-  TouchableWithoutFeedback,
-  View,
-} from 'react-native';
-import SWMIcon from '../../assets/icons/swm_icon.svg';
-import Spinner from '../../components/Spinner';
-import ErrorBanner from '../../components/ErrorBanner';
-import {
-  useSpeechToText,
-  useLLM,
-  QWEN3_0_6B_QUANTIZED,
-  QWEN3_1_7B_QUANTIZED,
-  LLAMA3_2_1B_SPINQUANT,
-  WHISPER_TINY_EN,
-  WHISPER_TINY_EN_QUANTIZED,
-  WHISPER_BASE_EN,
-  WHISPER_SMALL_EN,
-  LLMProps,
-  SpeechToTextProps,
-} from 'react-native-executorch';
-import { ModelPicker, ModelOption } from '../../components/ModelPicker';
-import PauseIcon from '../../assets/icons/pause_icon.svg';
-import MicIcon from '../../assets/icons/mic_icon.svg';
-import StopIcon from '../../assets/icons/stop_icon.svg';
-import ColorPalette from '../../colors';
-import Messages from '../../components/Messages';
-import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-import DeviceInfo from 'react-native-device-info';
-import { useIsFocused } from '@react-navigation/native';
-import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { GeneratingContext } from '../../context';
-
-type LLMModelSources = LLMProps['model'];
-type STTModelSources = SpeechToTextProps['model'];
-
-const LLM_MODELS: ModelOption<LLMModelSources>[] = [
-  { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED },
-  { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED },
-  { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT },
-];
-
-const STT_MODELS: ModelOption<STTModelSources>[] = [
-  { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
-  { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
-  { label: 'Whisper Base', value: WHISPER_BASE_EN },
-  { label: 'Whisper Small', value: WHISPER_SMALL_EN },
-];
-
-export default function VoiceChatScreenWrapper() {
-  const isFocused = useIsFocused();
-
-  return isFocused ? <VoiceChatScreen /> : null;
-}
-
-function VoiceChatScreen() {
-  const { bottom } = useSafeAreaInsets();
-  const [isRecording, setIsRecording] = useState(false);
-  const [liveTranscription, setLiveTranscription] = useState('');
-  const [selectedLLM, setSelectedLLM] =
-    useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
-  const [selectedSTT, setSelectedSTT] =
-    useState<STTModelSources>(WHISPER_TINY_EN);
-  const [error, setError] = useState<string | null>(null);
-
-  const [recorder] = useState(() => new AudioRecorder());
-
-  const { setGlobalGenerating } = useContext(GeneratingContext);
-
-  const llm = useLLM({ model: selectedLLM });
-  const speechToText = useSpeechToText({
-    model: selectedSTT,
-  });
-
-  useEffect(() => {
-    setGlobalGenerating(llm.isGenerating || speechToText.isGenerating);
-  }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]);
-
-  useEffect(() => {
-    AudioManager.setAudioSessionOptions({
-      iosCategory: 'playAndRecord',
-      iosMode: 'spokenAudio',
-      iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
-    });
-    AudioManager.requestRecordingPermissions();
-  }, []);
-
-  const handleRecordPress = async () => {
-    if (isRecording) {
-      setIsRecording(false);
-      recorder.stop();
-      speechToText.streamStop();
-    } else {
-      setIsRecording(true);
-      setLiveTranscription('');
-
-      const sampleRate = 16000;
-      recorder.onAudioReady(
-        {
-          sampleRate,
-          bufferLength: 0.1 * sampleRate,
-          channelCount: 1,
-        },
-        ({ buffer }) => {
-          speechToText.streamInsert(buffer.getChannelData(0));
-        }
-      );
-      recorder.start();
-
-      let finalResult = '';
-
-      try {
-        for await (const result of speechToText.stream()) {
-          const text = result.committed.text + result.nonCommitted.text;
-          setLiveTranscription(text);
-          finalResult = text;
-        }
-      } catch (e) {
-        setError(e instanceof Error ? e.message : String(e));
-      } finally {
-        if (finalResult.trim().length > 0) {
-          await llm.sendMessage(finalResult);
-          setLiveTranscription('');
-        }
-      }
-    }
-  };
-
-  useEffect(() => {
-    if (llm.error) setError(String(llm.error));
-  }, [llm.error]);
-
-  useEffect(() => {
-    if (speechToText.error) setError(String(speechToText.error));
-  }, [speechToText.error]);
-
-  return (!llm.isReady || !speechToText.isReady) &&
-    !llm.error &&
-    !speechToText.error ? (
-    <Spinner
-      visible={true}
-      textContent={`Loading the LLM model ${(llm.downloadProgress * 100).toFixed(0)} %\nLoading the speech model ${(speechToText.downloadProgress * 100).toFixed(0)} %`}
-    />
-  ) : (
-    <TouchableWithoutFeedback onPress={Keyboard.dismiss}>
-      <KeyboardAvoidingView
-        style={styles.keyboardAvoidingView}
-        behavior={Platform.OS === 'ios' ? 'padding' : 'height'}
-        keyboardVerticalOffset={Platform.OS === 'android' ? 30 : 0}
-      >
-        <View style={styles.topContainer}>
-          <SWMIcon width={45} height={45} />
-          <Text style={styles.textModelName}>Qwen 3 x Whisper</Text>
-        </View>
-        <ErrorBanner message={error} onDismiss={() => setError(null)} />
-        {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? (
-          <View style={styles.chatContainer}>
-            <Messages
-              chatHistory={
-                isRecording && liveTranscription.length > 0
-                  ? [
-                      ...llm.messageHistory,
-                      {
-                        role: 'user',
-                        content: liveTranscription,
-                      },
-                    ]
-                  : llm.messageHistory
-              }
-              llmResponse={llm.response}
-              isGenerating={llm.isGenerating}
-              deleteMessage={llm.deleteMessage}
-            />
-          </View>
-        ) : (
-          <View style={styles.helloMessageContainer}>
-            <Text style={styles.helloText}>Hello! 👋</Text>
-            <Text style={styles.bottomHelloText}>
-              Tap the mic and speak to me. I'll transcribe your voice and
-              respond using a language model — all on-device.
-            </Text>
-          </View>
-        )}
-
-        <ModelPicker
-          models={LLM_MODELS}
-          selectedModel={selectedLLM}
-          onSelect={(m) => setSelectedLLM(m)}
-        />
-        <ModelPicker
-          models={STT_MODELS}
-          selectedModel={selectedSTT}
-          onSelect={(m) => setSelectedSTT(m)}
-        />
-
-        <View
-          style={[
-            styles.bottomContainer,
-            Platform.OS === 'android' && {
-              paddingBottom: bottom || 16,
-              height: 100 + (bottom || 16),
-            },
-          ]}
-        >
-          {DeviceInfo.isEmulatorSync() ? (
-            <View style={styles.emulatorBox}>
-              <Text style={[styles.emulatorWarning]}>
-                recording disabled on emulator
-              </Text>
-            </View>
-          ) : (
-            <>
-              {llm.isGenerating ? (
-                <TouchableOpacity onPress={llm.interrupt}>
-                  <PauseIcon height={40} width={40} padding={4} margin={8} />
-                </TouchableOpacity>
-              ) : (
-                <TouchableOpacity
-                  style={
-                    !isRecording ? styles.recordTouchable : styles.recordingInfo
-                  }
-                  onPress={handleRecordPress}
-                >
-                  {isRecording ? (
-                    <StopIcon height={40} width={40} padding={4} margin={8} />
-                  ) : (
-                    <MicIcon height={40} width={40} padding={4} margin={8} />
-                  )}
-                </TouchableOpacity>
-              )}
-            </>
-          )}
-        </View>
-      </KeyboardAvoidingView>
-    </TouchableWithoutFeedback>
-  );
-}
-
-const styles = StyleSheet.create({
-  keyboardAvoidingView: {
-    flex: 1,
-  },
-  topContainer: {
-    height: 68,
-    width: '100%',
-    alignItems: 'center',
-    justifyContent: 'center',
-  },
-  chatContainer: {
-    flex: 10,
-    width: '100%',
-  },
-  textModelName: {
-    color: ColorPalette.primary,
-  },
-  helloMessageContainer: {
-    flex: 10,
-    width: '100%',
-    alignItems: 'center',
-    justifyContent: 'center',
-  },
-  helloText: {
-    fontFamily: 'medium',
-    fontSize: 30,
-    color: ColorPalette.primary,
-  },
-  bottomHelloText: {
-    fontFamily: 'regular',
-    fontSize: 20,
-    lineHeight: 28,
-    textAlign: 'center',
-    color: ColorPalette.primary,
-  },
-  bottomContainer: {
-    height: 100,
-    width: '100%',
-    justifyContent: 'center',
-    alignItems: 'center',
-    paddingHorizontal: 16,
-  },
-  recordTouchable: {
-    height: '100%',
-    justifyContent: 'center',
-    alignItems: 'center',
-  },
-  recordingInfo: {
-    width: '100%',
-    display: 'flex',
-    justifyContent: 'center',
-    alignItems: 'center',
-  },
-  emulatorBox: {
-    padding: 10,
-    margin: 10,
-    borderWidth: 1,
-    borderRadius: 8,
-    borderColor: 'gray',
-    justifyContent: 'center',
-    alignItems: 'center',
-  },
-  emulatorWarning: {
-    color: 'gray',
-    fontSize: 16,
-  },
-});
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 4e5e19ae48..1c8859d224 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -13,9 +13,10 @@ import {
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
   useSpeechToText,
-  WHISPER_TINY_EN,
-  WHISPER_TINY_EN_QUANTIZED,
-  WHISPER_BASE_EN,
+  WHISPER_TINY_EN_XNNPACK,
+  WHISPER_TINY_EN_COREML,
+  WHISPER_BASE_EN_XNNPACK,
+  WHISPER_BASE_EN_COREML,
   WHISPER_SMALL_EN,
   TranscriptionResult,
   SpeechToTextProps,
@@ -25,9 +26,10 @@ import { ModelPicker, ModelOption } from '../components/ModelPicker';
 type STTModelSources = SpeechToTextProps['model'];
 
 const MODELS: ModelOption<STTModelSources>[] = [
-  { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
-  { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
-  { label: 'Whisper Base', value: WHISPER_BASE_EN },
+  { label: 'Whisper Tiny', value: WHISPER_TINY_EN_XNNPACK },
+  { label: 'Whisper Tiny CoreML', value: WHISPER_TINY_EN_COREML },
+  { label: 'Whisper Base', value: WHISPER_BASE_EN_XNNPACK },
+  { label: 'Whisper Base CoreML', value: WHISPER_BASE_EN_COREML },
   { label: 'Whisper Small', value: WHISPER_SMALL_EN },
 ];
 import FontAwesome from '@expo/vector-icons/FontAwesome';
@@ -46,8 +48,9 @@ import ErrorBanner from '../components/ErrorBanner';
 const isSimulator = DeviceInfo.isEmulatorSync();
 
 export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
-  const [selectedModel, setSelectedModel] =
-    useState<STTModelSources>(WHISPER_TINY_EN);
+  const [selectedModel, setSelectedModel] = useState<STTModelSources>(
+    WHISPER_TINY_EN_XNNPACK
+  );
 
   const model = useSpeechToText({
     model: selectedModel,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index b0d08e419b..d2555a79fa 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -263,11 +263,21 @@ ASR::generate(std::span<const float> waveform, const DecodingOptions &options,
   std::vector<float> scores;
 
   uint64_t startPos = 0;
-  while (std::cmp_less_equal(startPos + sequenceIds.size(),
-                             constants::kMaxDecodeLength)) {
-    executorch::aten::Tensor logitsTensor =
-        this->decode(sequenceIds, encoderFeatures, startPos);
 
+  // Prefill: feed each initial token individually so decode() always sees 1
+  // token
+  std::span<uint64_t> firstToken(sequenceIds.data(), 1);
+  executorch::aten::Tensor logitsTensor =
+      this->decode(firstToken, encoderFeatures, startPos);
+  ++startPos;
+  for (size_t i = 1; i < sequenceIds.size(); ++i) {
+    std::span<uint64_t> single(sequenceIds.data() + i, 1);
+    logitsTensor = this->decode(single, encoderFeatures, startPos);
+    ++startPos;
+  }
+
+  // Autoregressive decoding: always 1 token at a time
+  while (std::cmp_less(startPos, constants::kMaxDecodeLength)) {
     const size_t logitsInnerDim = logitsTensor.size(1);
     const size_t logitsDictSize = logitsTensor.size(2);
     const float *logitsData = logitsTensor.const_data_ptr<float>() +
@@ -303,15 +313,16 @@ ASR::generate(std::span<const float> waveform, const DecodingOptions &options,
       nextProb = probs[nextId];
     }
 
-    // Move the startPos pointer by the amount of tokens we processed
-    startPos += sequenceIds.size();
-    sequenceIds = {nextId};
     cachedTokens.push_back(nextId);
     scores.push_back(nextProb);
 
     if (nextId == endOfTranscriptionToken_) {
       break;
     }
+
+    std::span<uint64_t> single(&cachedTokens.back(), 1);
+    logitsTensor = this->decode(single, encoderFeatures, startPos);
+    ++startPos;
   }
 
   return {.tokens = std::vector<uint64_t>(cachedTokens.cbegin() +
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 159396add8..0fdf1bad54 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -773,23 +773,17 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 } as const;
 
 // S2T
-const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
+const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`;
+const WHISPER_TINY_EN_MODEL_VULKAN = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/vulkan/whisper_tiny_en_vulkan.pte`;
 
-const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`;
+const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`;
 
-const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
-
-const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`;
-
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
-
-const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
 
 const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
 const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
@@ -803,41 +797,45 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_TINY_EN = {
+export const WHISPER_TINY_EN_XNNPACK = {
   modelName: 'whisper-tiny-en',
   isMultilingual: false,
-  modelSource: WHISPER_TINY_EN_MODEL,
+  modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
   tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 } as const;
 
-/**
- * @category Models - Speech To Text
- */
-export const WHISPER_TINY_EN_QUANTIZED = {
-  modelName: 'whisper-tiny-en-quantized',
+export const WHISPER_TINY_EN_COREML = {
+  modelName: 'whisper-tiny-en',
   isMultilingual: false,
-  modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER,
+  modelSource: WHISPER_TINY_EN_MODEL_COREML,
+  tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
+} as const;
+
+export const WHISPER_TINY_EN_VULKAN = {
+  modelName: 'whisper-tiny-en',
+  isMultilingual: false,
+  modelSource: WHISPER_TINY_EN_MODEL_VULKAN,
+  tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 } as const;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_BASE_EN = {
+export const WHISPER_BASE_EN_XNNPACK = {
   modelName: 'whisper-base-en',
   isMultilingual: false,
-  modelSource: WHISPER_BASE_EN_MODEL,
+  modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
   tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
 } as const;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_BASE_EN_QUANTIZED = {
-  modelName: 'whisper-base-en-quantized',
+export const WHISPER_BASE_EN_COREML = {
+  modelName: 'whisper-base-en',
   isMultilingual: false,
-  modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER,
+  modelSource: WHISPER_BASE_EN_MODEL_COREML,
+  tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
 } as const;
 
 /**
@@ -850,16 +848,6 @@ export const WHISPER_SMALL_EN = {
   tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
 } as const;
 
-/**
- * @category Models - Speech To Text
- */
-export const WHISPER_SMALL_EN_QUANTIZED = {
-  modelName: 'whisper-small-en-quantized',
-  isMultilingual: false,
-  modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER,
-} as const;
-
 /**
  * @category Models - Speech To Text
  */
@@ -1350,12 +1338,12 @@ export const MODEL_REGISTRY = {
     STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED,
     STYLE_TRANSFER_UDNIE,
     STYLE_TRANSFER_UDNIE_QUANTIZED,
-    WHISPER_TINY_EN,
-    WHISPER_TINY_EN_QUANTIZED,
-    WHISPER_BASE_EN,
-    WHISPER_BASE_EN_QUANTIZED,
+    WHISPER_TINY_EN_XNNPACK,
+    WHISPER_TINY_EN_COREML,
+    WHISPER_TINY_EN_VULKAN,
+    WHISPER_BASE_EN_XNNPACK,
+    WHISPER_BASE_EN_COREML,
     WHISPER_SMALL_EN,
-    WHISPER_SMALL_EN_QUANTIZED,
     WHISPER_TINY,
     WHISPER_BASE,
     WHISPER_SMALL,
diff --git a/yarn.lock b/yarn.lock
index 7f335abe71..a4439a0f33 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15283,6 +15283,20 @@ __metadata:
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:0.11.5":
+  version: 0.11.5
+  resolution: "react-native-audio-api@npm:0.11.5"
+  dependencies:
+    semver: "npm:^7.7.3"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d
+  languageName: node
+  linkType: hard
+
 "react-native-audio-api@npm:0.12.0":
   version: 0.12.0
   resolution: "react-native-audio-api@npm:0.12.0"

From 7473f018c396abb8ed00df29a587747531dd4c4b Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Fri, 8 May 2026 09:53:01 +0200
Subject: [PATCH 04/20] Update model urls

---
 apps/speech/screens/SpeechToTextScreen.tsx    | 21 ++---
 .../src/constants/modelUrls.ts                | 89 +++++++++++++------
 2 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 1c8859d224..e7ee776034 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -13,24 +13,26 @@ import {
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
   useSpeechToText,
-  WHISPER_TINY_EN_XNNPACK,
+  WHISPER_TINY_EN,
   WHISPER_TINY_EN_COREML,
-  WHISPER_BASE_EN_XNNPACK,
+  WHISPER_BASE_EN,
   WHISPER_BASE_EN_COREML,
   WHISPER_SMALL_EN,
   TranscriptionResult,
   SpeechToTextProps,
+  WHISPER_SMALL_EN_COREML,
 } from 'react-native-executorch';
 import { ModelPicker, ModelOption } from '../components/ModelPicker';
 
 type STTModelSources = SpeechToTextProps['model'];
 
 const MODELS: ModelOption<STTModelSources>[] = [
-  { label: 'Whisper Tiny', value: WHISPER_TINY_EN_XNNPACK },
-  { label: 'Whisper Tiny CoreML', value: WHISPER_TINY_EN_COREML },
-  { label: 'Whisper Base', value: WHISPER_BASE_EN_XNNPACK },
-  { label: 'Whisper Base CoreML', value: WHISPER_BASE_EN_COREML },
-  { label: 'Whisper Small', value: WHISPER_SMALL_EN },
+  { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN },
+  { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML },
+  { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN },
+  { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML },
+  { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN },
+  { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML },
 ];
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
@@ -48,9 +50,8 @@ import ErrorBanner from '../components/ErrorBanner';
 const isSimulator = DeviceInfo.isEmulatorSync();
 
 export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
-  const [selectedModel, setSelectedModel] = useState<STTModelSources>(
-    WHISPER_TINY_EN_XNNPACK
-  );
+  const [selectedModel, setSelectedModel] =
+    useState<STTModelSources>(WHISPER_TINY_EN);
 
   const model = useSpeechToText({
     model: selectedModel,
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 0fdf1bad54..a87117d863 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -776,28 +776,31 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
 const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
 const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`;
-const WHISPER_TINY_EN_MODEL_VULKAN = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/vulkan/whisper_tiny_en_vulkan.pte`;
 
 const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
 const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
 const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`;
 
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml.pte`;
 
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
+const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml.pte`;
 
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml.pte`;
 
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${PREVIOUS_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml.pte`;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_TINY_EN_XNNPACK = {
+export const WHISPER_TINY_EN = {
   modelName: 'whisper-tiny-en',
   isMultilingual: false,
   modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
@@ -811,17 +814,10 @@ export const WHISPER_TINY_EN_COREML = {
   tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 } as const;
 
-export const WHISPER_TINY_EN_VULKAN = {
-  modelName: 'whisper-tiny-en',
-  isMultilingual: false,
-  modelSource: WHISPER_TINY_EN_MODEL_VULKAN,
-  tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
-} as const;
-
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_BASE_EN_XNNPACK = {
+export const WHISPER_BASE_EN = {
   modelName: 'whisper-base-en',
   isMultilingual: false,
   modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
@@ -844,7 +840,17 @@ export const WHISPER_BASE_EN_COREML = {
 export const WHISPER_SMALL_EN = {
   modelName: 'whisper-small-en',
   isMultilingual: false,
-  modelSource: WHISPER_SMALL_EN_MODEL,
+  modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_EN_COREML = {
+  modelName: 'whisper-small-en',
+  isMultilingual: false,
+  modelSource: WHISPER_SMALL_EN_MODEL_COREML,
   tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
 } as const;
 
@@ -854,7 +860,17 @@ export const WHISPER_SMALL_EN = {
 export const WHISPER_TINY = {
   modelName: 'whisper-tiny',
   isMultilingual: true,
-  modelSource: WHISPER_TINY_MODEL,
+  modelSource: WHISPER_TINY_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_TINY_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_COREML = {
+  modelName: 'whisper-tiny',
+  isMultilingual: true,
+  modelSource: WHISPER_TINY_MODEL_COREML,
   tokenizerSource: WHISPER_TINY_TOKENIZER,
 } as const;
 
@@ -864,7 +880,17 @@ export const WHISPER_TINY = {
 export const WHISPER_BASE = {
   modelName: 'whisper-base',
   isMultilingual: true,
-  modelSource: WHISPER_BASE_MODEL,
+  modelSource: WHISPER_BASE_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_BASE_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_BASE_COREML = {
+  modelName: 'whisper-base',
+  isMultilingual: true,
+  modelSource: WHISPER_BASE_MODEL_COREML,
   tokenizerSource: WHISPER_BASE_TOKENIZER,
 } as const;
 
@@ -874,7 +900,17 @@ export const WHISPER_BASE = {
 export const WHISPER_SMALL = {
   modelName: 'whisper-small',
   isMultilingual: true,
-  modelSource: WHISPER_SMALL_MODEL,
+  modelSource: WHISPER_SMALL_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_SMALL_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_COREML = {
+  modelName: 'whisper-small',
+  isMultilingual: true,
+  modelSource: WHISPER_SMALL_MODEL_COREML,
   tokenizerSource: WHISPER_SMALL_TOKENIZER,
 } as const;
 
@@ -1338,15 +1374,18 @@ export const MODEL_REGISTRY = {
     STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED,
     STYLE_TRANSFER_UDNIE,
     STYLE_TRANSFER_UDNIE_QUANTIZED,
-    WHISPER_TINY_EN_XNNPACK,
+    WHISPER_TINY_EN,
     WHISPER_TINY_EN_COREML,
-    WHISPER_TINY_EN_VULKAN,
-    WHISPER_BASE_EN_XNNPACK,
+    WHISPER_BASE_EN,
     WHISPER_BASE_EN_COREML,
     WHISPER_SMALL_EN,
+    WHISPER_SMALL_EN_COREML,
     WHISPER_TINY,
+    WHISPER_TINY_COREML,
     WHISPER_BASE,
+    WHISPER_BASE_COREML,
     WHISPER_SMALL,
+    WHISPER_SMALL_COREML,
     DEEPLAB_V3_RESNET50,
     DEEPLAB_V3_RESNET101,
     DEEPLAB_V3_MOBILENET_V3_LARGE,

From 9b90ea32901a9994b43132995d30c228aba83a2f Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Fri, 8 May 2026 10:12:56 +0200
Subject: [PATCH 05/20] Change default model for iOS devices

---
 apps/speech/screens/SpeechToTextScreen.tsx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index e7ee776034..94d5930fea 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -49,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner';
 
 const isSimulator = DeviceInfo.isEmulatorSync();
 
+const DEFAULT_MODEL =
+  Platform.OS === 'ios' ? WHISPER_BASE_EN_COREML : WHISPER_TINY_EN;
+
 export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const [selectedModel, setSelectedModel] =
-    useState<STTModelSources>(WHISPER_TINY_EN);
+    useState<STTModelSources>(DEFAULT_MODEL);
 
   const model = useSpeechToText({
     model: selectedModel,

From 9af81240dda1ef98701fc905ef5c5d3ce7343271 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Sun, 10 May 2026 10:42:06 +0200
Subject: [PATCH 06/20] Add explicit timeout parameter

---
 apps/speech/screens/SpeechToTextScreen.tsx    |  1 +
 .../models/speech_to_text/SpeechToText.cpp    |  5 ++--
 .../models/speech_to_text/SpeechToText.h      |  3 +-
 .../models/speech_to_text/whisper/Constants.h | 28 +++++++++----------
 .../models/speech_to_text/whisper/Params.h    |  2 +-
 .../useSpeechToText.ts                        |  3 +-
 .../SpeechToTextModule.ts                     |  7 +++--
 .../react-native-executorch/src/types/stt.ts  | 11 +++++++-
 8 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 94d5930fea..ad4f6505c8 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -185,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     try {
       const streamIter = model.stream({
         verbose: enableTimestamps,
+        timeout: 100,
       });
 
       for await (const { committed, nonCommitted } of streamIter) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 3133c0bb29..9537642d58 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
 } // namespace
 
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
-                          std::string languageOption, bool verbose) {
+                          std::string languageOption, bool verbose,
+                          uint32_t timeout) {
   if (isStreaming_) {
     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
                             "Streaming is already in progress!");
@@ -158,7 +159,7 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     // running transcriptions too rapidly (before the audio buffer is filled
     // with significant amount of new data) can cause streamer to commit wrong
     // phrases.
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
   }
 
   std::vector<Word> finalWords = streamer_->finish(options);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ade835869c..ec51862793 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -42,7 +42,8 @@ class SpeechToText {
 
   // Stream
   void stream(std::shared_ptr<jsi::Function> callback,
-              std::string languageOption, bool enableTimestamps);
+              std::string languageOption, bool enableTimestamps,
+              uint32_t timeout);
   void streamStop();
   void streamInsert(std::span<float> waveform);
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
index 30062a75ba..62a9f968f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
@@ -9,37 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants {
 // Maximum duration of each audio chunk to process (in seconds)
 // It is intentionally set to 29 since otherwise only the last chunk would be
 // correctly transcribe due to the model's positional encoding limit
-constexpr static size_t kChunkSize = 29;
+inline constexpr size_t kChunkSize = 29;
 
 // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
-constexpr static size_t kSamplingRate = 16000;
-constexpr static size_t kSamplesPerMilisecond = kSamplingRate / 1000;
+inline constexpr size_t kSamplingRate = 16000;
+inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000;
 
-constexpr static size_t kMaxSamples = kChunkSize * kSamplingRate;
+inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate;
 
 // The maximum number of tokens the decoder can generate per chunk
-constexpr static size_t kMaxDecodeLength = 128;
+inline constexpr size_t kMaxDecodeLength = 128;
 
 // Minimum allowed chunk length before processing (in audio samples)
-constexpr static size_t kMinChunkSamples = 1 * kSamplingRate;
+inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate;
 
 // Number of mel frames output by the encoder (derived from input spectrogram)
-constexpr static size_t kNumFrames = 1500;
+inline constexpr size_t kNumFrames = 1500;
 
 // Time precision used by Whisper timestamps: each token spans 0.02 seconds
-constexpr static float kTimePrecision = 0.02f;
+inline constexpr float kTimePrecision = 0.02f;
 
 // Special characters serving as pause / end of sentence
-static const std::unordered_set<char> kPunctations = {',', '.', '?',
+inline const std::unordered_set<char> kPunctations = {',', '.', '?',
                                                       '!', ':', ';'};
-static const std::unordered_set<char> kEosPunctations = {'.', '?', '!', ';'};
+inline const std::unordered_set<char> kEosPunctations = {'.', '?', '!', ';'};
 
 // Special token constants
 namespace tokens {
-static const std::string kStartOfTranscript = "<|startoftranscript|>";
-static const std::string kEndOfTranscript = "<|endoftext|>";
-static const std::string kBeginTimestamp = "<|0.00|>";
-static const std::string kBlankAudio = "[BLANK_AUDIO]";
+inline const std::string kStartOfTranscript = "<|startoftranscript|>";
+inline const std::string kEndOfTranscript = "<|endoftext|>";
+inline const std::string kBeginTimestamp = "<|0.00|>";
+inline const std::string kBlankAudio = "[BLANK_AUDIO]";
 } // namespace tokens
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::constants
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index a99067f411..847a22b1e0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -24,7 +24,7 @@ constexpr inline float kStreamMaxDuration =
  * The minimum amount of recent audio always kept in the buffer when a blind
  * cut is performed. Acts as the lower bound on what survives a cleanup.
  */
-constexpr inline float kStreamSafetyThreshold = 2.F; // [s]
+constexpr inline float kStreamSafetyThreshold = 3.F; // [s]
 
 /**
  * Forced-cleanup threshold. Once the buffer grows past this duration we run
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index c906851380..229bba73e3 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -5,6 +5,7 @@ import {
   SpeechToTextType,
   SpeechToTextProps,
   TranscriptionResult,
+  StreamingOptions,
 } from '../../types/stt';
 import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
 import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -101,7 +102,7 @@ export const useSpeechToText = ({
   );
 
   const stream = useCallback(
-    async function* (options: DecodingOptions = {}): AsyncGenerator<
+    async function* (options: StreamingOptions = {}): AsyncGenerator<
       {
         committed: TranscriptionResult;
         nonCommitted: TranscriptionResult;
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index 273264e0e2..3890c9ae50 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -2,6 +2,7 @@ import {
   DecodingOptions,
   SpeechToTextModelConfig,
   SpeechToTextModelName,
+  StreamingOptions,
   TranscriptionResult,
 } from '../../types/stt';
 import { ResourceFetcher } from '../../utils/ResourceFetcher';
@@ -174,7 +175,7 @@ export class SpeechToTextModule {
    * @yields An object containing `committed` and `nonCommitted` transcription results.
    * @returns An async generator yielding transcription updates.
    */
-  public async *stream(options: DecodingOptions = {}): AsyncGenerator<{
+  public async *stream(options: StreamingOptions = {}): AsyncGenerator<{
     committed: TranscriptionResult;
     nonCommitted: TranscriptionResult;
   }> {
@@ -182,6 +183,7 @@ export class SpeechToTextModule {
 
     const verbose = !!options.verbose;
     const language = options.language || '';
+    const timeout = options.timeout || 100;
 
     const queue: {
       committed: TranscriptionResult;
@@ -216,7 +218,8 @@ export class SpeechToTextModule {
             wake();
           },
           language,
-          verbose
+          verbose,
+          timeout
         );
 
         finished = true;
diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts
index 0a6ed11f70..20f1013ef0 100644
--- a/packages/react-native-executorch/src/types/stt.ts
+++ b/packages/react-native-executorch/src/types/stt.ts
@@ -94,7 +94,7 @@ export interface SpeechToTextType {
    * @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription.
    * Both `committed` and `nonCommitted` are of type `TranscriptionResult`
    */
-  stream(options?: DecodingOptions | undefined): AsyncGenerator<
+  stream(options?: StreamingOptions | undefined): AsyncGenerator<
     {
       committed: TranscriptionResult;
       nonCommitted: TranscriptionResult;
@@ -208,6 +208,15 @@ export interface DecodingOptions {
   verbose?: boolean;
 }
 
+/**
+ * Configuration options for the speech-to-text streaming process.
+ * @category Types
+ * @property {number} [timeout] - Specifies (in milliseconds) how much does streamer wait between model inferences.
+ */
+export interface StreamingOptions extends DecodingOptions {
+  timeout?: number;
+}
+
 /**
  * Structure that represent single token with timestamp information.
  * @category Types

From f7849fc933a3e6dc4aeefa765b288409da4fdd2e Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Sun, 10 May 2026 14:21:23 +0200
Subject: [PATCH 07/20] Concurrency fixes & automatic cleanup

---
 .../speech_to_text/whisper/OnlineASR.cpp      | 243 +++++++++++-------
 .../models/speech_to_text/whisper/OnlineASR.h |  34 ++-
 2 files changed, 169 insertions(+), 108 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index fb57fcb0f3..188c77d80d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -16,12 +16,28 @@ OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
 }
 
 bool OnlineASR::isReady() const {
+  std::scoped_lock<std::mutex> lock(streamingMutex);
+
   return audioBuffer_.size() >= constants::kMinChunkSamples;
 }
 
 void OnlineASR::insertAudioChunk(std::span<const float> audio) {
-  std::scoped_lock<std::mutex> lock(audioBufferMutex_);
+  std::scoped_lock<std::mutex> lock(streamingMutex);
+
   audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
+
+  // Automatic buffer cleanup.
+  //
+  // This prevents the audio buffer from growing indefinitely during continuous
+  // streaming. It is particularly useful when VAD (Voice Activity Detection)
+  // is used and elements are inserted but not processed for a long time.
+  // It should not pass the condition in a normal streaming, that is when
+  // process() method is called regularly within reasonable steps of time.
+  if (audioBuffer_.size() > constants::kMaxSamples) {
+    // Note that results are not actually committed now, but saved for
+    // a later call of process().
+    memory_.toCommit = commitAndClean(memory_.transcript);
+  }
 }
 
 ProcessResult OnlineASR::process(const DecodingOptions &options) {
@@ -30,7 +46,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
   // Copy the audio buffer to avoid keeping the lock during the entire
   // transcription process.
   {
-    std::scoped_lock<std::mutex> lock(audioBufferMutex_);
+    std::scoped_lock<std::mutex> lock(streamingMutex);
     audioCopy = audioBuffer_;
   }
 
@@ -51,17 +67,18 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
               std::back_inserter(words));
   }
 
-  std::vector<Word> committed;
+  // Aquire lock for the rest of the method (extensive usage of audioBuffer_).
+  std::scoped_lock<std::mutex> lock(streamingMutex);
 
   // Step 1: examine all previously saved EOS points.
   // The idea is to remove entries which have changed or no longer exist
   // due to model correcting it's output.
-  for (size_t i = 0; i < eos_.size(); i++) {
-    const auto &eos = eos_[i];
+  for (size_t i = 0; i < memory_.eos.size(); i++) {
+    const auto &eos = memory_.eos[i];
     if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
         (eos.position > 0 &&
          eos.preceeding != words[eos.position - 1].content)) {
-      eos_.erase(eos_.begin() + i, eos_.end());
+      memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end());
       break;
     }
   }
@@ -74,110 +91,43 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
 
     // Because of step 1, we know that if the last EOS exist in eos_,
     // then it must be the last entry.
-    if (eos_.empty() || eos_.back().position != lastEosIndex) {
+    if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
       // Register last EOS entry
       std::string preceeding =
           lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
-      eos_.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
+      memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
     }
   }
 
-  // Step 3: clear the buffer if it is getting too large.
+  std::vector<Word> committed;
+
+  // Step 3: collect all the words which could possible get committed
+  // in-between iterations.
+  if (!memory_.toCommit.empty()) {
+    committed.insert(committed.end(),
+                     std::make_move_iterator(memory_.toCommit.begin()),
+                     std::make_move_iterator(memory_.toCommit.end()));
+    memory_.toCommit.clear();
+  }
+
+  // Step 4: clear the buffer if it is getting too large.
   // The idea is to use the saved EOS entries and try to cut the buffer
   // in a 'good' spot - where it will remove a significant audio chunk, yet
   // won't affect most recent, unfinished speech samples.
   size_t bufferSize = audioBuffer_.size();
   if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
                                        constants::kSamplingRate)) {
-    // Setup the lock for the entire cleanup section.
-    std::scoped_lock<std::mutex> lock(audioBufferMutex_);
-
-    const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
-
-    // If we don't have any EOS entries, then we most likely have not
-    // recorded any speech. In this case we can safely cut the maximum amount of
-    // audio data.
-    if (eos_.empty()) {
-      size_t cut = bufferSize - params::kStreamSafetyThreshold;
-
-      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
-    }
-
-    // If we have exactly one (most recent) EOS entry in the eos_, then
-    // we need to be more careful.
-    // Normally we want to keep at least one sentence in, but if the sentence
-    // covers a significant amount of buffer, we have no choice.
-    else if (eos_.size() == 1) {
-      const float eosTimestamp = eos_[0].tmstpend;
-
-      const float upperHalfDuration =
-          std::max(0.0F, eosTimestamp - midBufferThreshold);
-      const float wordsPerSecond =
-          upperHalfDuration > 0.1F
-              ? static_cast<float>(words.size()) / upperHalfDuration
-              : 0.0F;
-
-      // The EOS sits early enough that cutting up to the safety margin won't
-      // touch the ongoing (post-EOS) speech.
-      const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
-                                              params::kStreamSafetyThreshold;
-
-      if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
-        // EOS lies past the midpoint, but a low word density implies the spoken
-        // audio is concentrated in the upper half. Drop the lower half and
-        // shift the EOS accordingly.
-        audioBuffer_.erase(audioBuffer_.begin(),
-                           audioBuffer_.begin() +
-                               static_cast<size_t>(midBufferThreshold *
-                                                   constants::kSamplingRate));
-        eos_[0].tmstpend -= midBufferThreshold;
-      } else {
-        // Cut everything up to and including the sentence — either by the
-        // safety margin (when EOS is early) or (more aggresively) right at the
-        // EOS boundary — and commit its words.
-        const size_t cut =
-            eosSafe
-                ? bufferSize -
-                      static_cast<size_t>(params::kStreamSafetyThreshold *
-                                          constants::kSamplingRate)
-                : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
-
-        audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
-
-        committed.insert(committed.end(),
-                         std::make_move_iterator(words.begin()),
-                         std::make_move_iterator(words.end()));
-
-        words.clear();
-        eos_.clear();
-      }
-    }
-
-    // In case of 2 or more sentences, we generally want to keep the last one
-    // intact. This would provide a bit of stability to the algorithm.
-    else {
-      const auto &secondTolastEntry = eos_[eos_.size() - 2];
-
-      const size_t cut = static_cast<size_t>(secondTolastEntry.tmstpend *
-                                             constants::kSamplingRate);
-      const size_t lastCommittedPos = secondTolastEntry.position;
-
-      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
-
-      // Move all words up to the last committed position (inclusive) to the
-      // committed buffer.
-      committed.insert(
-          committed.end(), std::make_move_iterator(words.begin()),
-          std::make_move_iterator(words.begin() + lastCommittedPos + 1));
-      words.erase(words.begin(), words.begin() + lastCommittedPos + 1);
+    auto newCommitted = commitAndClean(words);
 
-      // Retain only the most recent EOS entry.
-      eos_.erase(eos_.begin(), eos_.end() - 1);
-      eos_[0].tmstpend -= secondTolastEntry.tmstpend;
-    }
+    committed.insert(committed.end(),
+                     std::make_move_iterator(newCommitted.begin()),
+                     std::make_move_iterator(newCommitted.end()));
   }
 
-  // Return the results
+  // Save the uncommitted part to streamer's memory,
+  // cause it might be necessary when committing inside streamInsert().
+  memory_.transcript = words;
+
   // Note that uncommitted part represented by recent transcription (words)
   // is already shrinked if something has been committed during the cleanup
   // phase.
@@ -200,10 +150,109 @@ std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
 }
 
 void OnlineASR::reset() {
-  std::scoped_lock<std::mutex> lock(audioBufferMutex_);
+  std::scoped_lock<std::mutex> lock(streamingMutex);
+
   audioBuffer_.clear();
 
-  eos_.clear();
+  // Reset memory.
+  memory_.transcript.clear();
+  memory_.eos.clear();
+  memory_.toCommit.clear();
+}
+
+std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+  const size_t bufferSize = audioBuffer_.size();
+  const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
+
+  std::vector<Word> committed;
+
+  // Trims audio off the front of audioBuffer_ and returns the words moved out
+  // of `transcript` as committed; the strategy depends on the saved EOS entries.
+  //
+  // No EOS entries: most likely no speech was recorded, so only the safety
+  // margin at the end of the buffer is kept and nothing is committed.
+  if (memory_.eos.empty()) {
+    const size_t keep = static_cast<size_t>(params::kStreamSafetyThreshold *
+                                            constants::kSamplingRate);
+    const size_t cut = bufferSize > keep ? bufferSize - keep : 0;
+    audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+  }
+
+  // Exactly one (most recent) EOS entry: normally we want to keep at least one
+  // sentence in, but if it covers too much of the buffer, we have no choice.
+  else if (memory_.eos.size() == 1) {
+    const float eosTimestamp = memory_.eos[0].tmstpend;
+
+    const float upperHalfDuration =
+        std::max(0.0F, eosTimestamp - midBufferThreshold);
+    const float wordsPerSecond =
+        upperHalfDuration > 0.1F
+            ? static_cast<float>(transcript.size()) / upperHalfDuration
+            : 0.0F;
+
+    // The EOS sits early enough that cutting up to the safety margin won't
+    // touch the ongoing (post-EOS) speech.
+    const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
+                                            params::kStreamSafetyThreshold;
+
+    if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+      // EOS lies past the midpoint, but a low word density implies the spoken
+      // audio is concentrated in the upper half. Drop the lower half and
+      // shift the EOS accordingly.
+      audioBuffer_.erase(audioBuffer_.begin(),
+                         audioBuffer_.begin() +
+                             static_cast<size_t>(midBufferThreshold *
+                                                 constants::kSamplingRate));
+      memory_.eos[0].tmstpend -= midBufferThreshold;
+    } else {
+      // Cut everything up to and including the sentence — either by the
+      // safety margin (when EOS is early) or (more aggresively) right at the
+      // EOS boundary — and commit its words.
+      const size_t cut =
+          eosSafe
+              ? bufferSize -
+                    static_cast<size_t>(params::kStreamSafetyThreshold *
+                                        constants::kSamplingRate)
+              : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
+
+      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+      committed.insert(committed.end(),
+                       std::make_move_iterator(transcript.begin()),
+                       std::make_move_iterator(transcript.end()));
+
+      transcript.clear();
+      memory_.eos.clear();
+    }
+  }
+
+  // In case of 2 or more sentences, we generally want to keep the last one
+  // intact. This would provide a bit of stability to the algorithm.
+  else {
+    // Taken by value: eos.erase() below would invalidate a reference to it.
+    const auto secondTolastEntry = memory_.eos[memory_.eos.size() - 2];
+    const size_t cut = static_cast<size_t>(secondTolastEntry.tmstpend *
+                                           constants::kSamplingRate);
+    const size_t lastCommittedPos = secondTolastEntry.position;
+
+    audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+    // Move all words up to the last committed position (inclusive) to the
+    // committed buffer.
+    committed.insert(
+        committed.end(), std::make_move_iterator(transcript.begin()),
+        std::make_move_iterator(transcript.begin() + lastCommittedPos + 1));
+    transcript.erase(transcript.begin(),
+                     transcript.begin() + lastCommittedPos + 1);
+
+    // Retain only the most recent EOS entry, shifting both its timestamp
+    // and its position to match the new (truncated) transcript origin.
+    memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1);
+    memory_.eos[0].tmstpend -= secondTolastEntry.tmstpend;
+    memory_.eos[0].position -= lastCommittedPos + 1;
+  }
+
+  return committed;
 }
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
index 0c0b65f40e..7547d16bd5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -51,23 +51,35 @@ class OnlineASR : public schema::OnlineASR {
   void reset() override;
 
 private:
+  // Cleans up the buffer and returns committed words based on given transcript.
+  std::vector<Word> commitAndClean(std::vector<Word> &transcript);
+
   // ASR module connection for transcribing the audio
   const ASR *asr_;
 
   // Audio buffer (input) - accumulates obtained audio samples.
   std::vector<float> audioBuffer_ = {};
-  mutable std::mutex audioBufferMutex_;
+  mutable std::mutex streamingMutex; // Covers both buffer & memory
+
+  // Streaming memory.
+  // In general, helps to navigate continuous streaming state and improve
+  // buffer handling algorithms.
+  struct Memory {
+    // State management helper.
+    struct EOSEntry {
+      size_t position; // An absolute position (index) in the transcription
+                       // (word sequence).
+      std::string preceeding; // A preceeding word in the transcription
+      float tmstpend;         // Ending timestamp of the sentence.
+    };
 
-  // State management helper.
-  struct EOSEntry {
-    size_t position; // An absolute position (index) in the transcription (word
-                     // sequence).
-    std::string preceeding; // A preceeding word in the transcription
-    float tmstpend;         // Ending timestamp of the sentence.
-  };
-  // Stores saved EOS entries in most recent transcription
-  // and allows to clear the buffer in a smart, non invasive way.
-  std::vector<EOSEntry> eos_;
+    std::vector<Word>
+        transcript; // The most recent transcription result (uncommitted only!).
+    std::vector<EOSEntry>
+        eos; // End of sentence points from the most recent transcription.
+    std::vector<Word> toCommit; // Words to be committed in the next iteration
+                                // (next process() call).
+  } memory_;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::stream

From 3bf68bf5b38bab76d904503b1a9ad01ae1937f12 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Thu, 14 May 2026 10:52:47 +0200
Subject: [PATCH 08/20] Update urls & audio-api

---
 .../src/constants/modelUrls.ts                | 24 +++++++++----------
 yarn.lock                                     | 14 -----------
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index a87117d863..2a15be25dd 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -774,28 +774,28 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 
 // S2T
 const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
-const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml.pte`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
 
 const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
-const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml.pte`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
 
 const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
-const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml.pte`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
 
 const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
-const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml.pte`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
 
 const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
-const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml.pte`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
 
 const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
-const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml.pte`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
 
 /**
  * @category Models - Speech To Text
diff --git a/yarn.lock b/yarn.lock
index a4439a0f33..7f335abe71 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15283,20 +15283,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"react-native-audio-api@npm:0.11.5":
-  version: 0.11.5
-  resolution: "react-native-audio-api@npm:0.11.5"
-  dependencies:
-    semver: "npm:^7.7.3"
-  peerDependencies:
-    react: "*"
-    react-native: "*"
-  bin:
-    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
-  checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d
-  languageName: node
-  linkType: hard
-
 "react-native-audio-api@npm:0.12.0":
   version: 0.12.0
   resolution: "react-native-audio-api@npm:0.12.0"

From 27769d4a73292f9f01c4e8acaebd3664a067e97e Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Tue, 19 May 2026 13:05:03 +0200
Subject: [PATCH 09/20] Apply review suggestions

---
 apps/speech/screens/SpeechToTextScreen.tsx    |  2 +-
 .../models/speech_to_text/common/types/Word.h |  2 +-
 .../models/speech_to_text/whisper/ASR.cpp     | 10 +--
 .../speech_to_text/whisper/OnlineASR.cpp      | 67 ++++++++++---------
 .../models/speech_to_text/whisper/Utils.h     |  2 +-
 5 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index ad4f6505c8..2942d5e718 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -18,9 +18,9 @@ import {
   WHISPER_BASE_EN,
   WHISPER_BASE_EN_COREML,
   WHISPER_SMALL_EN,
+  WHISPER_SMALL_EN_COREML,
   TranscriptionResult,
   SpeechToTextProps,
-  WHISPER_SMALL_EN_COREML,
 } from 'react-native-executorch';
 import { ModelPicker, ModelOption } from '../components/ModelPicker';
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index 2343d1faab..fcf7759b24 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -5,7 +5,7 @@
 namespace rnexecutorch::models::speech_to_text {
 
 /**
- * Basically a different representation of token,
+ * Different representation of a token,
  * with timestamps calculated.
  */
 struct Word {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d2555a79fa..5a925e6eba 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -265,15 +265,11 @@ ASR::generate(std::span<const float> waveform, const DecodingOptions &options,
   uint64_t startPos = 0;
 
   // Prefill: feed each initial token individually so decode() always sees 1
-  // token
-  std::span<uint64_t> firstToken(sequenceIds.data(), 1);
-  executorch::aten::Tensor logitsTensor =
-      this->decode(firstToken, encoderFeatures, startPos);
-  ++startPos;
-  for (size_t i = 1; i < sequenceIds.size(); ++i) {
+  // token.
+  executorch::aten::Tensor logitsTensor{nullptr};
+  for (size_t i = 0; i < sequenceIds.size(); i++, startPos++) {
     std::span<uint64_t> single(sequenceIds.data() + i, 1);
     logitsTensor = this->decode(single, encoderFeatures, startPos);
-    ++startPos;
   }
 
   // Autoregressive decoding: always 1 token at a time
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index 188c77d80d..ced3193531 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 #include <iterator>
+#include <ranges>
 #include <utility>
 
 #include "Constants.h"
@@ -11,18 +12,17 @@
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
 OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
-  // Reserve an expected amount of memory for audio buffer.
   audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
 }
 
 bool OnlineASR::isReady() const {
-  std::scoped_lock<std::mutex> lock(streamingMutex);
+  std::scoped_lock lock(streamingMutex);
 
   return audioBuffer_.size() >= constants::kMinChunkSamples;
 }
 
 void OnlineASR::insertAudioChunk(std::span<const float> audio) {
-  std::scoped_lock<std::mutex> lock(streamingMutex);
+  std::scoped_lock lock(streamingMutex);
 
   audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
 
@@ -41,12 +41,15 @@ void OnlineASR::insertAudioChunk(std::span<const float> audio) {
 }
 
 ProcessResult OnlineASR::process(const DecodingOptions &options) {
+  constexpr float kStreamSafeBufferMaxSamples =
+      params::kStreamSafeBufferDuration * constants::kSamplingRate;
+
   std::vector<float> audioCopy;
 
   // Copy the audio buffer to avoid keeping the lock during the entire
   // transcription process.
   {
-    std::scoped_lock<std::mutex> lock(streamingMutex);
+    std::scoped_lock lock(streamingMutex);
     audioCopy = audioBuffer_;
   }
 
@@ -60,25 +63,23 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
   std::vector<Segment> transcriptions = asr_->transcribe(input, options);
 
   // Flatten segments into a single word sequence.
-  // This is basically our 'nonCommitted' part for now.
+  // This is our 'nonCommitted' part for now.
   std::vector<Word> words;
   for (auto &segment : transcriptions) {
-    std::move(segment.words.begin(), segment.words.end(),
-              std::back_inserter(words));
+    std::ranges::move(segment.words, std::back_inserter(words));
   }
 
   // Aquire lock for the rest of the method (extensive usage of audioBuffer_).
-  std::scoped_lock<std::mutex> lock(streamingMutex);
+  std::scoped_lock lock(streamingMutex);
 
   // Step 1: examine all previously saved EOS points.
   // The idea is to remove entries which have changed or no longer exist
   // due to model correcting it's output.
-  for (size_t i = 0; i < memory_.eos.size(); i++) {
-    const auto &eos = memory_.eos[i];
-    if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
-        (eos.position > 0 &&
-         eos.preceeding != words[eos.position - 1].content)) {
-      memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end());
+  for (auto it = memory_.eos.begin(); it != memory_.eos.end(); it++) {
+    if (it->position >= words.size() || !utils::isEos(words[it->position]) ||
+        (it->position > 0 &&
+         it->preceeding != words[it->position - 1].content)) {
+      memory_.eos.erase(it, memory_.eos.end());
       break;
     }
   }
@@ -92,7 +93,6 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
     // Because of step 1, we know that if the last EOS exist in eos_,
     // then it must be the last entry.
     if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
-      // Register last EOS entry
       std::string preceeding =
           lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
       memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
@@ -115,8 +115,8 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
   // in a 'good' spot - where it will remove a significant audio chunk, yet
   // won't affect most recent, unfinished speech samples.
   size_t bufferSize = audioBuffer_.size();
-  if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
-                                       constants::kSamplingRate)) {
+  if (std::cmp_greater<size_t, size_t>(bufferSize,
+                                       kStreamSafeBufferMaxSamples)) {
     auto newCommitted = commitAndClean(words);
 
     committed.insert(committed.end(),
@@ -139,7 +139,7 @@ std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
 
   // Last-tick committed delta + whatever never made it past the commit
   // threshold.
-  std::vector<Word> residual = std::move(result.committed);
+  std::vector<Word> residual{std::move(result.committed)};
   residual.insert(residual.end(),
                   std::make_move_iterator(result.nonCommitted.begin()),
                   std::make_move_iterator(result.nonCommitted.end()));
@@ -150,7 +150,7 @@ std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
 }
 
 void OnlineASR::reset() {
-  std::scoped_lock<std::mutex> lock(streamingMutex);
+  std::scoped_lock lock(streamingMutex);
 
   audioBuffer_.clear();
 
@@ -161,8 +161,16 @@ void OnlineASR::reset() {
 }
 
 std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+  constexpr float kMidpointAnchorTime = params::kStreamMaxDuration / 2.0F;
+  constexpr size_t kMidpointAnchorSamples =
+      static_cast<size_t>(kMidpointAnchorTime * constants::kSamplingRate);
+  constexpr size_t kSafetyMarginSamples = static_cast<size_t>(
+      params::kStreamSafetyThreshold * constants::kSamplingRate);
+  constexpr float kMaxSafeEosTime =
+      params::kStreamSafeBufferDuration - params::kStreamSafetyThreshold;
+  constexpr float kMinDurationToCalculateDensity = 0.1F;
+
   const size_t bufferSize = audioBuffer_.size();
-  const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
 
   std::vector<Word> committed;
 
@@ -184,35 +192,30 @@ std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
     const float eosTimestamp = memory_.eos[0].tmstpend;
 
     const float upperHalfDuration =
-        std::max(0.0F, eosTimestamp - midBufferThreshold);
+        std::max(0.0F, eosTimestamp - kMidpointAnchorTime);
     const float wordsPerSecond =
-        upperHalfDuration > 0.1F
+        upperHalfDuration > kMinDurationToCalculateDensity
             ? static_cast<float>(transcript.size()) / upperHalfDuration
             : 0.0F;
 
     // The EOS sits early enough that cutting up to the safety margin won't
     // touch the ongoing (post-EOS) speech.
-    const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
-                                            params::kStreamSafetyThreshold;
+    const bool eosSafe = eosTimestamp < kMaxSafeEosTime;
 
     if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
       // EOS lies past the midpoint, but a low word density implies the spoken
       // audio is concentrated in the upper half. Drop the lower half and
       // shift the EOS accordingly.
       audioBuffer_.erase(audioBuffer_.begin(),
-                         audioBuffer_.begin() +
-                             static_cast<size_t>(midBufferThreshold *
-                                                 constants::kSamplingRate));
-      memory_.eos[0].tmstpend -= midBufferThreshold;
+                         audioBuffer_.begin() + kMidpointAnchorSamples);
+      memory_.eos[0].tmstpend -= kMidpointAnchorTime;
     } else {
       // Cut everything up to and including the sentence — either by the
       // safety margin (when EOS is early) or (more aggresively) right at the
       // EOS boundary — and commit its words.
       const size_t cut =
           eosSafe
-              ? bufferSize -
-                    static_cast<size_t>(params::kStreamSafetyThreshold *
-                                        constants::kSamplingRate)
+              ? bufferSize - kSafetyMarginSamples
               : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
 
       audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
@@ -237,8 +240,6 @@ std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
 
     audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
 
-    // Move all words up to the last committed position (inclusive) to the
-    // committed buffer.
     committed.insert(
         committed.end(), std::make_move_iterator(transcript.begin()),
         std::make_move_iterator(transcript.begin() + lastCommittedPos + 1));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
index 48c84a84b7..ae461c27cf 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
@@ -14,7 +14,7 @@ namespace rnexecutorch::models::speech_to_text::whisper::utils {
  *
  * @param word The word to check.
  */
-constexpr inline bool isEos(const Word &word) {
+inline bool isEos(const Word &word) {
   return word.content.size() == 1 &&
          constants::kEosPunctations.contains(word.content[0]);
 }

From c5b142d9a946dc04ba8dc0da9c977cb860b72894 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Tue, 19 May 2026 13:17:24 +0200
Subject: [PATCH 10/20] Rebase with main

---
 .../src/constants/modelUrls.ts                | 36 +++++++++----------
 .../references/reference-models.md            | 21 ++++++-----
 .../references/reference-models.md            | 21 ++++++-----
 3 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 2a15be25dd..aec9da1c0f 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -773,29 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 } as const;
 
 // S2T
-const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
-const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
+const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
 
-const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
-const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
+const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
 
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
-const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
 
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
-const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
+const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
 
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
-const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
 
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
-const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
 
 /**
  * @category Models - Speech To Text
diff --git a/skills/canary/react-native-executorch/references/reference-models.md b/skills/canary/react-native-executorch/references/reference-models.md
index f6010a7793..02134f4513 100644
--- a/skills/canary/react-native-executorch/references/reference-models.md
+++ b/skills/canary/react-native-executorch/references/reference-models.md
@@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c
 
 ### Whisper Models (English only)
 
-- **WHISPER_TINY_EN** - Whisper Tiny English-only
-- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized
-- **WHISPER_BASE_EN** - Whisper Base English-only
-- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized
-- **WHISPER_SMALL_EN** - Whisper Small English-only
-- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized
+- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK)
+- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML)
+- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK)
+- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML)
+- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK)
+- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML)
 
 ### Whisper Models (Multilingual)
 
-- **WHISPER_TINY** - Whisper Tiny multilingual
-- **WHISPER_BASE** - Whisper Base multilingual
-- **WHISPER_SMALL** - Whisper Small multilingual
+- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK)
+- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML)
+- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK)
+- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML)
+- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK)
+- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML)
 
 ---
 
diff --git a/skills/react-native-executorch/references/reference-models.md b/skills/react-native-executorch/references/reference-models.md
index f6010a7793..02134f4513 100644
--- a/skills/react-native-executorch/references/reference-models.md
+++ b/skills/react-native-executorch/references/reference-models.md
@@ -195,18 +195,21 @@ For a list of all available Speech to Text models reference [this Hugging Face c
 
 ### Whisper Models (English only)
 
-- **WHISPER_TINY_EN** - Whisper Tiny English-only
-- **WHISPER_TINY_EN_QUANTIZED** - Whisper Tiny English-only quantized
-- **WHISPER_BASE_EN** - Whisper Base English-only
-- **WHISPER_BASE_EN_QUANTIZED** - Whisper Base English-only quantized
-- **WHISPER_SMALL_EN** - Whisper Small English-only
-- **WHISPER_SMALL_EN_QUANTIZED** - Whisper Small English-only quantized
+- **WHISPER_TINY_EN** - Whisper Tiny English-only (XNNPACK)
+- **WHISPER_TINY_EN_COREML** - Whisper Tiny English-only (CoreML)
+- **WHISPER_BASE_EN** - Whisper Base English-only (XNNPACK)
+- **WHISPER_BASE_EN_COREML** - Whisper Base English-only (CoreML)
+- **WHISPER_SMALL_EN** - Whisper Small English-only (XNNPACK)
+- **WHISPER_SMALL_EN_COREML** - Whisper Small English-only (CoreML)
 
 ### Whisper Models (Multilingual)
 
-- **WHISPER_TINY** - Whisper Tiny multilingual
-- **WHISPER_BASE** - Whisper Base multilingual
-- **WHISPER_SMALL** - Whisper Small multilingual
+- **WHISPER_TINY** - Whisper Tiny multilingual (XNNPACK)
+- **WHISPER_TINY_COREML** - Whisper Tiny multilingual (CoreML)
+- **WHISPER_BASE** - Whisper Base multilingual (XNNPACK)
+- **WHISPER_BASE_COREML** - Whisper Base multilingual (CoreML)
+- **WHISPER_SMALL** - Whisper Small multilingual (XNNPACK)
+- **WHISPER_SMALL_COREML** - Whisper Small multilingual (CoreML)
 
 ---
 

From 6bba141d959abb030c1bace7dbbc214b6573f4c9 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Wed, 20 May 2026 13:39:27 +0200
Subject: [PATCH 11/20] Minor fixes

---
 .../models/speech_to_text/whisper/ASR.cpp          | 13 +++++++------
 .../models/speech_to_text/whisper/OnlineASR.cpp    | 14 ++++++--------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index 5a925e6eba..dbf7155d23 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -1,8 +1,3 @@
-#include <algorithm>
-#include <array>
-#include <numeric>
-#include <random>
-
 #include "ASR.h"
 #include "Constants.h"
 #include "Params.h"
@@ -11,6 +6,12 @@
 #include <rnexecutorch/data_processing/Numerical.h>
 #include <rnexecutorch/data_processing/gzip.h>
 
+#include <algorithm>
+#include <array>
+#include <numeric>
+#include <random>
+#include <ranges>
+
 namespace rnexecutorch::models::speech_to_text::whisper {
 
 using executorch::runtime::etensor::ScalarType;
@@ -451,7 +452,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
       puncts += w.back();
       w.pop_back();
     }
-    std::reverse(puncts.begin(), puncts.end());
+    std::ranges::reverse(puncts);
 
     // Add the core word.
     wordObjs.emplace_back(std::move(w), wStart, wEnd);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ced3193531..0567716bbe 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,14 +1,13 @@
 #include "OnlineASR.h"
+#include "Constants.h"
+#include "Params.h"
+#include "Utils.h"
 
 #include <algorithm>
 #include <iterator>
 #include <ranges>
 #include <utility>
 
-#include "Constants.h"
-#include "Params.h"
-#include "Utils.h"
-
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
 OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
@@ -41,8 +40,8 @@ void OnlineASR::insertAudioChunk(std::span<const float> audio) {
 }
 
 ProcessResult OnlineASR::process(const DecodingOptions &options) {
-  constexpr float kStreamSafeBufferMaxSamples =
-      params::kStreamSafeBufferDuration * constants::kSamplingRate;
+  constexpr size_t kStreamSafeBufferMaxSamples = static_cast<size_t>(
+      params::kStreamSafeBufferDuration * constants::kSamplingRate);
 
   std::vector<float> audioCopy;
 
@@ -115,8 +114,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
   // in a 'good' spot - where it will remove a significant audio chunk, yet
   // won't affect most recent, unfinished speech samples.
   size_t bufferSize = audioBuffer_.size();
-  if (std::cmp_greater<size_t, size_t>(bufferSize,
-                                       kStreamSafeBufferMaxSamples)) {
+  if (bufferSize > kStreamSafeBufferMaxSamples) {
     auto newCommitted = commitAndClean(words);
 
     committed.insert(committed.end(),

From 1aebae62ef64abe920a4324f95f39812a3d20087 Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Wed, 20 May 2026 17:59:52 +0200
Subject: [PATCH 12/20] Fix broken test build

---
 .../common/rnexecutorch/tests/CMakeLists.txt                     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 06a30a13f7..1fcad420cc 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -262,7 +262,6 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/speech_to_text/SpeechToText.cpp
         ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/ASR.cpp
-        ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/HypothesisBuffer.cpp
         ${RNEXECUTORCH_DIR}/models/speech_to_text/whisper/OnlineASR.cpp
         ${RNEXECUTORCH_DIR}/data_processing/gzip.cpp
         ${TOKENIZER_SOURCES}

From 88185d5dcfbb3b659348d6dc609437b50c460667 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:22:13 +0200
Subject: [PATCH 13/20] chore(stt): drop unused transcribeStringOnly
 declaration

The method was declared in SpeechToText.h but never defined or referenced
anywhere in the package. Removing it cleans up the public API surface.
---
 .../common/rnexecutorch/models/speech_to_text/SpeechToText.h  | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ec51862793..ae053008cd 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -34,10 +34,6 @@ class SpeechToText {
                                  std::string languageOption,
                                  bool verbose) const;
 
-  [[nodiscard("Registered non-void function")]]
-  std::vector<char> transcribeStringOnly(std::span<float> waveform,
-                                         std::string languageOption) const;
-
   size_t getMemoryLowerBound() const noexcept;
 
   // Stream

From 44f69310636a0b90327c87d0ba4c54bc7f377bd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:23:20 +0200
Subject: [PATCH 14/20] fix(stt): preserve pending committed words in OnlineASR

insertAudioChunk's overflow path was overwriting memory_.toCommit on
each cap-hit. Two cap-hits before the next process() call silently
dropped the first batch. Append instead of assign.
---
 .../models/speech_to_text/whisper/OnlineASR.cpp            | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index 0567716bbe..e663c5bfab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -34,8 +34,11 @@ void OnlineASR::insertAudioChunk(std::span<const float> audio) {
   // process() method is called regularly within reasonable steps of time.
   if (audioBuffer_.size() > constants::kMaxSamples) {
     // Note that results are not actually committed now, but saved for
-    // a later call of process().
-    memory_.toCommit = commitAndClean(memory_.transcript);
+    // a later call of process(). Append rather than assign so that two
+    // back-to-back buffer-cap hits (e.g. while VAD is muted) don't drop the
+    // first batch.
+    auto pending = commitAndClean(memory_.transcript);
+    std::ranges::move(pending, std::back_inserter(memory_.toCommit));
   }
 }
 

From aaeac64ba01284525a53d92a99dc7c716ada9c97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:23:44 +0200
Subject: [PATCH 15/20] fix(stt): use scores.size() as avgLogProb denominator

The previous tokens.size() + 1 matched neither a literal mean (would be
scores.size()) nor OpenAI Whisper's formula (len(full_seq) + 1, where
full_seq includes the SOT prefix and EOT). Align with whisper.cpp,
which divides by the number of summed log-probs.
---
 .../common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index dbf7155d23..d57a06cd6c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -214,7 +214,9 @@ std::vector<Segment> ASR::generate(std::span<const float> waveform,
         scores.begin(), scores.end(), 0.0f, std::plus<>(),
         [](float s) { return std::log(std::max(s, 1e-9f)); });
 
-    const float avgLogProb = cumLogProb / static_cast<float>(tokens.size() + 1);
+    // Match whisper.cpp: divide by the number of summed log-probs.
+    const float avgLogProb =
+        cumLogProb / static_cast<float>(std::max<size_t>(1, scores.size()));
     const std::string text = tokenizer_->decode(tokens, true);
     const float compressionRatio = this->calculateCompressionRatio(text);
 

From 7f465402f208b89c2bcb15a18ab74a1ae0824df4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:24:21 +0200
Subject: [PATCH 16/20] perf(stt): hoist mt19937 out of the sampling loop

random_device was consulted and a fresh Mersenne state constructed for
every sampled token. Seed once per generate() call instead.
---
 .../common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d57a06cd6c..a9f2b152b4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -275,6 +275,9 @@ ASR::generate(std::span<const float> waveform, const DecodingOptions &options,
     logitsTensor = this->decode(single, encoderFeatures, startPos);
   }
 
+  // Seed once per generate() call rather than per sampled token.
+  std::mt19937 gen(std::random_device{}());
+
   // Autoregressive decoding: always 1 token at a time
   while (std::cmp_less(startPos, constants::kMaxDecodeLength)) {
     const size_t logitsInnerDim = logitsTensor.size(1);
@@ -307,7 +310,6 @@ ASR::generate(std::span<const float> waveform, const DecodingOptions &options,
       nextProb = *maxIt;
     } else {
       std::discrete_distribution<> dist(probs.begin(), probs.end());
-      std::mt19937 gen((std::random_device{}()));
       nextId = dist(gen);
       nextProb = probs[nextId];
     }

From 72912ade9f72c6c49f6bc3c54d0aacdbb1e9b615 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:26:30 +0200
Subject: [PATCH 17/20] fix(stt)!: drop quantized variants from
 SpeechToTextModelName
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The whisper-*-en-quantized constants are removed in this PR, but the
SpeechToTextModelName union still accepted those literals, so they
type-checked but failed at runtime. Drop them from the union as part of
the same breaking change.
---
 packages/react-native-executorch/src/types/stt.ts | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts
index 20f1013ef0..f9a2fb56d8 100644
--- a/packages/react-native-executorch/src/types/stt.ts
+++ b/packages/react-native-executorch/src/types/stt.ts
@@ -7,11 +7,8 @@ import { RnExecutorchError } from '../errors/errorUtils';
  */
 export type SpeechToTextModelName =
   | 'whisper-tiny-en'
-  | 'whisper-tiny-en-quantized'
   | 'whisper-base-en'
-  | 'whisper-base-en-quantized'
   | 'whisper-small-en'
-  | 'whisper-small-en-quantized'
   | 'whisper-tiny'
   | 'whisper-base'
   | 'whisper-small';

From 06059503eb1907986b0b8e5d30fb1564761762bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:26:46 +0200
Subject: [PATCH 18/20] chore(stt): align stream() declaration with definition

The header had bool enableTimestamps; the .cpp uses bool verbose (which
matches the JS-side DecodingOptions.verbose). Rename here for
consistency.
---
 .../common/rnexecutorch/models/speech_to_text/SpeechToText.h   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ae053008cd..16e94ef88b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -38,8 +38,7 @@ class SpeechToText {
 
   // Stream
   void stream(std::shared_ptr<jsi::Function> callback,
-              std::string languageOption, bool enableTimestamps,
-              uint32_t timeout);
+              std::string languageOption, bool verbose, uint32_t timeout);
   void streamStop();
   void streamInsert(std::span<float> waveform);
 

From a5c366273a8a7b3dbd0cf12b15d19e2f0bf1cb34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <msluszniak1@gmail.com>
Date: Wed, 20 May 2026 19:27:45 +0200
Subject: [PATCH 19/20] fix(stt): break streamStop() out of the timeout pause
 immediately

The streaming loop called sleep_for(timeout) unconditionally between
inferences, so streamStop() could not take effect until the current pause
expired (the final flush was delayed by up to the full timeout). Replace it
with a condition_variable wait that streamStop() signals; inserts
intentionally do not wake the loop, preserving the throttle.
---
 .../models/speech_to_text/SpeechToText.cpp          | 13 ++++++++++---
 .../models/speech_to_text/SpeechToText.h            |  7 +++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 9537642d58..3acd076779 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -158,8 +158,12 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     // The reasoning is very simple: with the current liberal threshold values,
     // running transcriptions too rapidly (before the audio buffer is filled
     // with significant amount of new data) can cause streamer to commit wrong
-    // phrases.
-    std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
+    // phrases. We wait on a condition_variable so streamStop() can break the
+    // pause immediately — inserts intentionally do not wake us, to preserve
+    // the throttle.
+    std::unique_lock<std::mutex> lock(streamCvMutex_);
+    streamCv_.wait_for(lock, std::chrono::milliseconds(timeout),
+                       [this] { return !isStreaming_.load(); });
   }
 
   std::vector<Word> finalWords = streamer_->finish(options);
@@ -170,7 +174,16 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
   resetStreamState();
 }
 
-void SpeechToText::streamStop() { isStreaming_ = false; }
+void SpeechToText::streamStop() {
+  {
+    // Publish the flag under the mutex the streaming loop waits on. Without
+    // it, the store + notify can slip between the loop's predicate check and
+    // its wait, losing the wake-up until the timeout expires.
+    std::lock_guard<std::mutex> lock(streamCvMutex_);
+    isStreaming_ = false;
+  }
+  streamCv_.notify_all();
+}
 
 void SpeechToText::streamInsert(std::span<float> waveform) {
   streamer_->insertAudioChunk(waveform);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index 16e94ef88b..adcfd8ae99 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -1,6 +1,8 @@
 #pragma once
 
 #include <atomic>
+#include <condition_variable>
+#include <mutex>
 #include <span>
 #include <string>
 #include <vector>
@@ -54,6 +56,11 @@ class SpeechToText {
   std::unique_ptr<schema::OnlineASR> streamer_ = nullptr;
   std::atomic<bool> isStreaming_ = false;
   std::atomic<bool> readyToProcess_ = false;
+
+  // Lets streamStop() wake the streaming loop immediately instead of
+  // waiting for the next throttling interval to expire.
+  std::mutex streamCvMutex_;
+  std::condition_variable streamCv_;
 };
 
 } // namespace models::speech_to_text

From ef92351be9e3c32327d8a31bb4a8068c7cc3b45e Mon Sep 17 00:00:00 2001
From: IgorSwat <igorswat2002@o2.pl>
Date: Thu, 21 May 2026 09:59:47 +0200
Subject: [PATCH 20/20] docs: simplify & update STT docs

---
 .../useSpeechToText.md                        | 321 ++++++------------
 .../SpeechToTextModule.md                     | 215 +++++-------
 2 files changed, 176 insertions(+), 360 deletions(-)

diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 089b844eb0..dc9f88179c 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -17,20 +17,31 @@ keywords:
 description: "Learn how to use speech-to-text models in your React Native applications with React Native ExecuTorch's useSpeechToText hook."
 ---
 
-Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants.
+Speech to text (STT) converts spoken audio into written text. This hook allows you to implement features like voice assistants, real-time transcription, and audio file processing directly on-device.
 
 :::info
-It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library.
+We recommend using our optimized models available on [Hugging Face](https://huggingface.co/collections/software-mansion/speech-to-text-68d0ec99ed794250491b8bbe). You can also use pre-defined [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) included in the library.
 :::
 
 ## API Reference
 
-- For detailed API Reference for `useSpeechToText` see: [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md).
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
+- [`useSpeechToText` API Reference](../../06-api-reference/functions/useSpeechToText.md)
+- [STT Models List](../../06-api-reference/index.md#models---speech-to-text)
 
-## High Level Overview
+## Basic Usage (File Transcription)
 
-You can obtain waveform from audio in any way most suitable to you, however in the snippet below we utilize [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) library to process a `.mp3` file.
+Use `transcribe` for processing pre-recorded audio or short clips. The input should be a `Float32Array` of audio samples at **16 kHz**.
+
+### Transcribe Options
+
+The `transcribe()` function accepts an optional configuration object:
+
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps).
+
+In this example, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to decode an audio file into the required format.
+
+### Example
 
 ```typescript
 import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
@@ -41,181 +52,46 @@ const model = useSpeechToText({
   model: WHISPER_TINY_EN,
 });
 
+// 1. Get audio file
 const { uri } = await FileSystem.downloadAsync(
   'https://some-audio-url.com/file.mp3',
-  FileSystem.cacheDirectory + 'audio_file'
+  `${FileSystem.cacheDirectory}audio_file`
 );
 
+// 2. Decode to 16kHz PCM Float32Array
 const audioContext = new AudioContext({ sampleRate: 16000 });
 const decodedAudioData = await audioContext.decodeAudioData(uri);
 const audioBuffer = decodedAudioData.getChannelData(0);
 
+// 3. Transcribe
 try {
-  const transcription = await model.transcribe(audioBuffer);
-  console.log(transcription.text);
+  const result = await model.transcribe(audioBuffer);
+  console.log('Transcription:', result.text);
 } catch (error) {
-  console.error('Error during audio transcription', error);
+  console.error('Transcription failed:', error);
 }
 ```
 
-### Streaming
-
-Since speech-to-text models can only process audio segments up to 30 seconds long, we need to split longer inputs into chunks. However, simple chunking may cut speech mid-sentence, making it harder for the model to understand. To address this, we use the [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) algorithm. While this introduces some overhead, it enables accurate processing of audio inputs of arbitrary length.
-
-### Arguments
-
-`useSpeechToText` takes [`SpeechToTextProps`](../../06-api-reference/interfaces/SpeechToTextProps.md) that consists of:
-
-- `model` of type [`SpeechToTextConfig`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md), containing the [`isMultilingual` flag](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual), [tokenizer source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#tokenizersource) and [model source](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource).
-- An optional flag [`preventLoad`](../../06-api-reference/interfaces/SpeechToTextProps.md#preventload) which prevents auto-loading of the model.
-
-You need more details? Check the following resources:
+## Live Streaming Transcription
 
-- For detailed information about `useSpeechToText` arguments check this section: [`useSpeechToText` arguments](../../06-api-reference/functions/useSpeechToText.md#parameters)
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
-- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
-
-### Returns
-
-`useSpeechToText` returns an object called `SpeechToTextType` containing bunch of functions to interact with STT.
-
-Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) functions accept [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) type as an argument. It accepts language abbreviation, you can check them out in [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) property of this config of type [`SpeechToTextLanguage`](../../06-api-reference/type-aliases/SpeechToTextLanguage.md).
-
-To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md).
-
-## Running the model
-
-Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails.
-
-### Multilingual transcription
-
-If you want to transcribe speech in languages other than English, use the multilingual version of Whisper. To generate the output in your desired language, pass the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) option to the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method.
-
-```typescript
-import { useSpeechToText, WHISPER_TINY } from 'react-native-executorch';
+For real-time applications or audio streams of arbitrary length, use the **Streaming API**. This is optimized for live input, handling the 30-second window limitation of Whisper models automatically to ensure context isn't lost between chunks.
 
-const model = useSpeechToText({
-  model: WHISPER_TINY,
-});
+### How it works
 
-const transcription = await model.transcribe(spanishAudio, { language: 'es' });
-```
+1.  **Feed audio**: Use `streamInsert` to push small chunks of audio (e.g., 100ms) as they arrive from the microphone.
+2.  **Get results**: The `stream` generator yields two types of text:
+    - `committed`: Finalized text that won't change.
+    - `nonCommitted`: Temporary text that might update as the model gets more context from the audio.
 
-### Timestamps & Transcription Stat Data
+### Streaming Options
 
-You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References.
+The `stream()` function accepts several optional parameters:
 
-```typescript
-const transcription = await model.transcribe(audioBuffer, { verbose: true });
-// Example result
-//
-// transcription: {
-//   task: "transcription",
-//   text: "Example text for a ...",
-//   duration: 9.05,
-//   language: "en",
-//   segments: [
-//     {
-//       start: 0,
-//       end: 5.4,
-//       text: "Example text for",
-//       words: [
-//         {
-//            word: "Example",
-//            start: 0,
-//            end: 1.4
-//         },
-//         ...
-//       ]
-//       tokens: [1, 32, 45, ...],
-//       temperature: 0.0,
-//       avgLogprob: -1.235,
-//       compressionRatio: 1.632
-//     },
-//     ...
-//   ]
-// }
-```
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
+- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`.
 
-## Example
-
-```tsx
-import React, { useState } from 'react';
-import { Button, Text, View } from 'react-native';
-import {
-  useSpeechToText,
-  WHISPER_TINY_EN,
-  TranscriptionResult,
-} from 'react-native-executorch';
-import { AudioContext } from 'react-native-audio-api';
-import * as FileSystem from 'expo-file-system';
-
-function App() {
-  const model = useSpeechToText({
-    model: WHISPER_TINY_EN,
-  });
-
-  const [transcription, setTranscription] = useState<TranscriptionResult>(null);
-
-  const loadAudio = async () => {
-    const { uri } = await FileSystem.downloadAsync(
-      'https://some-audio-url.com/file.mp3',
-      FileSystem.cacheDirectory + 'audio_file'
-    );
-
-    const audioContext = new AudioContext({ sampleRate: 16000 });
-    const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
-    const audioBuffer = decodedAudioData.getChannelData(0);
-
-    return audioBuffer;
-  };
-
-  const handleTranscribe = async () => {
-    const audio = await loadAudio();
-    // Default text transcription
-    const result = await model.transcribe(audio);
-    setTranscription(result);
-  };
-
-  const handleTranscribeWithTimestamps = async () => {
-    const audio = await loadAudio();
-    // Transcription with timestamps
-    const result = await model.transcribe(audio, { verbose: true });
-    setTranscription(result);
-  };
-
-  // Custom logic for printing transcription
-  // e.g.
-
-  const renderContent = () => {
-    if (!transcription) return <Text>Press a button to transcribe</Text>;
-
-    if (transcription.segments && transcription.segments.length > 0) {
-      return (
-        <Text>
-          {transcription.text +
-            '\n\nNum segments: ' +
-            transcription.segments.length.toString()}
-        </Text>
-      );
-    }
-    return <Text>{transcription.text}</Text>;
-  };
-
-  return (
-    <View>
-      {renderContent()}
-      <Button onPress={handleTranscribe} title="Transcribe (Text)" />
-      <Button
-        onPress={handleTranscribeWithTimestamps}
-        title="Transcribe (Timestamps)"
-      />
-    </View>
-  );
-}
-```
-
-### Streaming transcription
+### Example
 
 ```tsx
 import React, { useEffect, useState, useRef } from 'react';
@@ -223,70 +99,41 @@ import { Text, Button, View, SafeAreaView } from 'react-native';
 import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
 import { AudioManager, AudioRecorder } from 'react-native-audio-api';
 
-export default function App() {
-  const model = useSpeechToText({
-    model: WHISPER_TINY_EN,
-  });
-
-  const [transcribedText, setTranscribedText] = useState('');
-
+export default function LiveTranscriber() {
+  const model = useSpeechToText({ model: WHISPER_TINY_EN });
+  const [text, setText] = useState('');
   const isRecordingRef = useRef(false);
-
   const [recorder] = useState(() => new AudioRecorder());
 
-  useEffect(() => {
-    AudioManager.setAudioSessionOptions({
-      iosCategory: 'playAndRecord',
-      iosMode: 'spokenAudio',
-      iosOptions: ['allowBluetooth', 'defaultToSpeaker'],
-    });
-    AudioManager.requestRecordingPermissions();
-  }, []);
-
-  const handleStartStreamingTranscribe = async () => {
+  const startLiveStreaming = async () => {
     isRecordingRef.current = true;
-    setTranscribedText('');
-
-    const sampleRate = 16000;
+    setText('');
 
+    // 1. Capture microphone input
     recorder.onAudioReady(
-      {
-        sampleRate,
-        bufferLength: 0.1 * sampleRate,
-        channelCount: 1,
-      },
-      (chunk) => {
-        model.streamInsert(chunk.buffer.getChannelData(0));
-      }
+      { sampleRate: 16000, bufferLength: 1600, channelCount: 1 },
+      (chunk) => model.streamInsert(chunk.buffer.getChannelData(0))
     );
 
-    try {
-      await recorder.start();
-    } catch (e) {
-      console.error('Recorder failed:', e);
-      return;
-    }
+    await recorder.start();
 
+    // 2. Process the stream
     try {
-      let accumulatedCommitted = '';
-
+      let finalizedText = '';
       const streamIter = model.stream({ verbose: false });
 
       for await (const { committed, nonCommitted } of streamIter) {
         if (!isRecordingRef.current) break;
 
-        if (committed.text) {
-          accumulatedCommitted += committed.text;
-        }
-
-        setTranscribedText(accumulatedCommitted + nonCommitted.text);
+        if (committed.text) finalizedText += committed.text;
+        setText(finalizedText + nonCommitted.text);
       }
     } catch (error) {
-      console.error('Error during streaming transcription:', error);
+      console.error('Streaming error:', error);
     }
   };
 
-  const handleStopStreamingTranscribe = () => {
+  const stopLiveStreaming = () => {
     isRecordingRef.current = false;
     recorder.stop();
     model.streamStop();
@@ -294,28 +141,58 @@ export default function App() {
 
   return (
     <SafeAreaView>
-      <View style={{ padding: 20 }}>
-        <Text style={{ marginBottom: 20, fontSize: 18 }}>
-          {transcribedText || 'Press start to speak...'}
-        </Text>
-
-        <Button
-          onPress={handleStartStreamingTranscribe}
-          title="Start Streaming"
-          disabled={model.isGenerating}
-        />
-        <View style={{ height: 10 }} />
-        <Button
-          onPress={handleStopStreamingTranscribe}
-          title="Stop Streaming"
-          color="red"
-        />
-      </View>
+      <Text>{text || 'Press start and speak...'}</Text>
+      <Button
+        onPress={startLiveStreaming}
+        title="Start Live"
+        disabled={model.isGenerating}
+      />
+      <Button onPress={stopLiveStreaming} title="Stop" color="red" />
     </SafeAreaView>
   );
 }
 ```
 
+## Advanced Features
+
+### Multilingual Transcription
+
+To transcribe languages other than English, use a multilingual model (e.g., `WHISPER_TINY`) and specify the corresponding language code:
+
+```typescript
+// Transcribe in Spanish
+const result = await model.transcribe(spanishAudio, { language: 'es' });
+```
+
+### Timestamps & Metadata
+
+Set `verbose: true` to receive word-level timestamps and confidence scores. The output follows the OpenAI Whisper `verbose_json` format.
+
+```typescript
+const result = await model.transcribe(audioBuffer, { verbose: true });
+// result.segments[0].words -> [{ word: "Hello", start: 0.5, end: 1.0 }, ...]
+```
+
+## Configuration
+
+### Arguments
+
+`useSpeechToText` accepts a configuration object:
+
+- `model`: Model source and tokenizer settings (see [ModelConfig](../../06-api-reference/interfaces/SpeechToTextModelConfig.md)).
+- `preventLoad`: (Optional) If `true`, the model won't load until you call `load()`.
+
+### Returns
+
+The hook returns an object with:
+
+- `transcribe(audio, options)`: One-shot transcription.
+- `stream(options)`: Async generator for streaming results.
+- `streamInsert(audio)`: Push audio to the stream buffer.
+- `streamStop()`: Finish the current stream.
+- `isGenerating`: Boolean indicating if the model is busy.
+- `loading`: Boolean indicating if the model is being loaded.
+
 ## Supported models
 
 | Model                                                              |   Language   |
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
index d4d8897e7c..f190af8234 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md
@@ -2,47 +2,43 @@
 title: SpeechToTextModule
 ---
 
-TypeScript API implementation of the [useSpeechToText](../../03-hooks/01-natural-language-processing/useSpeechToText.md) hook.
+The `SpeechToTextModule` class provides a direct interface to the library's speech-to-text (STT) capabilities. While [`useSpeechToText`](../../03-hooks/01-natural-language-processing/useSpeechToText.md) is the preferred way for React components, this module offers full control over the model's lifecycle and is suitable for non-React contexts or advanced use cases.
 
 ## API Reference
 
-- For detailed API Reference for `SpeechToTextModule` see: [`SpeechToTextModule` API Reference](../../06-api-reference/classes/SpeechToTextModule.md).
-- For all speech to text models available out-of-the-box in React Native ExecuTorch see: [STT Models](../../06-api-reference/index.md#models---speech-to-text).
+- [`SpeechToTextModule` API Reference](../../06-api-reference/classes/SpeechToTextModule.md)
+- [STT Models List](../../06-api-reference/index.md#models---speech-to-text)
 
 ## High Level Overview
 
+You can transcribe audio in two ways: **one-shot** (for files/short clips) and **streaming** (for live microphone input).
+
 ```typescript
 import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
 
+// Initialize the model
 const model = await SpeechToTextModule.fromModelName(
   WHISPER_TINY_EN,
   (progress) => {
-    console.log(progress);
+    console.log(`Loading: ${progress * 100}%`);
   }
 );
 
-// Standard transcription (returns string)
-const text = await model.transcribe(waveform);
+// 1. One-shot transcription (returns TranscriptionResult)
+const result = await model.transcribe(waveform);
+console.log(result.text);
 
-// Transcription with timestamps (returns Word[])
-const textWithTimestamps = await model.transcribe(waveform, {
-  enableTimestamps: true,
-});
+// 2. Live streaming (yields partial/stable results)
+model.streamInsert(audioChunk);
+const stream = model.stream();
+for await (const { committed, nonCommitted } of stream) {
+  // Update UI live with stable and partial text
+}
 ```
 
-### Methods
-
-All methods of `SpeechToTextModule` are explained in details here: [`SpeechToTextModule API Reference`](../../06-api-reference/classes/SpeechToTextModule.md)
-
-:::info
-
-- `committed` contains the latest part of the transcription that is finalized and will not change. To obtain the full transcription during streaming, concatenate all the `committed` values yielded over time. Useful for displaying stable results during streaming.
-- `nonCommitted` contains the part of the transcription that is still being processed and may change. Useful for displaying live, partial results during streaming.
-  :::
-
 ## Loading the model
 
-Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModule.md#frommodelname) factory method. It accepts an object with the following fields:
+Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModule.md#frommodelname) factory method. It accepts a configuration object with the following fields:
 
 - [`isMultilingual`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#ismultilingual) - Flag indicating if model is multilingual.
 - [`modelSource`](../../06-api-reference/interfaces/SpeechToTextModelConfig.md#modelsource) - The location of the used model (bundled encoder + decoder functionality).
@@ -50,132 +46,73 @@ Use the static [`fromModelName`](../../06-api-reference/classes/SpeechToTextModu
 
 And an optional second argument:
 
-- `onDownloadProgress` - Callback to track download progress.
+- `onDownloadProgress` - Callback to track download progress (called with a value between 0 and 1).
 
-This method returns a promise resolving to a `SpeechToTextModule` instance.
+For more information on resource management, see [loading models](../../01-fundamentals/02-loading-models.md).
 
-For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page.
+## Transcription (Files & Short Clips)
 
-## Running the model
+To run transcription on a complete audio clip, use the [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) method. It accepts a `Float32Array` representing a waveform at **16kHz sampling rate**.
 
-To run the model, you can use the [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) method. It accepts one argument, which is an array of type `Float32Array` representing a waveform at 16kHz sampling rate. The method returns a promise, which can resolve either to an error or a string containing the output text.
+### Transcribe Options
 
-### Multilingual transcription
+The `transcribe()` function accepts an optional configuration object:
 
-If you aim to obtain a transcription in other languages than English, use the multilingual version of whisper. To obtain the output text in your desired language, pass the [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) object with the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) field set to your desired language code.
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, the method returns a detailed `TranscriptionResult` object following the OpenAI Whisper `verbose_json` format (including segments and word-level timestamps).
 
-```typescript
-import { SpeechToTextModule, WHISPER_TINY } from 'react-native-executorch';
+### Multilingual transcription
 
-const model = await SpeechToTextModule.fromModelName(
-  WHISPER_TINY,
-  (progress) => {
-    console.log(progress);
-  }
-);
+If you aim to obtain a transcription in languages other than English, use a multilingual Whisper model. To get the output in your desired language, pass the [`DecodingOptions`](../../06-api-reference/interfaces/DecodingOptions.md) object with the [`language`](../../06-api-reference/interfaces/DecodingOptions.md#language) field set to the target language code.
 
+```typescript
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
-### Timestamps & Transcription Stat Data
+### Timestamps & Detailed Results
 
-You can obtain word-level timestamps and other useful parameters from transcription ([`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe) and [`stream`](../../06-api-reference/classes/SpeechToTextModule.md#stream) methods) by setting `verbose: true` in the options. The result mimics the _verbose_json_ format from OpenAI Whisper API. For more information please read [`transcribe`](../../06-api-reference/classes/SpeechToTextModule.md#transcribe), [`stream`](../../06-api-reference/classes/SpeechToTextModule.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API References.
+Set `verbose: true` in the options to obtain word-level timestamps and other parameters. The result mimics the _verbose_json_ format from the OpenAI Whisper API.
 
 ```typescript
-const transcription = await model.transcribe(audioBuffer, { verbose: true });
-// Example result
-//
-// transcription: {
-//   task: "transcription",
-//   text: "Example text for a ...",
-//   duration: 9.05,
-//   language: "en",
+const result = await model.transcribe(audioBuffer, { verbose: true });
+// Example result:
+// {
+//   text: "Example text...",
 //   segments: [
-//     {
-//       start: 0,
-//       end: 5.4,
-//       text: "Example text for",
-//       words: [
-//         {
-//            word: "Example",
-//            start: 0,
-//            end: 1.4
-//         },
-//         ...
-//       ]
-//       tokens: [1, 32, 45, ...],
-//       temperature: 0.0,
-//       avgLogprob: -1.235,
-//       compressionRatio: 1.632
-//     },
-//     ...
-//   ]
+//     { start: 0, end: 5.4, text: "Example text", words: [...] }
+//   ],
+//   language: "en"
 // }
 ```
 
-## Example
+## Live Streaming Transcription
 
-### Transcription
+The **Streaming API** is optimized for live microphone input or real-time audio feeds. It handles audio inputs of arbitrary length by automatically managing context windows to bypass the standard 30-second limit.
 
-```tsx
-import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
-import { AudioContext } from 'react-native-audio-api';
-import * as FileSystem from 'expo-file-system';
-
-const transcribeAudio = async () => {
-  // Initialize with the model config
-  const model = await SpeechToTextModule.fromModelName(
-    WHISPER_TINY_EN,
-    (progress) => {
-      console.log(progress);
-    }
-  );
-
-  // Download the audio file
-  const { uri } = await FileSystem.downloadAsync(
-    'https://some-audio-url.com/file.mp3',
-    FileSystem.cacheDirectory + 'audio_file'
-  );
-
-  // Decode the audio data (Correct as per your previous code)
-  const audioContext = new AudioContext({ sampleRate: 16000 });
-  const decodedAudioData = await audioContext.decodeAudioData(uri);
-  const audioBuffer = decodedAudioData.getChannelData(0);
-
-  // Transcribe the audio
-  try {
-    // Option 1: Text only
-    const resultText = await model.transcribe(audioBuffer);
-    console.log('Text:', resultText.text); // .text is the standard property now
-
-    // Option 2: With timestamps (Use 'verbose' instead of 'enableTimestamps')
-    const resultVerbose = await model.transcribe(audioBuffer, {
-      verbose: true,
-    });
-
-    console.log('Full Text:', resultVerbose.text);
-    console.log('Segments:', resultVerbose.segments); // Contains start/end/more parameters
-  } catch (error) {
-    console.error('Error during audio transcription', error);
-  }
-};
-```
+### Streaming Options
+The `stream()` function accepts several optional parameters:
+
+- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
+- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
+- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks. Lower values provide more frequent updates, while higher values reduce CPU consumption. Defaults to `100`.
+
+:::info
+
+- **`committed`**: Finalized transcription that is stable and will not change. Useful for building a persistent transcript record.
+- **`nonCommitted`**: Partial transcription that is still being processed and may update as more context arrives. Useful for live UI updates.
+  :::
+
+### Live Example
 
-### Streaming Transcription
+In this example, we use [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) to feed live audio into the model.
 
 ```tsx
 import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
 import { AudioManager, AudioRecorder } from 'react-native-audio-api';
 
-// Load the model
-const model = await SpeechToTextModule.fromModelName(
-  WHISPER_TINY_EN,
-  (progress) => {
-    console.log(progress);
-  }
-);
+const model = await SpeechToTextModule.fromModelName(WHISPER_TINY_EN);
 
-// Configure audio session
+// 1. Configure audio session & permissions
 AudioManager.setAudioSessionOptions({
   iosCategory: 'playAndRecord',
   iosMode: 'spokenAudio',
@@ -183,44 +120,46 @@ AudioManager.setAudioSessionOptions({
 });
 await AudioManager.requestRecordingPermissions();
 
-// Initialize audio recorder with FULL config in constructor
+// 2. Setup Audio Recorder
 const recorder = new AudioRecorder({
   sampleRate: 16000,
   channelCount: 1,
-  bitsPerSample: 16,
-  bufferLengthInSamples: 16000, // e.g. 1 second buffer
 });
 
-// Pass ONLY the callback to onAudioReady
 recorder.onAudioReady((chunk) => {
-  // Insert the audio into the streaming transcription
+  // Feed chunks directly into the model's buffer
   model.streamInsert(chunk.buffer.getChannelData(0));
 });
 
 await recorder.start();
 
-// Start streaming transcription
+// 3. Process the Stream
 try {
-  let finalTranscription = '';
-
-  // Use 'verbose' flag for timestamps/segments
-  const streamIter = model.stream({ verbose: true });
+  let stableTranscript = '';
+  const streamIter = model.stream({ verbose: false });
 
   for await (const { committed, nonCommitted } of streamIter) {
-    // Note: committed/nonCommitted are objects { text, segments } now
-    console.log('Committed Text:', committed.text);
-    console.log('Live Text:', nonCommitted.text);
+    if (committed.text) stableTranscript += committed.text;
 
-    if (committed.text) {
-      finalTranscription += committed.text;
-    }
+    // UI should display: stableTranscript + nonCommitted.text
+    console.log('Live Transcript:', stableTranscript + nonCommitted.text);
   }
-  console.log('Final transcription:', finalTranscription);
 } catch (error) {
-  console.error('Error during streaming transcription:', error);
+  console.error('Streaming error:', error);
 }
 
-// Stop streaming transcription
+// 4. Cleanup
 model.streamStop();
 recorder.stop();
 ```
+
+## Supported models
+
+| Model                                                              |   Language   |
+| ------------------------------------------------------------------ | :----------: |
+| [whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en)   |   English    |
+| [whisper-tiny](https://huggingface.co/openai/whisper-tiny)         | Multilingual |
+| [whisper-base.en](https://huggingface.co/openai/whisper-base.en)   |   English    |
+| [whisper-base](https://huggingface.co/openai/whisper-base)         | Multilingual |
+| [whisper-small.en](https://huggingface.co/openai/whisper-small.en) |   English    |
+| [whisper-small](https://huggingface.co/openai/whisper-small)       | Multilingual |