software-mansion · mkopcins · Apr 23, 2026 · Apr 24, 2026 · Apr 30, 2026 · May 21, 2026
diff --git a/apps/llm/app/llm/index.tsx b/apps/llm/app/llm/index.tsx
@@ -11,7 +11,7 @@ import {
   View,
 } from 'react-native';
 import SendIcon from '../../assets/icons/send_icon.svg';
-import { useLLM, LLAMA3_2_1B_SPINQUANT } from 'react-native-executorch';
+import { useLLM, QWEN3_0_6B_QUANTIZED } from 'react-native-executorch';
 import { ModelPicker } from '../../components/ModelPicker';
 import { LLM_MODELS, LLMModelSources } from '../../components/llmModels';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
@@ -42,9 +42,8 @@ function LLMScreen() {
   const { bottom } = useSafeAreaInsets();
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
   const [userInput, setUserInput] = useState('');
-  const [selectedModel, setSelectedModel] = useState<LLMModelSources>(
-    LLAMA3_2_1B_SPINQUANT
-  );
+  const [selectedModel, setSelectedModel] =
+    useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
@@ -76,6 +75,7 @@ function LLMScreen() {
     }
   };
 
+  // Debug aid only: this runs on every render, so keep the message-history
+  // dump out of release builds.
+  if (__DEV__) console.log(llm.messageHistory);
   return !llm.isReady && !llm.error ? (
     <Spinner
       visible={true}

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
@@ -12,6 +12,11 @@ import {
   View,
 } from 'react-native';
 import { launchImageLibrary } from 'react-native-image-picker';
+import {
+  AudioManager,
+  AudioRecorder,
+  AudioContext,
+} from 'react-native-audio-api';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
 import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
@@ -29,6 +34,7 @@ const SUGGESTED_PROMPTS = [
   'Describe this scene in detail',
   'What objects can you see?',
   'What text appears in this image?',
+  'Transcribe the audio?',
 ];
 import { useLLMStats } from '../../hooks/useLLMStats';
 import { StatsBar } from '../../components/StatsBar';
@@ -46,7 +52,15 @@ function MultimodalLLMScreen() {
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
-  // Added error state
+  // Pending audio attachment: the PCM samples taken from channel 0 and the
+  // label rendered for them in the attachment strip.
+  const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
+  const [audioLabel, setAudioLabel] = useState<string | null>(null);
+  // Text of the "Audio URL" field and whether that URL is being decoded.
+  const [audioUrl, setAudioUrl] = useState('');
+  const [isFetchingAudio, setIsFetchingAudio] = useState(false);
+  // Microphone capture state.
+  const [isRecording, setIsRecording] = useState(false);
+  const [hasMicPermission, setHasMicPermission] = useState(false);
+  // `useRef(new AudioRecorder())` would construct a throwaway recorder on every
+  // render: the argument is evaluated each time even though only the first
+  // value is kept. The lazy state initializer below runs exactly once, and the
+  // ref keeps the `recorder.current` access pattern used by the handlers.
+  const [initialRecorder] = useState(() => new AudioRecorder());
+  const recorder = useRef(initialRecorder);
+  // Chunks pushed by the recorder callback; merged when recording stops.
+  const recordChunks = useRef<Float32Array[]>([]);
+
   const [error, setError] = useState<string | null>(null);
 
   const vlm = useLLM({
@@ -68,6 +82,87 @@ function MultimodalLLMScreen() {
     if (vlm.error) setError(String(vlm.error));
   }, [vlm.error]);
 
+  // On mount: configure the iOS audio session for playback and recording, then
+  // ask for microphone permission.
+  useEffect(() => {
+    // Prevents state updates if the screen unmounts while the permission
+    // request is still pending.
+    let cancelled = false;
+    AudioManager.setAudioSessionOptions({
+      iosCategory: 'playAndRecord',
+      iosMode: 'spokenAudio',
+      iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
+    });
+    (async () => {
+      try {
+        const status = await AudioManager.requestRecordingPermissions();
+        if (!cancelled) setHasMicPermission(status === 'Granted');
+      } catch (e) {
+        // Surface the failure instead of leaving an unhandled rejection; the
+        // permission flag simply stays false.
+        if (!cancelled) setError(e instanceof Error ? e.message : String(e));
+      }
+    })();
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  /**
+   * Decodes the audio at the URL typed by the user and stores channel 0 as the
+   * pending audio attachment. Decoding errors are reported through `setError`.
+   */
+  const loadAudioFromUrl = async () => {
+    const url = audioUrl.trim();
+    if (!url) return;
+    const sampleRate = 16000;
+    setIsFetchingAudio(true);
+    // Declared outside `try` so the context can be released in `finally`.
+    let ctx: AudioContext | null = null;
+    try {
+      ctx = new AudioContext({ sampleRate });
+      const decoded = await ctx.decodeAudioData(url);
+      // Copy the samples (as the recording path does) so the attachment does
+      // not keep referencing the decoded buffer once the context is closed.
+      const pcm = new Float32Array(decoded.getChannelData(0));
+      const name = url.split('/').pop() || 'audio';
+      setAudioBuffer(pcm);
+      setAudioLabel(`${name} · ${(pcm.length / sampleRate).toFixed(1)}s`);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setIsFetchingAudio(false);
+      // A context was previously created per load and never closed. Closing is
+      // best effort: a failure here must not hide a decoding error.
+      try {
+        await ctx?.close();
+      } catch {
+        // Nothing useful to report; the attachment state is already settled.
+      }
+    }
+  };
+
+  /**
+   * Begins microphone capture (16000 Hz, one channel, 0.1 s buffers). Each
+   * delivered buffer is copied into `recordChunks`. Any failure is reported
+   * through `setError` and leaves `isRecording` untouched.
+   */
+  const startRecording = async () => {
+    if (!hasMicPermission) {
+      setError('Microphone permission denied. Please enable it in Settings.');
+      return;
+    }
+
+    // Start from a clean slate before the callback can deliver anything.
+    recordChunks.current = [];
+    const sampleRate = 16000;
+    recorder.current.onAudioReady(
+      { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
+      (event) => {
+        const samples = event.buffer.getChannelData(0);
+        // Copy the samples so each stored chunk owns its data.
+        recordChunks.current.push(new Float32Array(samples));
+      }
+    );
+
+    try {
+      const sessionActive = await AudioManager.setAudioSessionActivity(true);
+      if (!sessionActive) {
+        setError('Cannot start audio session');
+        return;
+      }
+      const startResult = recorder.current.start();
+      if (startResult.status !== 'error') {
+        setIsRecording(true);
+        return;
+      }
+      setError(`Recording problems: ${startResult.message}`);
+    } catch (e) {
+      const message = e instanceof Error ? e.message : String(e);
+      setError(message);
+    }
+  };
+
+  /**
+   * Stops the recorder and merges the collected chunks into one Float32Array
+   * that becomes the pending audio attachment. If no samples were captured,
+   * the attachment state is left as it was.
+   */
+  const stopRecording = () => {
+    recorder.current.stop();
+    setIsRecording(false);
+
+    const chunks = recordChunks.current;
+    let sampleCount = 0;
+    for (const chunk of chunks) {
+      sampleCount += chunk.length;
+    }
+    if (sampleCount === 0) return;
+
+    // Lay the chunks out back to back in a single buffer.
+    const merged = new Float32Array(sampleCount);
+    chunks.reduce((offset, chunk) => {
+      merged.set(chunk, offset);
+      return offset + chunk.length;
+    }, 0);
+
+    recordChunks.current = [];
+    setAudioBuffer(merged);
+    setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
+  };
+
+  // Discards the pending audio attachment: both its label and its samples.
+  const clearAudio = () => {
+    setAudioLabel(null);
+    setAudioBuffer(null);
+  };
+
   const pickImage = async () => {
     try {
       const result = await launchImageLibrary({ mediaType: 'photo' });
@@ -81,19 +176,27 @@ function MultimodalLLMScreen() {
   };
 
   const sendMessage = async () => {
-    if (!userInput.trim() || vlm.isGenerating) return;
+    if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
+      return;
     onMessageSend();
     const text = userInput.trim();
     setUserInput('');
     textInputRef.current?.clear();
     Keyboard.dismiss();
     const currentImageUri = imageUri;
+    const currentAudio = audioBuffer;
     setImageUri(null);
+    setAudioBuffer(null);
+    setAudioLabel(null);
     try {
-      await vlm.sendMessage(
-        text,
-        currentImageUri ? { imagePath: currentImageUri } : undefined
-      );
+      const media =
+        currentImageUri || currentAudio
+          ? {
+              ...(currentImageUri ? { imagePath: currentImageUri } : {}),
+              ...(currentAudio ? { audioBuffer: currentAudio } : {}),
+            }
+          : undefined;
+      await vlm.sendMessage(text, media);
     } catch (e) {
       // Updated to set UI error instead of just console.error
       setError(e instanceof Error ? e.message : String(e));
@@ -159,6 +262,42 @@ function MultimodalLLMScreen() {
             </TouchableOpacity>
           )}
 
+          {/* Audio URL input */}
+          <View style={styles.audioUrlRow}>
+            <TextInput
+              placeholder="Audio URL (mp3/wav/…)"
+              placeholderTextColor="#C1C6E5"
+              style={styles.audioUrlInput}
+              value={audioUrl}
+              onChangeText={setAudioUrl}
+              autoCapitalize="none"
+              autoCorrect={false}
+            />
+            <TouchableOpacity
+              style={[
+                styles.audioUrlButton,
+                (!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
+                  styles.disabled,
+              ]}
+              onPress={loadAudioFromUrl}
+              disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
+            >
+              <Text style={styles.audioUrlButtonText}>
+                {isFetchingAudio ? '…' : 'Load'}
+              </Text>
+            </TouchableOpacity>
+          </View>
+
+          {/* Audio attachment strip */}
+          {audioLabel && (
+            <View style={styles.audioAttachmentContainer}>
+              <Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
+              <TouchableOpacity onPress={clearAudio}>
+                <Text style={styles.audioAttachmentClear}>✕</Text>
+              </TouchableOpacity>
+            </View>
+          )}
+
           <StatsBar stats={stats} />
           <View
             style={[
@@ -178,6 +317,17 @@ function MultimodalLLMScreen() {
               <Text style={styles.imageButtonText}>📷</Text>
             </TouchableOpacity>
 
+            {/* Mic record / stop button */}
+            <TouchableOpacity
+              style={styles.imageButton}
+              onPress={isRecording ? stopRecording : startRecording}
+              disabled={vlm.isGenerating}
+            >
+              <Text style={styles.imageButtonText}>
+                {isRecording ? '⏹️' : '🎤'}
+              </Text>
+            </TouchableOpacity>
+
             <TextInput
               autoCorrect={false}
               ref={textInputRef}
@@ -198,14 +348,15 @@ function MultimodalLLMScreen() {
               onChangeText={setUserInput}
             />
 
-            {userInput.trim() && !vlm.isGenerating && (
-              <TouchableOpacity
-                style={styles.sendChatTouchable}
-                onPress={sendMessage}
-              >
-                <SendIcon height={24} width={24} padding={4} margin={8} />
-              </TouchableOpacity>
-            )}
+            {(imageUri || audioBuffer || userInput.trim()) &&
+              !vlm.isGenerating && (
+                <TouchableOpacity
+                  style={styles.sendChatTouchable}
+                  onPress={sendMessage}
+                >
+                  <SendIcon height={24} width={24} padding={4} margin={8} />
+                </TouchableOpacity>
+              )}
             {vlm.isGenerating && (
               <TouchableOpacity
                 style={styles.sendChatTouchable}
@@ -319,6 +470,64 @@ const styles = StyleSheet.create({
     fontFamily: 'regular',
     color: ColorPalette.blueDark,
   },
+  audioAttachmentContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    paddingHorizontal: 16,
+    paddingVertical: 8,
+    marginHorizontal: 16,
+    marginBottom: 4,
+    borderRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    backgroundColor: '#fafbff',
+  },
+  audioAttachmentText: {
+    fontSize: 13,
+    fontFamily: 'regular',
+    color: ColorPalette.blueDark,
+  },
+  audioAttachmentClear: {
+    fontSize: 16,
+    color: ColorPalette.blueDark,
+    paddingHorizontal: 8,
+  },
+  audioUrlRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginHorizontal: 16,
+    marginBottom: 4,
+  },
+  audioUrlInput: {
+    flex: 1,
+    padding: 10,
+    borderTopLeftRadius: 8,
+    borderBottomLeftRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderRightWidth: 0,
+    fontFamily: 'regular',
+    fontSize: 13,
+    color: ColorPalette.primary,
+  },
+  audioUrlButton: {
+    paddingVertical: 10,
+    paddingHorizontal: 16,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderTopRightRadius: 8,
+    borderBottomRightRadius: 8,
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  audioUrlButtonText: {
+    color: '#fff',
+    fontFamily: 'medium',
+    fontSize: 13,
+  },
+  disabled: {
+    opacity: 0.5,
+  },
   bottomContainer: {
     height: 100,
     width: '100%',

diff --git a/apps/llm/components/llmModels.ts b/apps/llm/components/llmModels.ts
@@ -57,6 +57,7 @@ export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
   { label: 'Qwen3 0.6B', value: QWEN3_0_6B },
   { label: 'Qwen3 0.6B Quantized', value: QWEN3_0_6B_QUANTIZED },
   { label: 'Qwen3 1.7B', value: QWEN3_1_7B },
+  { label: 'Gemma4 e2b Quantized', value: GEMMA4_E2B_QUANTIZED },
   { label: 'Qwen3 1.7B Quantized', value: QWEN3_1_7B_QUANTIZED },
   { label: 'Qwen3 4B', value: QWEN3_4B },
   { label: 'Qwen3 4B Quantized', value: QWEN3_4B_QUANTIZED },

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -223,6 +223,22 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
   return getArrayAsVector<float>(val, runtime);
 }
 
+// Converts a JS array whose elements are float typed arrays into a vector of
+// float vectors. Each element's contents are copied into its own inner vector.
+template <>
+inline std::vector<std::vector<float>>
+getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
+                                          jsi::Runtime &runtime) {
+  jsi::Array outer = val.asObject(runtime).asArray(runtime);
+  const size_t count = outer.size(runtime);
+  std::vector<std::vector<float>> rows;
+  rows.reserve(count);
+  size_t index = 0;
+  while (index < count) {
+    auto samples = getTypedArrayAsSpan<float>(
+        outer.getValueAtIndex(runtime, index), runtime);
+    rows.emplace_back(samples.begin(), samples.end());
+    ++index;
+  }
+  return rows;
+}
+
 template <>
 inline std::vector<int64_t>
 getValue<std::vector<int64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {