93 changes: 84 additions & 9 deletions docs/docs/natural-language-processing/useSpeechToText.md


21 changes: 14 additions & 7 deletions docs/docs/typescript-api/SpeechToTextModule.md
@@ -37,20 +37,27 @@ const transcribedText = await SpeechToTextModule.transcribe(waveform);

### Methods

| Method                | Type                                                                                                                                                                                                                                                                                                       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `load`                | <code>(modelName: 'whisper' &#124; 'moonshine' &#124; 'whisperMultilingual', transcribeCallback?: (sequence: string) => void, modelDownloadProgressCallback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource)</code> | Loads the model specified by `modelName`. `encoderSource`, `decoderSource`, and `tokenizerSource` are strings specifying the locations of the model binaries. `modelDownloadProgressCallback` lets you monitor the progress of the model download, while `transcribeCallback` is invoked with each generated token.                                                                                                                                                                                                                                                                                                                                                              |
| `transcribe`          | `(waveform: number[], audioLanguage?: SpeechToTextLanguage): Promise<string>`                                                                                                                                                                                                                              | Starts a transcription process for the given input array, which should be a waveform sampled at 16 kHz. Resolves with the output transcription when the model is finished. For multilingual models you have to pass `audioLanguage`, the language spoken in the audio.                                                                                                                                                                                                                                                                                                                                                                                                           |
| `streamingTranscribe` | `(streamingAction: STREAMING_ACTION, waveform?: number[], audioLanguage?: SpeechToTextLanguage) => Promise<string>`                                                                                                                                                                                        | Runs the transcription process online, i.e. when the whole audio is not known beforehand, such as when transcribing a live microphone feed. `streamingAction` defines the type of packet sent to the model: <ul><li>`START` - initializes the process and allows optional `waveform` data,</li><li>`DATA` - should contain consecutive audio chunks sampled at 16 kHz,</li><li>`STOP` - the last data chunk for this transcription; ends the process and flushes internal buffers.</li></ul> Each call returns the most recent transcription. Returns an error when called while the module is busy (e.g. processing a `transcribe` call). See the sketch after the type definitions below. |
| `encode`              | `(waveform: number[]) => Promise<number[]>`                                                                                                                                                                                                                                                                | Runs the encoder part of the model. Returns a float array representing the encoder output.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
| `decode`              | `(tokens: number[], encodings?: number[]) => Promise<number[]>`                                                                                                                                                                                                                                            | Runs the decoder of the model. Returns a single token, the next token in the output sequence. If `encodings` are provided they are used for the decoding process; otherwise the cached encodings from the most recent `encode` call are used. The cached option is much faster due to the large overhead of communication between the native and React layers.                                                                                                                                                                                                                                                                                                                   |
| `configureStreaming`  | <code>(overlapSeconds?: number, windowSize?: number, streamingConfig?: 'fast' &#124; 'balanced' &#124; 'quality') => void</code>                                                                                                                                                                           | Configures the streaming algorithm: <ul><li>`overlapSeconds` determines how much adjacent audio chunks overlap (increasing it slows down transcription and decreases the probability of odd wording at chunk intersections; values larger than 3 seconds are generally discouraged),</li><li>`windowSize` sets the size of the audio chunks (increasing it speeds up end-to-end transcription but increases latency until the first token is returned),</li><li>`streamingConfig` selects predefined values for `windowSize` and `overlapSeconds`.</li></ul> Keep `windowSize + 2 * overlapSeconds <= 30`.                                                                        |

<details>
<summary>Type definitions</summary>

```typescript
type ResourceSource = string | number | object;

enum STREAMING_ACTION {
  START,
  DATA,
  STOP,
}

enum SpeechToTextLanguage {
  Afrikaans = 'af',
  Albanian = 'sq',
  // ... (remaining languages truncated in this diff)
}
```

</details>
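
To make the `START`/`DATA`/`STOP` protocol concrete, here is a minimal sketch of an online transcription loop. It assumes `SpeechToTextModule` and `STREAMING_ACTION` are exported by `react-native-executorch`, and it uses a hypothetical `getNextMicChunk` helper (not part of the library) that resolves with the next 16 kHz audio chunk, or `null` once recording stops.

```typescript
import {
  SpeechToTextModule,
  STREAMING_ACTION,
} from 'react-native-executorch';

// Hypothetical helper: yields consecutive 16 kHz PCM chunks from the
// microphone, or null when the user stops recording.
declare function getNextMicChunk(): Promise<number[] | null>;

async function transcribeLiveFeed(): Promise<string> {
  await SpeechToTextModule.load('moonshine');

  // Optional: 1 s overlap, 7 s window (7 + 2 * 1 <= 30 holds).
  SpeechToTextModule.configureStreaming(1, 7);

  // START initializes the streaming session.
  await SpeechToTextModule.streamingTranscribe(STREAMING_ACTION.START);

  let chunk = await getNextMicChunk();
  while (chunk !== null) {
    // Each DATA call feeds one consecutive chunk and resolves with the
    // most recent transcription so far.
    const partial = await SpeechToTextModule.streamingTranscribe(
      STREAMING_ACTION.DATA,
      chunk
    );
    console.log('partial transcription:', partial);
    chunk = await getNextMicChunk();
  }

  // STOP flushes internal buffers, ends the session, and resolves with
  // the final transcription.
  return SpeechToTextModule.streamingTranscribe(STREAMING_ACTION.STOP);
}
```

Since `streamingTranscribe` returns an error while the module is busy, the loop awaits each call before feeding the next chunk.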
7 changes: 6 additions & 1 deletion examples/llm/App.tsx
@@ -14,9 +14,11 @@ import {
} from 'react-native';
import LLMScreen from './screens/LLMScreen';
import LLMToolCallingScreen from './screens/LLMToolCallingScreen';
import VoiceChatScreen from './screens/VoiceChatScreen';

enum Mode {
LLM,
LLM_VOICE_CHAT,
LLM_TOOL_CALLING,
}

@@ -39,6 +41,9 @@ export default function App() {
case Mode.LLM:
return <LLMScreen setIsGenerating={setIsGenerating} />;

case Mode.LLM_VOICE_CHAT:
return <VoiceChatScreen setIsGenerating={setIsGenerating} />;

case Mode.LLM_TOOL_CALLING:
return <LLMToolCallingScreen setIsGenerating={setIsGenerating} />;

@@ -61,7 +66,7 @@
{!isGenerating ? (
<View style={styles.wheelPickerContainer}>
<ScrollPicker
dataSource={['Chat with LLM', 'Tool calling']}
dataSource={['Chat with LLM', 'Talk to LLM', 'Tool calling']}
onValueChange={(_, selectedIndex) => {
handleModeChange(selectedIndex);
}}
4 changes: 4 additions & 0 deletions examples/llm/assets/icons/mic_icon.svg
4 changes: 4 additions & 0 deletions examples/llm/assets/icons/stop_icon.svg