From 95efba2e046d36bcaff37595668ce4fbb307217b Mon Sep 17 00:00:00 2001
From: Chris Hasson
Date: Thu, 18 Dec 2025 15:50:29 -0800
Subject: [PATCH 1/6] Remove cache from speech-to-text availability check
- Remove cachedResult variable and CACHE_DURATION_MS constant
- Remove cache checking logic and forceRecheck parameter
- Always perform fresh check without caching
- Update function documentation to reflect no caching behavior
Remove speechToTextStatus from getStateToPostToWebview
- Remove expensive checkSpeechToTextAvailable call from state generation
- Remove speechToTextStatus from returned state object
- Remove unused import
- Status will now be fetched on-demand instead of on every state update
Add speechToTextStatus field to ExtensionMessage interface
- Add field for speechToTextStatusResponse message type
- Place near other STT-related fields for consistency
Add checkSpeechToTextAvailable message handler
- Add handler case for checkSpeechToTextAvailable message type
- Dynamically import checkSpeechToTextAvailable function
- Send speechToTextStatusResponse with status to webview
- Place handler near other STT-related handlers
Update ChatTextArea to use local state for speechToTextStatus
- Remove speechToTextStatus from useExtensionState() destructuring
- Add local useState for speechToTextStatus
- Add useEffect to request STT check on mount (only if experiment enabled)
- Add message handler for speechToTextStatusResponse to update local state
Add onMouseEnter prop to MicrophoneButton component
- Add onMouseEnter optional prop to MicrophoneButtonProps interface
- Pass onMouseEnter handler to button element
- Enables hover-triggered STT availability checks
Implement hover handler for microphone button
- Add handleMicrophoneHover callback that requests STT availability check
- Connect handler to MicrophoneButton onMouseEnter prop
- Enables real-time status updates when user hovers over microphone icon
Rename checkSpeechToTextAvailable to stt:checkAvailability
- Rename message type to follow stt: prefix convention
- Update handler case in webviewMessageHandler
- Update message calls in ChatTextArea (useEffect and hover handler)
- Consistent with other STT events (stt:start, stt:stop, stt:cancel)
Rename speechToTextStatusResponse to stt:statusResponse
- Rename response message type to follow stt: prefix convention
- Update handler in webviewMessageHandler
- Update message listener in ChatTextArea
- Consistent with other STT message types
Move stt:statusResponse handler to separate useEvent block
- Separate STT status handler from TTS handlers
- Add comment explaining it's separate from recording events in useSTT hook
- Remove debug console.log from useEffect
- Better organization: TTS handlers, then STT status handler
Create STTSetupPopover component and update translations
- Create new STTSetupPopover component for interactive STT setup help
- Update English translations: simplify FFmpeg message, add popover strings
- Update Arabic translations: add popover strings
- Component shows error message and help actions based on availability reason
Update FFmpeg help to use Trans component with clickable link
- Change FFmpeg help from Button to Trans component with VSCodeLink
- Only 'Click here' text is clickable, rest is plain text
- Update translation strings to include placeholder
- Matches pattern used in other components like TelemetryBanner
Update MicrophoneButton to remove tooltipContent and onMouseEnter props
- Remove tooltipContent prop (no longer needed for disabled state)
- Remove onMouseEnter prop (hover check removed)
- Button is always clickable (removed disabled attribute)
- Keep visual disabled styling via className when disabled prop is true
Integrate STTSetupPopover into ChatTextArea
- Add popover state management
- Update handleMicrophoneClick to open popover when STT unavailable
- Remove handleMicrophoneHover (no longer needed)
- Wrap MicrophoneButton with STTSetupPopover
- Implement handleFfmpegHelpClick to send help message to chat
- Remove tooltipContent and onMouseEnter props from MicrophoneButton usage
Improve the both-unavailable case in STTSetupPopover
- Show both FFmpeg and OpenAI help options when both are missing
- Display help actions in a flex column layout
- Provides complete setup guidance for users missing both requirements
Add optimistic UI updates for microphone button and fix text clearing issue
- Add optimistic state management in useSTT hook for immediate UI feedback
- Update microphone button icon immediately on click without waiting for backend
- Fix text clearing issue by keeping live transcript visible after stop until onComplete
- Sync optimistic state with backend state when events arrive
- Remove debug logging while keeping optimistic functionality
Enhance STTSetupPopover with detailed setup guidance and translations
- Introduce STTSetupPopoverContent for improved structure and clarity
- Update translations for multiple languages to include detailed setup instructions
- Modify popover behavior to show relevant help based on missing requirements (FFmpeg and OpenAI)
- Remove redundant code and streamline the component for better maintainability
---
.../stories/STTSetupPopover.stories.tsx | 42 ++++++
src/core/webview/ClineProvider.ts | 10 --
src/core/webview/speechToTextCheck.ts | 23 +---
src/core/webview/webviewMessageHandler.ts | 11 +-
src/shared/ExtensionMessage.ts | 2 +
src/shared/WebviewMessage.ts | 1 +
.../src/components/chat/ChatTextArea.tsx | 72 +++++++---
.../src/components/chat/MicrophoneButton.tsx | 7 +-
.../src/components/chat/STTSetupPopover.tsx | 127 ++++++++++++++++++
webview-ui/src/hooks/useSTT.ts | 19 ++-
webview-ui/src/i18n/locales/ar/kilocode.json | 16 ++-
webview-ui/src/i18n/locales/ca/kilocode.json | 10 +-
webview-ui/src/i18n/locales/cs/kilocode.json | 10 +-
webview-ui/src/i18n/locales/de/kilocode.json | 10 +-
webview-ui/src/i18n/locales/en/kilocode.json | 10 +-
webview-ui/src/i18n/locales/es/kilocode.json | 10 +-
webview-ui/src/i18n/locales/fr/kilocode.json | 10 +-
webview-ui/src/i18n/locales/hi/kilocode.json | 14 +-
webview-ui/src/i18n/locales/id/kilocode.json | 12 +-
webview-ui/src/i18n/locales/it/kilocode.json | 12 +-
webview-ui/src/i18n/locales/ja/kilocode.json | 10 +-
webview-ui/src/i18n/locales/ko/kilocode.json | 10 +-
webview-ui/src/i18n/locales/nl/kilocode.json | 12 +-
webview-ui/src/i18n/locales/pl/kilocode.json | 10 +-
.../src/i18n/locales/pt-BR/kilocode.json | 10 +-
webview-ui/src/i18n/locales/ru/kilocode.json | 10 +-
webview-ui/src/i18n/locales/th/kilocode.json | 14 +-
webview-ui/src/i18n/locales/tr/kilocode.json | 10 +-
webview-ui/src/i18n/locales/uk/kilocode.json | 10 +-
webview-ui/src/i18n/locales/vi/kilocode.json | 12 +-
.../src/i18n/locales/zh-CN/kilocode.json | 10 +-
.../src/i18n/locales/zh-TW/kilocode.json | 10 +-
32 files changed, 422 insertions(+), 134 deletions(-)
create mode 100644 apps/storybook/stories/STTSetupPopover.stories.tsx
create mode 100644 webview-ui/src/components/chat/STTSetupPopover.tsx
diff --git a/apps/storybook/stories/STTSetupPopover.stories.tsx b/apps/storybook/stories/STTSetupPopover.stories.tsx
new file mode 100644
index 00000000000..c44e7d22c04
--- /dev/null
+++ b/apps/storybook/stories/STTSetupPopover.stories.tsx
@@ -0,0 +1,42 @@
+import type { Meta, StoryObj } from "@storybook/react-vite"
+import { STTSetupPopoverContent } from "@/components/chat/STTSetupPopover"
+
+const meta = {
+ title: "Components/STTSetupPopover",
+ component: STTSetupPopoverContent,
+ parameters: {
+ layout: "centered",
+ },
+ tags: ["autodocs"],
+ render: (args) => (
+
+
+
+ ),
+ args: {
+ onFfmpegHelpClick: () => {
+ console.log("FFmpeg help clicked")
+ },
+ },
+} satisfies Meta
+
+export default meta
+type Story = StoryObj
+
+export const FFmpegNotInstalled: Story = {
+ name: "FFmpeg not installed",
+ args: {
+ reason: "ffmpegNotInstalled",
+ onFfmpegHelpClick: () => {
+ console.log("FFmpeg help clicked")
+ },
+ },
+}
+
+export const OpenAIKeyMissing: Story = {
+ args: {
+ reason: "openaiKeyMissing",
+ },
+}
+
+export const BothMissing: Story = {}
diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts
index 3640411b129..655a9f7bd79 100644
--- a/src/core/webview/ClineProvider.ts
+++ b/src/core/webview/ClineProvider.ts
@@ -96,7 +96,6 @@ import { Task } from "../task/Task"
import { getSystemPromptFilePath } from "../prompts/sections/custom-system-prompt"
import { webviewMessageHandler } from "./webviewMessageHandler"
-import { checkSpeechToTextAvailable } from "./speechToTextCheck" // kilocode_change
import type { ClineMessage, TodoItem } from "@roo-code/types"
import { readApiMessages, saveApiMessages, saveTaskMessages } from "../task-persistence"
import { readTaskMessages } from "../task-persistence/taskMessages"
@@ -2215,14 +2214,6 @@ ${prompt}
: undefined
// kilocode_change end
- // kilocode_change start - checkSpeechToTextAvailable (only when experiment enabled)
- let speechToTextStatus: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } | undefined =
- undefined
- if (experiments?.speechToText) {
- speechToTextStatus = await checkSpeechToTextAvailable(this.providerSettingsManager)
- }
- // kilocode_change end - checkSpeechToTextAvailable
-
let cloudOrganizations: CloudOrganizationMembership[] = []
try {
@@ -2447,7 +2438,6 @@ ${prompt}
featureRoomoteControlEnabled,
virtualQuotaActiveModel, // kilocode_change: Include virtual quota active model in state
debug: vscode.workspace.getConfiguration(Package.name).get("debug", false),
- speechToTextStatus, // kilocode_change: Speech-to-text availability status with failure reason
}
}
diff --git a/src/core/webview/speechToTextCheck.ts b/src/core/webview/speechToTextCheck.ts
index 9279df24691..f5ca5d7ce84 100644
--- a/src/core/webview/speechToTextCheck.ts
+++ b/src/core/webview/speechToTextCheck.ts
@@ -11,13 +11,6 @@ export type SpeechToTextAvailabilityResult = {
reason?: "openaiKeyMissing" | "ffmpegNotInstalled"
}
-/**
- * Cached availability result with timestamp
- */
-let cachedResult: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled"; timestamp: number } | null =
- null
-const CACHE_DURATION_MS = 30000 // 30 seconds
-
/**
* Check if speech-to-text prerequisites are available
*
@@ -26,43 +19,29 @@ const CACHE_DURATION_MS = 30000 // 30 seconds
* 2. FFmpeg is installed and available
*
* Note: The experiment flag is checked on the frontend, not here.
- * Results are cached for 30 seconds to prevent redundant FFmpeg checks.
+ * This function always performs a fresh check without caching.
*
* @param providerSettingsManager - Provider settings manager for API configuration
- * @param forceRecheck - Force a fresh check, ignoring cache (default: false)
* @returns Promise - Result with availability status and failure reason if unavailable
*/
export async function checkSpeechToTextAvailable(
providerSettingsManager: ProviderSettingsManager,
- forceRecheck = false,
): Promise {
- // Return cached result if valid and not forcing recheck
- if (cachedResult !== null && !forceRecheck) {
- const age = Date.now() - cachedResult.timestamp
- if (age < CACHE_DURATION_MS) {
- return { available: cachedResult.available, reason: cachedResult.reason }
- }
- }
-
try {
// Check 1: OpenAI API key
const apiKey = await getOpenAiApiKey(providerSettingsManager)
if (!apiKey) {
- cachedResult = { available: false, reason: "openaiKeyMissing", timestamp: Date.now() }
return { available: false, reason: "openaiKeyMissing" }
}
// Check 2: FFmpeg installed
const ffmpegResult = FFmpegCaptureService.findFFmpeg()
if (!ffmpegResult.available) {
- cachedResult = { available: false, reason: "ffmpegNotInstalled", timestamp: Date.now() }
return { available: false, reason: "ffmpegNotInstalled" }
}
- cachedResult = { available: true, timestamp: Date.now() }
return { available: true }
} catch (error) {
- cachedResult = { available: false, timestamp: Date.now() }
return { available: false }
}
}
diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts
index 99deac1e3cb..267f4e0b1bf 100644
--- a/src/core/webview/webviewMessageHandler.ts
+++ b/src/core/webview/webviewMessageHandler.ts
@@ -3657,6 +3657,7 @@ export const webviewMessageHandler = async (
}
break
}
+ // kilocode_change end: Type-safe global state handler
// kilocode_change start: STT (Speech-to-Text) handlers
case "stt:start":
case "stt:stop":
@@ -3665,7 +3666,15 @@ export const webviewMessageHandler = async (
await handleSTTCommand(provider, message as any)
break
}
- // kilocode_change end: Type-safe global state handler
+ case "stt:checkAvailability": {
+ const { checkSpeechToTextAvailable } = await import("./speechToTextCheck")
+ provider.postMessageToWebview({
+ type: "stt:statusResponse",
+ speechToTextStatus: await checkSpeechToTextAvailable(provider.providerSettingsManager),
+ })
+ break
+ }
+ // kilocode_change end: STT (Speech-to-Text) handlers
case "insertTextToChatArea":
provider.postMessageToWebview({ type: "insertTextToChatArea", text: message.text })
break
diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts
index 81248c55042..1139b4864d4 100644
--- a/src/shared/ExtensionMessage.ts
+++ b/src/shared/ExtensionMessage.ts
@@ -138,6 +138,7 @@ export interface ExtensionMessage {
| "stt:transcript" // kilocode_change: STT transcript update
| "stt:volume" // kilocode_change: STT volume level
| "stt:stopped" // kilocode_change: STT session stopped
+ | "stt:statusResponse" // kilocode_change: Response to stt:checkAvailability request
| "setHistoryPreviewCollapsed"
| "commandExecutionStatus"
| "mcpExecutionStatus"
@@ -275,6 +276,7 @@ export interface ExtensionMessage {
isFinal?: boolean // kilocode_change: STT transcript is final
level?: number // kilocode_change: STT volume level (0-1)
reason?: "completed" | "cancelled" | "error" // kilocode_change: STT stop reason
+ speechToTextStatus?: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } // kilocode_change: Speech-to-text availability status response
requestId?: string
promptText?: string
results?: { path: string; type: "file" | "folder"; label?: string }[]
diff --git a/src/shared/WebviewMessage.ts b/src/shared/WebviewMessage.ts
index 5bfedc5cfd2..656f1a0fb2d 100644
--- a/src/shared/WebviewMessage.ts
+++ b/src/shared/WebviewMessage.ts
@@ -145,6 +145,7 @@ export interface WebviewMessage {
| "stt:start" // kilocode_change: Start STT recording
| "stt:stop" // kilocode_change: Stop STT recording
| "stt:cancel" // kilocode_change: Cancel STT recording
+ | "stt:checkAvailability" // kilocode_change: Check STT availability on demand
| "includeTaskHistoryInEnhance" // kilocode_change
| "snoozeAutocomplete" // kilocode_change
| "autoApprovalEnabled"
diff --git a/webview-ui/src/components/chat/ChatTextArea.tsx b/webview-ui/src/components/chat/ChatTextArea.tsx
index 801a1dc0463..472126f28c9 100644
--- a/webview-ui/src/components/chat/ChatTextArea.tsx
+++ b/webview-ui/src/components/chat/ChatTextArea.tsx
@@ -36,6 +36,7 @@ import { IndexingStatusBadge } from "./IndexingStatusBadge"
import { MicrophoneButton } from "./MicrophoneButton" // kilocode_change: STT microphone button
import { VolumeVisualizer } from "./VolumeVisualizer" // kilocode_change: STT volume level visual
import { VoiceRecordingCursor } from "./VoiceRecordingCursor" // kilocode_change: STT recording cursor
+import { STTSetupPopover } from "./STTSetupPopover" // kilocode_change: STT setup help popover
import { cn } from "@/lib/utils"
import { usePromptHistory } from "./hooks/usePromptHistory"
import { useSTT } from "@/hooks/useSTT" // kilocode_change: STT hook
@@ -162,9 +163,17 @@ export const ChatTextArea = forwardRef(
ghostServiceSettings, // kilocode_change
language, // User's VSCode display language
experiments, // kilocode_change: For speechToText experiment flag
- speechToTextStatus, // kilocode_change: Speech-to-text availability status with failure reason
} = useExtensionState()
+ // kilocode_change: Local state for speech-to-text availability (fetched on-demand)
+ const [speechToTextStatus, setSpeechToTextStatus] = useState<
+ | {
+ available: boolean
+ reason?: "openaiKeyMissing" | "ffmpegNotInstalled"
+ }
+ | undefined
+ >(undefined)
+
// kilocode_change start - autocomplete profile type system
// Filter out autocomplete profiles - only show chat profiles in the chat interface
const listApiConfigMeta = useMemo(() => {
@@ -458,13 +467,31 @@ export const ChatTextArea = forwardRef(
setImageWarning(null)
}, [setImageWarning])
+ // kilocode_change start: Popover state for STT setup help
+ const [sttSetupPopoverOpen, setSttSetupPopoverOpen] = useState(false)
const handleMicrophoneClick = useCallback(() => {
+ // If STT is unavailable, open setup popover instead of starting recording
+ if (!speechToTextStatus?.available) {
+ setSttSetupPopoverOpen(true)
+ return
+ }
+
if (isRecording) {
stopSTT()
} else {
startSTT(language || "en") // Pass user's language from extension state
}
- }, [isRecording, startSTT, stopSTT, language])
+ }, [isRecording, startSTT, stopSTT, language, speechToTextStatus?.available])
+ // kilocode_change end: Popover state for STT setup help
+ // kilocode_change: FFmpeg help - send message to chat
+ const handleFfmpegHelpClick = useCallback(() => {
+ const helpMessage = t("kilocode:speechToText.setupPopover.ffmpegMessage")
+ setInputValue(helpMessage)
+
+ setTimeout(() => {
+ onSend()
+ }) // Trigger send after a brief delay to ensure input is set
+ }, [t, setInputValue, onSend])
// kilocode_change start: Auto-clear images when model changes to non-image-supporting
const prevShouldDisableImages = useRef(shouldDisableImages)
@@ -1326,6 +1353,22 @@ export const ChatTextArea = forwardRef(
}
})
+ // kilocode_change start: STT status message handler
+
+ // kilocode_change: Request STT availability check on mount and whenever the speechToText experiment flag changes (nothing is sent while the flag is off)
+ useEffect(() => {
+ if (experiments?.speechToText) {
+ vscode.postMessage({ type: "stt:checkAvailability" })
+ }
+ }, [experiments?.speechToText])
+ useEvent("message", (event: MessageEvent) => {
+ const message: ExtensionMessage = event.data
+ if (message.type === "stt:statusResponse" && message.speechToTextStatus) {
+ setSpeechToTextStatus(message.speechToTextStatus)
+ }
+ })
+ // kilocode_change end: STT status message handler
+
const placeholderBottomText = `\n(${t("chat:addContext")}${shouldDisableImages ? `, ${t("chat:dragFiles")}` : `, ${t("chat:dragFilesImages")}`})`
// Common mode selector handler
@@ -1709,20 +1752,17 @@ export const ChatTextArea = forwardRef(
{/* kilocode_change start: Show microphone button only if experiment enabled */}
{experiments?.speechToText && (
-
+
+
+
)}
{/* kilocode_change end */}
diff --git a/webview-ui/src/components/chat/MicrophoneButton.tsx b/webview-ui/src/components/chat/MicrophoneButton.tsx
index 5365e762cba..118a8edf553 100644
--- a/webview-ui/src/components/chat/MicrophoneButton.tsx
+++ b/webview-ui/src/components/chat/MicrophoneButton.tsx
@@ -9,8 +9,7 @@ interface MicrophoneButtonProps {
isRecording: boolean
onClick: () => void
containerWidth?: number
- disabled?: boolean
- tooltipContent?: string
+ disabled?: boolean // Visual disabled state only - button is always clickable
}
export const MicrophoneButton: React.FC = ({
@@ -18,7 +17,6 @@ export const MicrophoneButton: React.FC = ({
onClick,
containerWidth,
disabled = false,
- tooltipContent,
}) => {
const { t } = useTranslation()
@@ -27,12 +25,11 @@ export const MicrophoneButton: React.FC = ({
: t("kilocode:speechToText.startRecording")
return (
-
+