From 95efba2e046d36bcaff37595668ce4fbb307217b Mon Sep 17 00:00:00 2001 From: Chris Hasson Date: Thu, 18 Dec 2025 15:50:29 -0800 Subject: [PATCH 1/6] Remove cache from speech-to-text availability check - Remove cachedResult variable and CACHE_DURATION_MS constant - Remove cache checking logic and forceRecheck parameter - Always perform fresh check without caching - Update function documentation to reflect no caching behavior Remove speechToTextStatus from getStateToPostToWebview - Remove expensive checkSpeechToTextAvailable call from state generation - Remove speechToTextStatus from returned state object - Remove unused import - Status will now be fetched on-demand instead of on every state update Add speechToTextStatus field to ExtensionMessage interface - Add field for speechToTextStatusResponse message type - Place near other STT-related fields for consistency Add checkSpeechToTextAvailable message handler - Add handler case for checkSpeechToTextAvailable message type - Dynamically import checkSpeechToTextAvailable function - Send speechToTextStatusResponse with status to webview - Place handler near other STT-related handlers Update ChatTextArea to use local state for speechToTextStatus - Remove speechToTextStatus from useExtensionState() destructuring - Add local useState for speechToTextStatus - Add useEffect to request STT check on mount (only if experiment enabled) - Add message handler for speechToTextStatusResponse to update local state Add onMouseEnter prop to MicrophoneButton component - Add onMouseEnter optional prop to MicrophoneButtonProps interface - Pass onMouseEnter handler to button element - Enables hover-triggered STT availability checks Implement hover handler for microphone button - Add handleMicrophoneHover callback that requests STT availability check - Connect handler to MicrophoneButton onMouseEnter prop - Enables real-time status updates when user hovers over microphone icon Rename checkSpeechToTextAvailable to stt:checkAvailability - Rename message type to follow stt: prefix convention - Update handler case in webviewMessageHandler - Update message calls in ChatTextArea (useEffect and hover handler) - Consistent with other STT events (stt:start, stt:stop, stt:cancel) Rename speechToTextStatusResponse to stt:statusResponse - Rename response message type to follow stt: prefix convention - Update handler in webviewMessageHandler - Update message listener in ChatTextArea - Consistent with other STT message types Move stt:statusResponse handler to separate useEvent block - Separate STT status handler from TTS handlers - Add comment explaining it's separate from recording events in useSTT hook - Remove debug console.log from useEffect - Better organization: TTS handlers, then STT status handler Create STTSetupPopover component and update translations - Create new STTSetupPopover component for interactive STT setup help - Update English translations: simplify FFmpeg message, add popover strings - Update Arabic translations: add popover strings - Component shows error message and help actions based on availability reason Update FFmpeg help to use Trans component with clickable link - Change FFmpeg help from Button to Trans component with VSCodeLink - Only 'Click here' text is clickable, rest is plain text - Update translation strings to include placeholder - Matches pattern used in other components like TelemetryBanner Update MicrophoneButton to remove tooltipContent and onMouseEnter props - Remove tooltipContent prop (no longer needed for disabled state) - Remove onMouseEnter prop (hover check removed) - Button is always clickable (removed disabled attribute) - Keep visual disabled styling via className when disabled prop is true Integrate STTSetupPopover into ChatTextArea - Add popover state management - Update handleMicrophoneClick to open popover when STT unavailable - Remove handleMicrophoneHover (no longer needed) - Wrap MicrophoneButton with STTSetupPopover - Implement handleFfmpegHelpClick to send help message to chat - Remove tooltipContent and onMouseEnter props from MicrophoneButton usage Improve both unavailable case in STTSetupPopover - Show both FFmpeg and OpenAI help options when both are missing - Display help actions in a flex column layout - Provides complete setup guidance for users missing both requirements Add optimistic UI updates for microphone button and fix text clearing issue - Add optimistic state management in useSTT hook for immediate UI feedback - Update microphone button icon immediately on click without waiting for backend - Fix text clearing issue by keeping live transcript visible after stop until onComplete - Sync optimistic state with backend state when events arrive - Remove debug logging while keeping optimistic functionality Enhance STTSetupPopover with detailed setup guidance and translations - Introduce STTSetupPopoverContent for improved structure and clarity - Update translations for multiple languages to include detailed setup instructions - Modify popover behavior to show relevant help based on missing requirements (FFmpeg and OpenAI) - Remove redundant code and streamline the component for better maintainability --- .../stories/STTSetupPopover.stories.tsx | 42 ++++++ src/core/webview/ClineProvider.ts | 10 -- src/core/webview/speechToTextCheck.ts | 23 +--- src/core/webview/webviewMessageHandler.ts | 11 +- src/shared/ExtensionMessage.ts | 2 + src/shared/WebviewMessage.ts | 1 + .../src/components/chat/ChatTextArea.tsx | 72 +++++++--- .../src/components/chat/MicrophoneButton.tsx | 7 +- .../src/components/chat/STTSetupPopover.tsx | 127 ++++++++++++++++++ webview-ui/src/hooks/useSTT.ts | 19 ++- webview-ui/src/i18n/locales/ar/kilocode.json | 16 ++- webview-ui/src/i18n/locales/ca/kilocode.json | 10 +- webview-ui/src/i18n/locales/cs/kilocode.json | 10 +- webview-ui/src/i18n/locales/de/kilocode.json | 10 +- webview-ui/src/i18n/locales/en/kilocode.json | 10 +- webview-ui/src/i18n/locales/es/kilocode.json | 10 +- webview-ui/src/i18n/locales/fr/kilocode.json | 10 +- webview-ui/src/i18n/locales/hi/kilocode.json | 14 +- webview-ui/src/i18n/locales/id/kilocode.json | 12 +- webview-ui/src/i18n/locales/it/kilocode.json | 12 +- webview-ui/src/i18n/locales/ja/kilocode.json | 10 +- webview-ui/src/i18n/locales/ko/kilocode.json | 10 +- webview-ui/src/i18n/locales/nl/kilocode.json | 12 +- webview-ui/src/i18n/locales/pl/kilocode.json | 10 +- .../src/i18n/locales/pt-BR/kilocode.json | 10 +- webview-ui/src/i18n/locales/ru/kilocode.json | 10 +- webview-ui/src/i18n/locales/th/kilocode.json | 14 +- webview-ui/src/i18n/locales/tr/kilocode.json | 10 +- webview-ui/src/i18n/locales/uk/kilocode.json | 10 +- webview-ui/src/i18n/locales/vi/kilocode.json | 12 +- .../src/i18n/locales/zh-CN/kilocode.json | 10 +- .../src/i18n/locales/zh-TW/kilocode.json | 10 +- 32 files changed, 422 insertions(+), 134 deletions(-) create mode 100644 apps/storybook/stories/STTSetupPopover.stories.tsx create mode 100644 webview-ui/src/components/chat/STTSetupPopover.tsx diff --git a/apps/storybook/stories/STTSetupPopover.stories.tsx b/apps/storybook/stories/STTSetupPopover.stories.tsx new file mode 100644 index 00000000000..c44e7d22c04 --- /dev/null +++ b/apps/storybook/stories/STTSetupPopover.stories.tsx @@ -0,0 +1,42 @@ +import type { Meta, StoryObj } from "@storybook/react-vite" +import { STTSetupPopoverContent } from "@/components/chat/STTSetupPopover" + +const meta = { + title: "Components/STTSetupPopover", + component: STTSetupPopoverContent, + parameters: { + layout: "centered", + }, + tags: ["autodocs"], + render: (args) => ( +
+ +
+ ), + args: { + onFfmpegHelpClick: () => { + console.log("FFmpeg help clicked") + }, + }, +} satisfies Meta + +export default meta +type Story = StoryObj + +export const FFmpegNotInstalled: Story = { + name: "FFmpeg not installed", + args: { + reason: "ffmpegNotInstalled", + onFfmpegHelpClick: () => { + console.log("FFmpeg help clicked") + }, + }, +} + +export const OpenAIKeyMissing: Story = { + args: { + reason: "openaiKeyMissing", + }, +} + +export const BothMissing: Story = {} diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index 3640411b129..655a9f7bd79 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -96,7 +96,6 @@ import { Task } from "../task/Task" import { getSystemPromptFilePath } from "../prompts/sections/custom-system-prompt" import { webviewMessageHandler } from "./webviewMessageHandler" -import { checkSpeechToTextAvailable } from "./speechToTextCheck" // kilocode_change import type { ClineMessage, TodoItem } from "@roo-code/types" import { readApiMessages, saveApiMessages, saveTaskMessages } from "../task-persistence" import { readTaskMessages } from "../task-persistence/taskMessages" @@ -2215,14 +2214,6 @@ ${prompt} : undefined // kilocode_change end - // kilocode_change start - checkSpeechToTextAvailable (only when experiment enabled) - let speechToTextStatus: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } | undefined = - undefined - if (experiments?.speechToText) { - speechToTextStatus = await checkSpeechToTextAvailable(this.providerSettingsManager) - } - // kilocode_change end - checkSpeechToTextAvailable - let cloudOrganizations: CloudOrganizationMembership[] = [] try { @@ -2447,7 +2438,6 @@ ${prompt} featureRoomoteControlEnabled, virtualQuotaActiveModel, // kilocode_change: Include virtual quota active model in state debug: vscode.workspace.getConfiguration(Package.name).get("debug", false), - speechToTextStatus, // kilocode_change: Speech-to-text availability status with failure reason } } diff --git a/src/core/webview/speechToTextCheck.ts b/src/core/webview/speechToTextCheck.ts index 9279df24691..f5ca5d7ce84 100644 --- a/src/core/webview/speechToTextCheck.ts +++ b/src/core/webview/speechToTextCheck.ts @@ -11,13 +11,6 @@ export type SpeechToTextAvailabilityResult = { reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } -/** - * Cached availability result with timestamp - */ -let cachedResult: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled"; timestamp: number } | null = - null -const CACHE_DURATION_MS = 30000 // 30 seconds - /** * Check if speech-to-text prerequisites are available * @@ -26,43 +19,29 @@ const CACHE_DURATION_MS = 30000 // 30 seconds * 2. FFmpeg is installed and available * * Note: The experiment flag is checked on the frontend, not here. - * Results are cached for 30 seconds to prevent redundant FFmpeg checks. + * This function always performs a fresh check without caching. * * @param providerSettingsManager - Provider settings manager for API configuration - * @param forceRecheck - Force a fresh check, ignoring cache (default: false) * @returns Promise - Result with availability status and failure reason if unavailable */ export async function checkSpeechToTextAvailable( providerSettingsManager: ProviderSettingsManager, - forceRecheck = false, ): Promise { - // Return cached result if valid and not forcing recheck - if (cachedResult !== null && !forceRecheck) { - const age = Date.now() - cachedResult.timestamp - if (age < CACHE_DURATION_MS) { - return { available: cachedResult.available, reason: cachedResult.reason } - } - } - try { // Check 1: OpenAI API key const apiKey = await getOpenAiApiKey(providerSettingsManager) if (!apiKey) { - cachedResult = { available: false, reason: "openaiKeyMissing", timestamp: Date.now() } return { available: false, reason: "openaiKeyMissing" } } // Check 2: FFmpeg installed const ffmpegResult = FFmpegCaptureService.findFFmpeg() if (!ffmpegResult.available) { - cachedResult = { available: false, reason: "ffmpegNotInstalled", timestamp: Date.now() } return { available: false, reason: "ffmpegNotInstalled" } } - cachedResult = { available: true, timestamp: Date.now() } return { available: true } } catch (error) { - cachedResult = { available: false, timestamp: Date.now() } return { available: false } } } diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts index 99deac1e3cb..267f4e0b1bf 100644 --- a/src/core/webview/webviewMessageHandler.ts +++ b/src/core/webview/webviewMessageHandler.ts @@ -3657,6 +3657,7 @@ export const webviewMessageHandler = async ( } break } + // kilocode_change end: Type-safe global state handler // kilocode_change start: STT (Speech-to-Text) handlers case "stt:start": case "stt:stop": @@ -3665,7 +3666,15 @@ export const webviewMessageHandler = async ( await handleSTTCommand(provider, message as any) break } - // kilocode_change end: Type-safe global state handler + case "stt:checkAvailability": { + const { checkSpeechToTextAvailable } = await import("./speechToTextCheck") + provider.postMessageToWebview({ + type: "stt:statusResponse", + speechToTextStatus: await checkSpeechToTextAvailable(provider.providerSettingsManager), + }) + break + } + // kilocode_change end: STT (Speech-to-Text) handlers case "insertTextToChatArea": provider.postMessageToWebview({ type: "insertTextToChatArea", text: message.text }) break diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts index 81248c55042..1139b4864d4 100644 --- a/src/shared/ExtensionMessage.ts +++ b/src/shared/ExtensionMessage.ts @@ -138,6 +138,7 @@ export interface ExtensionMessage { | "stt:transcript" // kilocode_change: STT transcript update | "stt:volume" // kilocode_change: STT volume level | "stt:stopped" // kilocode_change: STT session stopped + | "stt:statusResponse" // kilocode_change: Response to stt:checkAvailability request | "setHistoryPreviewCollapsed" | "commandExecutionStatus" | "mcpExecutionStatus" @@ -275,6 +276,7 @@ export interface ExtensionMessage { isFinal?: boolean // kilocode_change: STT transcript is final level?: number // kilocode_change: STT volume level (0-1) reason?: "completed" | "cancelled" | "error" // kilocode_change: STT stop reason + speechToTextStatus?: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } // kilocode_change: Speech-to-text availability status response requestId?: string promptText?: string results?: { path: string; type: "file" | "folder"; label?: string }[] diff --git a/src/shared/WebviewMessage.ts b/src/shared/WebviewMessage.ts index 5bfedc5cfd2..656f1a0fb2d 100644 --- a/src/shared/WebviewMessage.ts +++ b/src/shared/WebviewMessage.ts @@ -145,6 +145,7 @@ export interface WebviewMessage { | "stt:start" // kilocode_change: Start STT recording | "stt:stop" // kilocode_change: Stop STT recording | "stt:cancel" // kilocode_change: Cancel STT recording + | "stt:checkAvailability" // kilocode_change: Check STT availability on demand | "includeTaskHistoryInEnhance" // kilocode_change | "snoozeAutocomplete" // kilocode_change | "autoApprovalEnabled" diff --git a/webview-ui/src/components/chat/ChatTextArea.tsx b/webview-ui/src/components/chat/ChatTextArea.tsx index 801a1dc0463..472126f28c9 100644 --- a/webview-ui/src/components/chat/ChatTextArea.tsx +++ b/webview-ui/src/components/chat/ChatTextArea.tsx @@ -36,6 +36,7 @@ import { IndexingStatusBadge } from "./IndexingStatusBadge" import { MicrophoneButton } from "./MicrophoneButton" // kilocode_change: STT microphone button import { VolumeVisualizer } from "./VolumeVisualizer" // kilocode_change: STT volume level visual import { VoiceRecordingCursor } from "./VoiceRecordingCursor" // kilocode_change: STT recording cursor +import { STTSetupPopover } from "./STTSetupPopover" // kilocode_change: STT setup help popover import { cn } from "@/lib/utils" import { usePromptHistory } from "./hooks/usePromptHistory" import { useSTT } from "@/hooks/useSTT" // kilocode_change: STT hook @@ -162,9 +163,17 @@ export const ChatTextArea = forwardRef( ghostServiceSettings, // kilocode_change language, // User's VSCode display language experiments, // kilocode_change: For speechToText experiment flag - speechToTextStatus, // kilocode_change: Speech-to-text availability status with failure reason } = useExtensionState() + // kilocode_change: Local state for speech-to-text availability (fetched on-demand) + const [speechToTextStatus, setSpeechToTextStatus] = useState< + | { + available: boolean + reason?: "openaiKeyMissing" | "ffmpegNotInstalled" + } + | undefined + >(undefined) + // kilocode_change start - autocomplete profile type system // Filter out autocomplete profiles - only show chat profiles in the chat interface const listApiConfigMeta = useMemo(() => { @@ -458,13 +467,31 @@ export const ChatTextArea = forwardRef( setImageWarning(null) }, [setImageWarning]) + // kilocode_change start: Popover state for STT setup help + const [sttSetupPopoverOpen, setSttSetupPopoverOpen] = useState(false) const handleMicrophoneClick = useCallback(() => { + // If STT is unavailable, open setup popover instead of starting recording + if (!speechToTextStatus?.available) { + setSttSetupPopoverOpen(true) + return + } + if (isRecording) { stopSTT() } else { startSTT(language || "en") // Pass user's language from extension state } - }, [isRecording, startSTT, stopSTT, language]) + }, [isRecording, startSTT, stopSTT, language, speechToTextStatus?.available]) + // kilocode_change end: Popover state for STT setup help + // kilocode_change: FFmpeg help - send message to chat + const handleFfmpegHelpClick = useCallback(() => { + const helpMessage = t("kilocode:speechToText.setupPopover.ffmpegMessage") + setInputValue(helpMessage) + + setTimeout(() => { + onSend() + }) // Trigger send after a brief delay to ensure input is set + }, [t, setInputValue, onSend]) // kilocode_change start: Auto-clear images when model changes to non-image-supporting const prevShouldDisableImages = useRef(shouldDisableImages) @@ -1326,6 +1353,22 @@ export const ChatTextArea = forwardRef( } }) + // kilocode_change start: STT status message handler + + // kilocode_change: Request STT availability check on mount (only once) + useEffect(() => { + if (experiments?.speechToText) { + vscode.postMessage({ type: "stt:checkAvailability" }) + } + }, [experiments?.speechToText]) + useEvent("message", (event: MessageEvent) => { + const message: ExtensionMessage = event.data + if (message.type === "stt:statusResponse" && message.speechToTextStatus) { + setSpeechToTextStatus(message.speechToTextStatus) + } + }) + // kilocode_change end: STT status message handler + const placeholderBottomText = `\n(${t("chat:addContext")}${shouldDisableImages ? `, ${t("chat:dragFiles")}` : `, ${t("chat:dragFilesImages")}`})` // Common mode selector handler @@ -1709,20 +1752,17 @@ export const ChatTextArea = forwardRef( {/* kilocode_change start: Show microphone button only if experiment enabled */} {experiments?.speechToText && ( - + + + )} {/* kilocode_change end */} diff --git a/webview-ui/src/components/chat/MicrophoneButton.tsx b/webview-ui/src/components/chat/MicrophoneButton.tsx index 5365e762cba..118a8edf553 100644 --- a/webview-ui/src/components/chat/MicrophoneButton.tsx +++ b/webview-ui/src/components/chat/MicrophoneButton.tsx @@ -9,8 +9,7 @@ interface MicrophoneButtonProps { isRecording: boolean onClick: () => void containerWidth?: number - disabled?: boolean - tooltipContent?: string + disabled?: boolean // Visual disabled state only - button is always clickable } export const MicrophoneButton: React.FC = ({ @@ -18,7 +17,6 @@ export const MicrophoneButton: React.FC = ({ onClick, containerWidth, disabled = false, - tooltipContent, }) => { const { t } = useTranslation() @@ -27,12 +25,11 @@ export const MicrophoneButton: React.FC = ({ : t("kilocode:speechToText.startRecording") return ( - +