Skip to content

Commit 61499b0

Browse files
authored
refactor(streaming): simplify response display and fix duplication issues (#104)
- Replace the Rich Live widget with direct console printing to eliminate text duplication when scrolling
- Stream content character-by-character for a typing effect, then render the final markdown
- Fix spinner behavior: stop on the first thinking chunk (if show_thinking=True) or on the first content/tool chunk
- Respect the show_thinking parameter throughout the streaming process
- Add the missing return statement to fix a TypeError when unpacking the response
- Maintain the streaming illusion while avoiding terminal scrollback duplication

This approach trades off real-time markdown rendering during streaming for a cleaner terminal history without duplicated text.
1 parent b6485a3 commit 61499b0

File tree

1 file changed

+79
-102
lines changed

1 file changed

+79
-102
lines changed

mcp_client_for_ollama/utils/streaming.py

Lines changed: 79 additions & 102 deletions
Original file line number · Diff line number · Diff line change
@@ -5,10 +5,6 @@
55
StreamingManager: Handles streaming responses from Ollama.
66
"""
77
from rich.markdown import Markdown
8-
from rich.live import Live
9-
from rich.spinner import Spinner
10-
from rich.table import Table
11-
from rich.text import Text
128
from .metrics import display_metrics, extract_metrics
139

1410
class StreamingManager:
@@ -22,36 +18,6 @@ def __init__(self, console):
2218
"""
2319
self.console = console
2420

25-
def _create_working_display(self):
26-
"""Create a display showing working status with spinner"""
27-
table = Table.grid()
28-
spinner = Spinner("dots", style="cyan")
29-
working_text = Text("working...", style="cyan")
30-
header = Table.grid(padding=(0, 1))
31-
header.add_row(spinner, working_text)
32-
table.add_row(header)
33-
return table
34-
35-
def _create_content_display(self, content, thinking_content="", show_thinking=True, has_tool_calls=False):
36-
"""Create a display for content with optional thinking section"""
37-
if thinking_content and show_thinking:
38-
# Only add separator and Answer label if there's actual content
39-
if content:
40-
if has_tool_calls:
41-
combined_content = thinking_content + "\n\n---\n\n" + content
42-
else:
43-
combined_content = thinking_content + "\n\n---\n\n**Answer:**\n\n" + content
44-
else:
45-
# No content, just show thinking
46-
combined_content = thinking_content
47-
return Markdown(combined_content)
48-
else:
49-
# Don't add "Answer:" label when tools are being called or when content is empty
50-
if has_tool_calls or not content:
51-
return Markdown(content)
52-
else:
53-
return Markdown("**Answer:**\n\n" + content)
54-
5521
async def process_streaming_response(self, stream, print_response=True, thinking_mode=False, show_thinking=True, show_metrics=False):
5622
"""Process a streaming response from Ollama with status spinner and content updates
5723
@@ -70,79 +36,86 @@ async def process_streaming_response(self, stream, print_response=True, thinking
7036
accumulated_text = ""
7137
thinking_content = ""
7238
tool_calls = []
73-
showing_working = True # Track if we're still showing the working display
7439
metrics = None # Store metrics from final chunk
7540

7641
if print_response:
77-
with Live(console=self.console, refresh_per_second=10, vertical_overflow='visible') as live:
78-
# Start with working display
79-
live.update(self._create_working_display())
80-
81-
async for chunk in stream:
82-
# Capture metrics when chunk is done
83-
extracted_metrics = extract_metrics(chunk)
84-
if extracted_metrics:
85-
metrics = extracted_metrics
86-
87-
# Handle thinking content
88-
if (thinking_mode and hasattr(chunk, 'message') and
89-
hasattr(chunk.message, 'thinking') and chunk.message.thinking):
90-
91-
if not thinking_content:
92-
thinking_content = "🤔 **Thinking:**\n\n"
93-
thinking_content += chunk.message.thinking
94-
95-
# Hide working display and show thinking content
96-
if showing_working:
97-
showing_working = False
98-
99-
display = self._create_content_display(
100-
accumulated_text, thinking_content, show_thinking=True, has_tool_calls=False
101-
)
102-
live.update(display)
103-
104-
# Handle regular content
105-
if (hasattr(chunk, 'message') and hasattr(chunk.message, 'content') and
106-
chunk.message.content):
107-
108-
accumulated_text += chunk.message.content
109-
110-
# Hide working display and show content
111-
if showing_working:
112-
showing_working = False
113-
114-
# Update display based on thinking mode
115-
display = self._create_content_display(
116-
accumulated_text, thinking_content, show_thinking, has_tool_calls=False
117-
)
118-
live.update(display)
119-
120-
# Handle tool calls
121-
if (hasattr(chunk, 'message') and hasattr(chunk.message, 'tool_calls') and
122-
chunk.message.tool_calls):
123-
# Hide working display and show final content if any before tool calls
124-
showing_working = False
125-
126-
for tool in chunk.message.tool_calls:
127-
tool_calls.append(tool)
128-
129-
# Show final content display if we have any accumulated text
130-
if accumulated_text or thinking_content:
131-
display = self._create_content_display(
132-
accumulated_text, thinking_content, show_thinking, has_tool_calls=True
133-
)
134-
live.update(display)
135-
else:
136-
# Clear the working display by showing empty content
137-
live.update(Markdown(""))
138-
139-
# Add spacing after streaming completes only if we showed content and no tool calls
140-
if not showing_working and not tool_calls:
42+
# Thinking header flag
43+
thinking_started = False
44+
# Show initial working spinner until first chunk arrives
45+
first_chunk = True
46+
status = self.console.status("[cyan]working...", spinner="dots")
47+
status.start()
48+
49+
async for chunk in stream:
50+
# Capture metrics when chunk is done
51+
extracted_metrics = extract_metrics(chunk)
52+
if extracted_metrics:
53+
metrics = extracted_metrics
54+
55+
# Handle thinking content
56+
if (thinking_mode and hasattr(chunk, 'message') and
57+
hasattr(chunk.message, 'thinking') and chunk.message.thinking):
58+
# Stop spinner on first thinking chunk ONLY if show_thinking is True
59+
if first_chunk and show_thinking:
60+
status.stop()
61+
first_chunk = False
62+
63+
if not thinking_content:
64+
thinking_content = "🤔 **Thinking:**\n\n"
65+
if not thinking_started and show_thinking:
66+
self.console.print(Markdown("🤔 **Thinking:**\n"))
67+
self.console.print(Markdown("---"))
68+
self.console.print()
69+
thinking_started = True
70+
thinking_content += chunk.message.thinking
71+
# Print thinking content as plain text only if show_thinking is True
72+
if show_thinking:
73+
self.console.print(chunk.message.thinking, end="")
74+
75+
# Handle regular content
76+
if (hasattr(chunk, 'message') and hasattr(chunk.message, 'content') and
77+
chunk.message.content):
78+
# Stop spinner on first content chunk (always)
79+
if first_chunk:
80+
status.stop()
81+
first_chunk = False
82+
83+
# Print separator and Answer label when transitioning from thinking to content
84+
if not accumulated_text:
85+
self.console.print()
86+
self.console.print(Markdown("📝 **Answer:**"))
87+
self.console.print(Markdown("---"))
88+
self.console.print()
89+
90+
accumulated_text += chunk.message.content
91+
92+
# Print only new content as plain text (will render full markdown at end)
93+
self.console.print(chunk.message.content, end="")
94+
95+
# Handle tool calls
96+
if (hasattr(chunk, 'message') and hasattr(chunk.message, 'tool_calls') and
97+
chunk.message.tool_calls):
98+
# Stop spinner on first tool call chunk (always) - just in case no content arrives
99+
if first_chunk:
100+
status.stop()
101+
first_chunk = False
102+
103+
for tool in chunk.message.tool_calls:
104+
tool_calls.append(tool)
105+
106+
# Print newline at end
107+
self.console.print()
108+
109+
# Render final markdown content properly
110+
if accumulated_text:
111+
# Render in markdown format and state this
112+
self.console.print()
113+
self.console.print(Markdown("📝 **Answer (Markdown):**"))
114+
self.console.print(Markdown("---"))
115+
self.console.print()
116+
self.console.print(Markdown(accumulated_text))
141117
self.console.print()
142118

143-
# Display metrics if requested and available
144-
if show_metrics and metrics and print_response:
145-
display_metrics(self.console, metrics)
146119
else:
147120
# Silent processing without display
148121
async for chunk in stream:
@@ -164,4 +137,8 @@ async def process_streaming_response(self, stream, print_response=True, thinking
164137
for tool in chunk.message.tool_calls:
165138
tool_calls.append(tool)
166139

140+
# Display metrics if requested
141+
if show_metrics and metrics:
142+
display_metrics(self.console, metrics)
143+
167144
return accumulated_text, tool_calls, metrics

0 commit comments

Comments (0)