42 changes: 39 additions & 3 deletions jac-byllm/byllm/llm.py
@@ -20,7 +20,7 @@
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

from .llm_connector import LLMConnector
from .types import CompletionResult
from .types import CompletionResult, Message, MessageRole, ToolCall

SYSTEM_PERSONA = """\
This is a task you must complete by returning only the output.
@@ -77,15 +77,20 @@ def call_params(self) -> dict[str, object]:

def invoke(self, mtir: MTIR) -> object:
"""Invoke the LLM with the given caller and arguments."""
if mtir.stream:
# If streaming without tools, stream immediately
if mtir.stream and len(mtir.tools) == 0:
return self._completion_streaming(mtir)

# Invoke the LLM and handle tool calls.
# Invoke the LLM and handle tool calls (ReAct loop).
while True:
resp = self._completion_no_streaming(mtir)
if resp.tool_calls:
for tool_call in resp.tool_calls:
if tool_call.is_finish_call():
# If streaming is enabled, make a new streaming call
# to generate the final answer based on all context
if mtir.stream:
return self._stream_final_answer(mtir)
return tool_call.get_output()
else:
mtir.add_message(tool_call())
@@ -101,3 +106,34 @@ def _completion_no_streaming(self, mtir: MTIR) -> CompletionResult:
def _completion_streaming(self, mtir: MTIR) -> Generator[str, None, None]:
"""Perform a streaming completion request with the LLM."""
return self.llm_connector.dispatch_streaming(mtir)

def _stream_final_answer(self, mtir: MTIR) -> Generator[str, None, None]:
"""Stream the final answer after ReAct tool calls complete.

This creates a new streaming LLM call with all the context from tool calls
to generate the final answer in real-time streaming mode.

The difference from _stream_finish_output:
- This makes a real streaming API call to the LLM.
- _stream_finish_output only splits an already-complete string into chunks.
"""
# Add a message instructing the LLM to provide the final answer
# based on all the tool call results gathered so far
final_instruction = Message(
role=MessageRole.USER,
content="Based on the tool calls and their results above, provide your final answer. "
"Be comprehensive and synthesize all the information gathered.",
)
mtir.add_message(final_instruction)

# Remove tools and make a streaming call to get the real-time answer
# We temporarily clear tools so the LLM just responds with text
original_tools = mtir.tools
mtir.tools = []

try:
# Make the streaming call; tokens arrive from the LLM in real time.
yield from self.llm_connector.dispatch_streaming(mtir)
finally:
# Restore tools (though we're done at this point)
mtir.tools = original_tools
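
For readers skimming the diff, here is a minimal, self-contained sketch of the control flow the new invoke()/_stream_final_answer() pair implements: run the ReAct loop without streaming, and once the finish call arrives, append a final instruction, temporarily clear the tools, and stream the synthesized answer. Every name below (FakeState, FakeToolCall, fake_complete, fake_stream) is a stand-in for illustration, not a byllm API.

from dataclasses import dataclass, field
from typing import Callable, Generator

@dataclass
class FakeToolCall:
    """Stand-in for a parsed tool call; not the byllm ToolCall type."""
    run: Callable[[], str]
    finish: bool = False  # True once the model signals it is done with tools

@dataclass
class FakeState:
    """Stand-in for MTIR: accumulated messages plus the registered tools."""
    messages: list = field(default_factory=list)
    tools: list = field(default_factory=list)
    stream: bool = True

def fake_complete(state: FakeState) -> list[FakeToolCall]:
    """Pretend non-streaming completion: one tool result, then a finish call."""
    if not state.messages:
        return [FakeToolCall(run=lambda: "tool result")]
    return [FakeToolCall(run=lambda: "done", finish=True)]

def fake_stream(state: FakeState) -> Generator[str, None, None]:
    """Pretend streaming completion: yields the answer piece by piece."""
    yield from ("final ", "answer\n")

def stream_final_answer(state: FakeState) -> Generator[str, None, None]:
    # Ask for a synthesized answer, hide the tools, stream, then restore them.
    state.messages.append("Provide your final answer from the tool results above.")
    saved, state.tools = state.tools, []
    try:
        yield from fake_stream(state)
    finally:
        state.tools = saved

def invoke(state: FakeState) -> object:
    if state.stream and not state.tools:       # streaming without tools: stream now
        return fake_stream(state)
    while True:                                # ReAct loop
        for call in fake_complete(state):
            if call.finish:
                return stream_final_answer(state) if state.stream else call.run()
            state.messages.append(call.run())  # feed tool output back into context

for chunk in invoke(FakeState(tools=[fake_complete])):
    print(chunk, end="", flush=True)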
17 changes: 2 additions & 15 deletions jac-byllm/byllm/mtir.py
@@ -105,22 +105,9 @@ def factory(
return_type = get_type_hints(caller).get("return")
is_streaming = bool(call_params.get("stream", False))

if is_streaming:
if return_type is not str:
raise RuntimeError(
"Streaming responses are only supported for str return types."
)
if tools:
raise RuntimeError(
"Streaming responses are not supported with tool calls yet."
)

# TODO: Support mockllm for mocktesting.
# Invoke streaming request, this will result in a generator that the caller
# should either do .next() or .__iter__() by calling `for tok in resp: ...`
if is_streaming and tools:
if is_streaming and return_type is not str:
raise RuntimeError(
"Streaming responses are not supported with tool calls yet."
"Streaming responses are only supported for str return types."
)

if len(tools) > 0:
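
For reference, a tiny standalone sketch of the check that remains after this hunk: streaming still requires a str return annotation, but a non-empty tool list no longer raises. validate_streaming is a hypothetical helper written only to illustrate the rule, not the actual factory signature.

def validate_streaming(return_type: type, tools: list, stream: bool) -> None:
    # The only constraint left: streaming responses must be annotated as str.
    # Tools are now permitted alongside stream=True.
    if stream and return_type is not str:
        raise RuntimeError("Streaming responses are only supported for str return types.")

validate_streaming(str, tools=[print], stream=True)   # ok: tools plus streaming is now allowed
# validate_streaming(int, tools=[], stream=True)      # would raise RuntimeError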
68 changes: 68 additions & 0 deletions jac-byllm/tests/fixtures/demo_real_streaming.jac
@@ -0,0 +1,68 @@
"""REAL streaming demo with ReAct method using actual LLM.

This demonstrates ACTUAL streaming from the LLM, not mocked.
You need to set your API key as an environment variable:
export OPENAI_API_KEY="your-key-here"

Or use any other supported model.
"""

import from byllm.lib { Model }
import from datetime { datetime }
import from os { environ }

# Real LLM - change this to your preferred model
glob llm = Model(
model_name="gemini/gemini-2.0-flash", verbose=True,
);

"""Get the current date and time."""
def get_current_datetime() -> str {
return datetime.now().strftime("%d-%m-%Y %H:%M:%S");
}

"""Calculate a mathematical expression."""
def calculate(expression: str) -> str {
try {
result = eval(expression);
return f"The result is {result}";
} except Exception as e {
return f"Error: {str(e)}";
}
}

"""Answer the question using available tools. Be detailed in your response."""
def answer_question(question: str) -> str by llm(
method="ReAct",
tools=[get_current_datetime, calculate],
stream=True,
temperature=0.7
);

with entry {
print("=" * 70);
print("REAL STREAMING DEMO - ReAct with Actual LLM");
print("=" * 70);
print("\nThis will make real API calls and stream the response token-by-token!");
print("\nWatch closely - you should see the text appear gradually...\n");

question = "What is the current date and time, and what is 127 multiplied by 89?";
print(f"Question: {question}\n");
print("Answer (streaming in real-time): ");
print("-" * 70);

# This streams the answer directly from the LLM. You'll see:
# 1. Tool calls execute first (not streamed)
# 2. The final answer then streams token-by-token
for chunk in answer_question(question) {
print(chunk, end='', flush=True);
}

print("\n" + "-" * 70);
print("\n✓ Streaming complete!");
print("\nWhat happened:");
print("1. LLM made tool calls (get_current_datetime, calculate)");
print("2. After gathering results, LLM streamed final answer");
print("3. You saw tokens appear in real-time!");
}
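
One way to sanity-check the real-streaming claim from the consumer side is to time chunk arrival: an already-complete string that is merely split arrives with near-zero gaps, while a live API stream shows visible inter-chunk latency. A rough standalone Python sketch; fake_split_stream stands in for whatever generator you actually consume.

import time
from typing import Generator, Iterable

def time_chunks(chunks: Iterable[str]) -> None:
    """Print each chunk together with the delay since the previous one."""
    last = time.perf_counter()
    for chunk in chunks:
        now = time.perf_counter()
        print(f"[+{(now - last) * 1000:6.1f} ms] {chunk!r}")
        last = now

def fake_split_stream() -> Generator[str, None, None]:
    # Simulates _stream_finish_output: a pre-built string split into pieces.
    yield from "this arrives all at once".split()

time_chunks(fake_split_stream())  # gaps near 0 ms; a real LLM stream shows larger gaps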
62 changes: 62 additions & 0 deletions jac-byllm/tests/fixtures/streaming_with_react.jac
@@ -0,0 +1,62 @@
"""Test streaming with ReAct method using MockLLM."""

import from byllm.lib { Model, MockToolCall }
import from datetime {datetime}

# Mock function that will be used as a tool
def get_current_date() -> str {
return "29-10-2025";
}

def calculate(expression: str) -> str {
return f"Result: {eval(expression)}";
}

# Create a mock LLM that simulates tool calling then final output
glob llm = Model(
model_name="mockllm",
outputs=[
# First, LLM decides to call get_current_date
MockToolCall(tool=get_current_date, args={}),
# Then, LLM decides to call calculate
MockToolCall(tool=calculate, args={"expression": "25 * 4"}),
# Finally, LLM calls finish_tool with the answer
# Note: finish_tool is automatically added by the system
"Based on my calculations, today's date is 29-10-2025 and 25 multiplied by 4 equals 100. This demonstrates the ReAct pattern with streaming support.",
]
);

"""Answer the question using available tools."""
def answer_question(question: str) -> str by llm(
method="ReAct",
tools=[get_current_date, calculate],
stream=True
);

with entry {
print("Testing ReAct with streaming...\n");

question = "What is today's date and what is 25 * 4?";
print(f"Question: {question}\n");
print("Answer (streaming): ");

result = answer_question(question);

# Result should be a generator (for streaming)
assert not isinstance(result, str), "Expected generator for streaming, got string";

# Collect all chunks
full_response = "";
for chunk in result {
print(chunk, end='', flush=True);
full_response += chunk;
}

print("\n");

# Verify the response contains expected content
assert "29-10-2025" in full_response, "Response should mention the date";
assert "100" in full_response, "Response should mention the calculation result";

print("\nTest passed! ✓");
}
11 changes: 11 additions & 0 deletions jac-byllm/tests/test_byllm.py
@@ -84,6 +84,17 @@ def test_streaming_output(self) -> None:
stdout_value = captured_output.getvalue()
self.assertIn('The orca whale, or killer whale, is one of the most intelligent and adaptable marine predators', stdout_value)

def test_streaming_with_react(self) -> None:
"""Test streaming output with ReAct method (tool calling)."""
captured_output = io.StringIO()
sys.stdout = captured_output
jac_import("streaming_with_react", base_path=self.fixture_abs_path("./"))
sys.stdout = sys.__stdout__
stdout_value = captured_output.getvalue()
self.assertIn('29-10-2025', stdout_value)
self.assertIn('100', stdout_value)
self.assertIn('Test passed!', stdout_value)

def test_by_expr(self) -> None:
"""Test by llm['as'].expression instead of llm() call."""
captured_output = io.StringIO()
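
As a side note, the manual sys.stdout swap used in these tests can also be written with contextlib.redirect_stdout, which restores stdout even if the fixture raises. A standalone sketch; run_fixture is only a placeholder for the jac_import call, not part of the suite.

import contextlib
import io

def run_fixture() -> None:
    # Placeholder for jac_import("streaming_with_react", base_path=...)
    print("29-10-2025 ... 100 ... Test passed!")

captured = io.StringIO()
with contextlib.redirect_stdout(captured):
    run_fixture()

assert "Test passed!" in captured.getvalue()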