42 changes: 39 additions & 3 deletions jac-byllm/byllm/llm.py
@@ -20,7 +20,7 @@
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

from .llm_connector import LLMConnector
from .types import CompletionResult
from .types import CompletionResult, Message, MessageRole, ToolCall

SYSTEM_PERSONA = """\
This is a task you must complete by returning only the output.
@@ -77,15 +77,20 @@ def call_params(self) -> dict[str, object]:

def invoke(self, mtir: MTIR) -> object:
"""Invoke the LLM with the given caller and arguments."""
if mtir.stream:
# If streaming without tools, stream immediately
if mtir.stream and len(mtir.tools) == 0:
return self._completion_streaming(mtir)

# Invoke the LLM and handle tool calls.
# Invoke the LLM and handle tool calls (ReAct loop).
while True:
resp = self._completion_no_streaming(mtir)
if resp.tool_calls:
for tool_call in resp.tool_calls:
if tool_call.is_finish_call():
# If streaming is enabled, make a new streaming call
# to generate the final answer based on all context
if mtir.stream:
return self._stream_final_answer(mtir)
return tool_call.get_output()
else:
mtir.add_message(tool_call())
@@ -101,3 +106,34 @@ def _completion_no_streaming(self, mtir: MTIR) -> CompletionResult:
def _completion_streaming(self, mtir: MTIR) -> Generator[str, None, None]:
"""Perform a streaming completion request with the LLM."""
return self.llm_connector.dispatch_streaming(mtir)

def _stream_final_answer(self, mtir: MTIR) -> Generator[str, None, None]:
"""Stream the final answer after ReAct tool calls complete.

This creates a new streaming LLM call with all the context from tool calls
to generate the final answer in real-time streaming mode.

The difference from _stream_finish_output:
- This makes a real streaming API call to the LLM.
- _stream_finish_output only splits an already-complete string into chunks.
"""
# Add a message instructing the LLM to provide the final answer
# based on all the tool call results gathered so far
final_instruction = Message(
role=MessageRole.USER,
content="Based on the tool calls and their results above, provide your final answer. "
"Be comprehensive and synthesize all the information gathered.",
)
mtir.add_message(final_instruction)

# Remove tools and make a streaming call to get the real-time answer
# We temporarily clear tools so the LLM just responds with text
original_tools = mtir.tools
mtir.tools = []

try:
# Make the streaming call; tokens arrive from the LLM in real time.
yield from self.llm_connector.dispatch_streaming(mtir)
finally:
# Restore tools (though we're done at this point)
mtir.tools = original_tools
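
For readers skimming the diff, here is a minimal, self-contained sketch of the control flow the new invoke()/_stream_final_answer() pair implements: run the ReAct loop without streaming, and once the finish call arrives, append a final instruction, temporarily clear the tools, and stream the synthesized answer. Every name below (FakeState, FakeToolCall, fake_complete, fake_stream) is a stand-in for illustration, not a byllm API.

from dataclasses import dataclass, field
from typing import Callable, Generator

@dataclass
class FakeToolCall:
    """Stand-in for a parsed tool call; not the byllm ToolCall type."""
    run: Callable[[], str]
    finish: bool = False  # True once the model signals it is done with tools

@dataclass
class FakeState:
    """Stand-in for MTIR: accumulated messages plus the registered tools."""
    messages: list = field(default_factory=list)
    tools: list = field(default_factory=list)
    stream: bool = True

def fake_complete(state: FakeState) -> list[FakeToolCall]:
    """Pretend non-streaming completion: one tool result, then a finish call."""
    if not state.messages:
        return [FakeToolCall(run=lambda: "tool result")]
    return [FakeToolCall(run=lambda: "done", finish=True)]

def fake_stream(state: FakeState) -> Generator[str, None, None]:
    """Pretend streaming completion: yields the answer piece by piece."""
    yield from ("final ", "answer\n")

def stream_final_answer(state: FakeState) -> Generator[str, None, None]:
    # Ask for a synthesized answer, hide the tools, stream, then restore them.
    state.messages.append("Provide your final answer from the tool results above.")
    saved, state.tools = state.tools, []
    try:
        yield from fake_stream(state)
    finally:
        state.tools = saved

def invoke(state: FakeState) -> object:
    if state.stream and not state.tools:       # streaming without tools: stream now
        return fake_stream(state)
    while True:                                # ReAct loop
        for call in fake_complete(state):
            if call.finish:
                return stream_final_answer(state) if state.stream else call.run()
            state.messages.append(call.run())  # feed tool output back into context

for chunk in invoke(FakeState(tools=[fake_complete])):
    print(chunk, end="", flush=True)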
17 changes: 2 additions & 15 deletions jac-byllm/byllm/mtir.py
@@ -105,22 +105,9 @@ def factory(
return_type = get_type_hints(caller).get("return")
is_streaming = bool(call_params.get("stream", False))

if is_streaming:
if return_type is not str:
raise RuntimeError(
"Streaming responses are only supported for str return types."
)
if tools:
raise RuntimeError(
"Streaming responses are not supported with tool calls yet."
)

# TODO: Support mockllm for mocktesting.
# Invoke streaming request, this will result in a generator that the caller
# should either do .next() or .__iter__() by calling `for tok in resp: ...`
if is_streaming and tools:
if is_streaming and return_type is not str:
raise RuntimeError(
"Streaming responses are not supported with tool calls yet."
"Streaming responses are only supported for str return types."
)

if len(tools) > 0:
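
For reference, a tiny standalone sketch of the check that remains after this hunk: streaming still requires a str return annotation, but a non-empty tool list no longer raises. validate_streaming is a hypothetical helper written only to illustrate the rule, not the actual factory signature.

def validate_streaming(return_type: type, tools: list, stream: bool) -> None:
    # The only constraint left: streaming responses must be annotated as str.
    # Tools are now permitted alongside stream=True.
    if stream and return_type is not str:
        raise RuntimeError("Streaming responses are only supported for str return types.")

validate_streaming(str, tools=[print], stream=True)   # ok: tools plus streaming is now allowed
# validate_streaming(int, tools=[], stream=True)      # would raise RuntimeError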
68 changes: 68 additions & 0 deletions jac-byllm/tests/fixtures/demo_real_streaming.jac
@@ -0,0 +1,68 @@
"""REAL streaming demo with ReAct method using actual LLM.

This demonstrates ACTUAL streaming from the LLM, not mocked.
You need to set your API key as an environment variable:
export OPENAI_API_KEY="your-key-here"

Or use any other supported model.
"""

import from byllm.lib { Model }
import from datetime { datetime }
import from os { environ }

# Real LLM - change this to your preferred model
glob llm = Model(
model_name="gemini/gemini-2.0-flash", verbose=True,
);

"""Get the current date and time."""
def get_current_datetime() -> str {
return datetime.now().strftime("%d-%m-%Y %H:%M:%S");
}

"""Calculate a mathematical expression."""
def calculate(expression: str) -> str {
try {
result = eval(expression);
return f"The result is {result}";
} except Exception as e {
return f"Error: {str(e)}";
}
}

"""Answer the question using available tools. Be detailed in your response."""
def answer_question(question: str) -> str by llm(
method="ReAct",
tools=[get_current_datetime, calculate],
stream=True,
temperature=0.7
);

with entry {
print("=" * 70);
print("REAL STREAMING DEMO - ReAct with Actual LLM");
print("=" * 70);
print("\nThis will make real API calls and stream the response token-by-token!");
print("\nWatch closely - you should see the text appear gradually...\n");

question = "What is the current date and time, and what is 127 multiplied by 89?";
print(f"Question: {question}\n");
print("Answer (streaming in real-time): ");
print("-" * 70);

# This streams the answer directly from the LLM. You'll see:
# 1. Tool calls execute first (not streamed)
# 2. The final answer then streams token-by-token
for chunk in answer_question(question) {
print(chunk, end='', flush=True);
}

print("\n" + "-" * 70);
print("\n✓ Streaming complete!");
print("\nWhat happened:");
print("1. LLM made tool calls (get_current_datetime, calculate)");
print("2. After gathering results, LLM streamed final answer");
print("3. You saw tokens appear in real-time!");
}
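
One way to sanity-check the real-streaming claim from the consumer side is to time chunk arrival: an already-complete string that is merely split arrives with near-zero gaps, while a live API stream shows visible inter-chunk latency. A rough standalone Python sketch; fake_split_stream stands in for whatever generator you actually consume.

import time
from typing import Generator, Iterable

def time_chunks(chunks: Iterable[str]) -> None:
    """Print each chunk together with the delay since the previous one."""
    last = time.perf_counter()
    for chunk in chunks:
        now = time.perf_counter()
        print(f"[+{(now - last) * 1000:6.1f} ms] {chunk!r}")
        last = now

def fake_split_stream() -> Generator[str, None, None]:
    # Simulates _stream_finish_output: a pre-built string split into pieces.
    yield from "this arrives all at once".split()

time_chunks(fake_split_stream())  # gaps near 0 ms; a real LLM stream shows larger gaps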
62 changes: 62 additions & 0 deletions jac-byllm/tests/fixtures/streaming_with_react.jac
@@ -0,0 +1,62 @@
"""Test streaming with ReAct method using MockLLM."""

import from byllm.lib { Model, MockToolCall }
import from datetime {datetime}

# Mock function that will be used as a tool
def get_current_date() -> str {
return "29-10-2025";
}

def calculate(expression: str) -> str {
return f"Result: {eval(expression)}";
}

# Create a mock LLM that simulates tool calling then final output
glob llm = Model(
model_name="mockllm",
outputs=[
# First, LLM decides to call get_current_date
MockToolCall(tool=get_current_date, args={}),
# Then, LLM decides to call calculate
MockToolCall(tool=calculate, args={"expression": "25 * 4"}),
# Finally, LLM calls finish_tool with the answer
# Note: finish_tool is automatically added by the system
"Based on my calculations, today's date is 29-10-2025 and 25 multiplied by 4 equals 100. This demonstrates the ReAct pattern with streaming support.",
]
);

"""Answer the question using available tools."""
def answer_question(question: str) -> str by llm(
method="ReAct",
tools=[get_current_date, calculate],
stream=True
);

with entry {
print("Testing ReAct with streaming...\n");

question = "What is today's date and what is 25 * 4?";
print(f"Question: {question}\n");
print("Answer (streaming): ");

result = answer_question(question);

# Result should be a generator (for streaming)
assert not isinstance(result, str), "Expected generator for streaming, got string";

# Collect all chunks
full_response = "";
for chunk in result {
print(chunk, end='', flush=True);
full_response += chunk;
}

print("\n");

# Verify the response contains expected content
assert "29-10-2025" in full_response, "Response should mention the date";
assert "100" in full_response, "Response should mention the calculation result";

print("\nTest passed! ✓");
}
11 changes: 11 additions & 0 deletions jac-byllm/tests/test_byllm.py
@@ -84,6 +84,17 @@ def test_streaming_output(self) -> None:
stdout_value = captured_output.getvalue()
self.assertIn('The orca whale, or killer whale, is one of the most intelligent and adaptable marine predators', stdout_value)

def test_streaming_with_react(self) -> None:
"""Test streaming output with ReAct method (tool calling)."""
captured_output = io.StringIO()
sys.stdout = captured_output
jac_import("streaming_with_react", base_path=self.fixture_abs_path("./"))
sys.stdout = sys.__stdout__
stdout_value = captured_output.getvalue()
self.assertIn('29-10-2025', stdout_value)
self.assertIn('100', stdout_value)
self.assertIn('Test passed!', stdout_value)

def test_by_expr(self) -> None:
"""Test by llm['as'].expression instead of llm() call."""
captured_output = io.StringIO()
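
As a side note, the manual sys.stdout swap used in these tests can also be written with contextlib.redirect_stdout, which restores stdout even if the fixture raises. A standalone sketch; run_fixture is only a placeholder for the jac_import call, not part of the suite.

import contextlib
import io

def run_fixture() -> None:
    # Placeholder for jac_import("streaming_with_react", base_path=...)
    print("29-10-2025 ... 100 ... Test passed!")

captured = io.StringIO()
with contextlib.redirect_stdout(captured):
    run_fixture()

assert "Test passed!" in captured.getvalue()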