4 changes: 3 additions & 1 deletion README.md
@@ -79,7 +79,7 @@ BugZooka supports two complementary modes for monitoring Slack channels that can
# Run with both polling AND socket mode
make run ARGS="--product openshift --ci prow --enable-socket-mode"
```

**Socket Mode Requirements:**
- An app-level token (`xapp-*`) must be configured as `SLACK_APP_TOKEN`
- Socket Mode must be enabled in your Slack app settings
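As a rough sketch of the startup check these requirements imply (the helper is hypothetical, not BugZooka's actual code):

```python
import os

def require_socket_mode_token() -> str:
    """Hypothetical check: Socket Mode needs an app-level token (xapp-*)."""
    token = os.getenv("SLACK_APP_TOKEN", "")
    if not token.startswith("xapp-"):
        raise RuntimeError(
            "SLACK_APP_TOKEN must hold an app-level token (xapp-*) "
            "when --enable-socket-mode is used"
        )
    return token
```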
@@ -140,6 +140,7 @@ GENERIC_INFERENCE_URL="YOUR_INFERENCE_ENDPOINT"
GENERIC_INFERENCE_TOKEN="YOUR_INFERENCE_TOKEN"
GENERIC_MODEL="YOUR_INFERENCE_MODEL"
```

**Note**: Make sure to provide values for all mandatory attributes, both for the product you intend to test and for the fallback (i.e. the GENERIC details) used to handle failover cases.
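As an illustration of that failover behaviour, here is a minimal sketch of env-var resolution. It assumes product-specific variables follow the same naming pattern as the `GENERIC_*` ones (e.g. `OPENSHIFT_INFERENCE_URL`); the helper itself is hypothetical:

```python
import os

def resolve_inference_config(product: str) -> dict:
    """Prefer product-specific settings; fall back to GENERIC_* on failover."""
    prefix = product.upper()

    def lookup(suffix: str) -> str:
        # e.g. OPENSHIFT_INFERENCE_URL, else GENERIC_INFERENCE_URL (assumed naming)
        return os.getenv(f"{prefix}_{suffix}") or os.environ[f"GENERIC_{suffix}"]

    return {
        "url": lookup("INFERENCE_URL"),
        "token": lookup("INFERENCE_TOKEN"),
        "model": lookup("MODEL"),
    }
```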


@@ -315,6 +316,7 @@ BugZooka has a dependency on [orion-mcp service](https://github.com/jtaleric/orion-mcp)
export QUAY_CRED='<base64 encoded pull secret>'
export BUGZOOKA_IMAGE='<bugzooka image tag>'
export BUGZOOKA_NAMESPACE='<your namespace>'
export JIRA_MCP_IMAGE='<jira mcp server image>'
**Collaborator:** Same as other comments about JIRA MCP isolation.

make deploy

# Cleanup resources
11 changes: 10 additions & 1 deletion bugzooka/analysis/prompts.py
@@ -110,7 +110,7 @@
- No emojis in tables
- Separate each config section with 80 equals signs.

**Remember:**
- The tools provide percentage changes - use them as provided
- CHECK thresholds (5% and 10%) before categorizing
- SORT by absolute percentage change (highest first) - this is mandatory
@@ -127,3 +127,12 @@
Beginning analysis now.
""",
}
# Jira tool prompt - used when Jira MCP tools are available
JIRA_TOOL_PROMPT = {
"system": (
"\n\nIMPORTANT: You have access to JIRA search tools. After analyzing the error, "
"ALWAYS search for related issues in JIRA using the search_jira_issues tool with the OCPBUGS project. "
"Extract key error terms, component names, or operators from the log summary to search for similar issues. "
"Include the top 3 most relevant JIRA issues in your final response under a 'Related JIRA Issues' section."
),
}
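A short usage sketch showing how this fragment is meant to be appended to a product system prompt when a Jira search tool is available; it mirrors the check added to `bugzooka/integrations/gemini_client.py` later in this diff, and the `tools` list here is illustrative:

```python
# Illustrative only: at runtime `tools` comes from the MCP client.
system_prompt = "You are an expert in OpenShift..."  # product-specific prompt
tools = []  # may contain a LangChain tool named "search_jira_issues"

if any(getattr(t, "name", "") == "search_jira_issues" for t in tools):
    system_prompt += JIRA_TOOL_PROMPT["system"]
```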
148 changes: 86 additions & 62 deletions bugzooka/integrations/gemini_client.py
@@ -12,6 +12,7 @@
INFERENCE_MAX_TOOL_ITERATIONS,
)
from bugzooka.integrations.inference import InferenceAPIUnavailableError
from bugzooka.analysis.prompts import JIRA_TOOL_PROMPT
**Collaborator:** Most of the changes here are formatting-related. Can we avoid them in this PR and keep only the real code changes? Those optimizations can be handled in follow-ups if required.


logger = logging.getLogger(__name__)
@@ -44,7 +45,7 @@ def __init__(self, api_key=None, base_url=None, verify_ssl=None, timeout=None):
# Timeout configuration
if timeout is None:
timeout = float(os.getenv("GEMINI_TIMEOUT", "60.0"))

logger.debug("Gemini client timeout set to %.1f seconds", timeout)

# SSL verification configuration
@@ -79,19 +80,23 @@ def chat_completions_create(self, messages, model="gemini-2.0-flash", **kwargs):
"""
try:
logger.debug("Calling Gemini API: %s, Model=%s", self.base_url, model)

response = self.client.chat.completions.create(
model=model, messages=messages, **kwargs
)

# Log token usage information
if hasattr(response, 'usage') and response.usage:
if hasattr(response, "usage") and response.usage:
usage = response.usage
logger.info("📊 Token usage - Prompt: %d, Completion: %d, Total: %d",
usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
logger.info(
"📊 Token usage - Prompt: %d, Completion: %d, Total: %d",
usage.prompt_tokens,
usage.completion_tokens,
usage.total_tokens,
)
else:
logger.debug("No usage information available in response")

logger.debug("Gemini API call successful")
return response
except Exception as e:
@@ -145,10 +150,10 @@ async def execute_tool_call(tool_name, tool_args, available_tools):
logger.debug("Tool arguments: %s", json.dumps(tool_args, indent=2))

# Check if the tool is async (has coroutine attribute or ainvoke method)
if hasattr(tool, 'coroutine') and tool.coroutine:
if hasattr(tool, "coroutine") and tool.coroutine:
# MCP tools have a coroutine attribute
result = await tool.ainvoke(tool_args)
elif hasattr(tool, 'ainvoke'):
elif hasattr(tool, "ainvoke"):
# Some tools have ainvoke method
result = await tool.ainvoke(tool_args)
else:
@@ -158,19 +163,23 @@ async def execute_tool_call(tool_name, tool_args, available_tools):
# Log result
result_str = str(result)
result_length = len(result_str)

# Check for empty or minimal results
if not result_str or result_str.strip() in ["", "null", "None", "{}", "[]"]:
logger.warning("⚠️ Tool %s returned empty or null result", tool_name)
elif len(result_str.strip()) < 50:
logger.warning("⚠️ Tool %s returned small result (%d chars): %s",
tool_name, result_length, result_str)
logger.warning(
"⚠️ Tool %s returned small result (%d chars): %s",
tool_name,
result_length,
result_str,
)
else:
logger.info("✅ Tool %s completed (%d chars)", tool_name, result_length)

# Log full output at DEBUG level
logger.debug("Tool %s output: %s", tool_name, result_str)

return result_str
except Exception as e:
error_msg = f"Error executing tool '{tool_name}': {str(e)}"
@@ -181,20 +190,17 @@ async def execute_tool_call(tool_name, tool_args, available_tools):


async def analyze_with_gemini_agentic(
messages: list,
tools=None,
model="gemini-2.0-flash",
max_iterations=None
messages: list, tools=None, model="gemini-2.0-flash", max_iterations=None
):
"""
Generic agentic loop for Gemini with tool calling support.

This function implements the agentic pattern where Gemini can iteratively:
1. Analyze the current context
2. Decide to call tools if needed
3. Process tool results
4. Generate final answer

:param messages: List of message dictionaries (system, user, assistant prompts)
:param tools: List of LangChain tools available for Gemini to call (optional)
:param model: Gemini model to use (default: gemini-2.0-flash)
@@ -203,18 +209,21 @@
"""
if max_iterations is None:
max_iterations = INFERENCE_MAX_TOOL_ITERATIONS

try:
gemini_client = GeminiClient()

# Convert LangChain tools to OpenAI format if provided
openai_tools = None
if tools:
openai_tools = convert_langchain_tools_to_openai_format(tools)
tool_names = [t["function"]["name"] for t in openai_tools]
logger.info("Starting Gemini analysis with %d tools: %s",
len(openai_tools), ", ".join(tool_names))

logger.info(
"Starting Gemini analysis with %d tools: %s",
len(openai_tools),
", ".join(tool_names),
)

logger.debug("Starting agentic loop with %d messages", len(messages))

# Tool calling loop - iterate until we get a final answer or hit max iterations
@@ -240,44 +249,51 @@
response_message = response.choices[0].message

# Check if Gemini wants to call tools
tool_calls = getattr(response_message, 'tool_calls', None)
tool_calls = getattr(response_message, "tool_calls", None)

if not tool_calls:
# No tool calls - we have the final answer
content = response_message.content
if content:
logger.info("Analysis complete after %d iteration(s)", iteration)
logger.debug("Response: %s", content[:200] + "..." if len(content) > 200 else content)
logger.debug(
"Response: %s",
content[:200] + "..." if len(content) > 200 else content,
)
else:
logger.warning("Gemini returned None content, using empty string")
content = ""
return content

# Gemini wants to call tools - execute them
tool_names_called = [tc.function.name for tc in tool_calls]
logger.info("Calling %d tool(s): %s", len(tool_calls), ", ".join(tool_names_called))
logger.info(
"Calling %d tool(s): %s", len(tool_calls), ", ".join(tool_names_called)
)

# Add the assistant's message with tool calls to conversation
messages.append({
"role": "assistant",
"content": response_message.content or "",
"tool_calls": [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments
messages.append(
{
"role": "assistant",
"content": response_message.content or "",
"tool_calls": [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments,
},
}
}
for tc in tool_calls
]
})
for tc in tool_calls
],
}
)

# Execute each tool call and add results to messages
for tool_call in tool_calls:
function_name = tool_call.function.name

try:
function_args = json.loads(tool_call.function.arguments)
except json.JSONDecodeError as e:
@@ -286,25 +302,27 @@
else:
# Execute the tool (await since it's now async)
function_result = await execute_tool_call(
function_name,
function_args,
tools
function_name, function_args, tools
)

# Add tool result to messages
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"name": function_name,
"content": function_result
})
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"name": function_name,
"content": function_result,
}
)

# Continue loop to let Gemini process tool results

# If we hit max iterations without a final answer
logger.warning("Reached maximum iterations (%d) without final answer", max_iterations)
logger.warning(
"Reached maximum iterations (%d) without final answer", max_iterations
)
return "Analysis incomplete: Maximum tool calling iterations reached. Please try again with a simpler query."

except Exception as e:
logger.error("Error in Gemini agentic loop: %s", str(e), exc_info=True)
raise InferenceAPIUnavailableError(
@@ -318,7 +336,7 @@ async def analyze_log_with_gemini(
error_summary: str,
model="gemini-2.0-flash",
tools=None,
max_iterations=None
max_iterations=None,
):
"""
Analyzes log summaries using Gemini API with product-specific prompts and optional tool calling.
@@ -333,7 +351,7 @@
"""
try:
logger.info("Starting log analysis for product: %s", product)

prompt_config = product_config["prompt"][product]
try:
formatted_content = prompt_config["user"].format(
@@ -342,20 +360,26 @@
except KeyError:
formatted_content = prompt_config["user"].format(summary=error_summary)

logger.debug("Error summary: %s", error_summary[:150] + "..." if len(error_summary) > 150 else error_summary)
logger.debug(
"Error summary: %s",
error_summary[:150] + "..." if len(error_summary) > 150 else error_summary,
)

# Append Jira prompt if Jira MCP tools are available
system_prompt = prompt_config["system"]
if tools and any(getattr(t, "name", "") == "search_jira_issues" for t in tools):
logger.info("Jira MCP tools detected - injecting Jira prompt")
system_prompt += JIRA_TOOL_PROMPT["system"]

messages = [
{"role": "system", "content": prompt_config["system"]},
{"role": "system", "content": system_prompt},
{"role": "user", "content": formatted_content},
{"role": "assistant", "content": prompt_config["assistant"]},
]

# Use the generic agentic loop
return await analyze_with_gemini_agentic(
messages=messages,
tools=tools,
model=model,
max_iterations=max_iterations
messages=messages, tools=tools, model=model, max_iterations=max_iterations
)

except Exception as e:
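Taken together, a sketch of how the updated entry point might be driven, assuming the parameter list matches the body shown above and that `product_config` and `mcp_tools` are already loaded (the values are illustrative):

```python
import asyncio

# Hypothetical driver code, not part of this diff.
result = asyncio.run(
    analyze_log_with_gemini(
        product="openshift",
        product_config=product_config,  # {"prompt": {"openshift": {...}}}
        error_summary=error_summary,
        tools=mcp_tools,  # may include search_jira_issues
    )
)
print(result)
```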
4 changes: 4 additions & 0 deletions kustomize/base/configmap-mcp-config.yaml
@@ -12,6 +12,10 @@ data:
"orion_mcp_server": {
"url": "http://orion-mcp.orion-mcp:3030/mcp",
"transport": "streamable_http"
},
"jira_mcp_server": {
"url": "http://jira-mcp.${BUGZOOKA_NAMESPACE}:3031/mcp",
"transport": "streamable_http"
}
}
}
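The `${BUGZOOKA_NAMESPACE}` placeholder suggests the URL is expanded at deploy or load time. A minimal sketch of one way such a config could be consumed; the mount path and surrounding keys are assumptions, not taken from this diff:

```python
import json
import os
from string import Template

# Hypothetical mount path for this ConfigMap.
raw = open("/etc/bugzooka/mcp-config.json").read()
# safe_substitute tolerates any other "$" occurrences in the JSON.
expanded = Template(raw).safe_substitute(
    BUGZOOKA_NAMESPACE=os.environ["BUGZOOKA_NAMESPACE"]
)
servers = json.loads(expanded)  # {"jira_mcp_server": {"url": ..., "transport": ...}, ...}
```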
10 changes: 5 additions & 5 deletions kustomize/base/configmap-prompts.yaml
@@ -8,11 +8,11 @@ metadata:
data:
prompts.json: |
{
"OPENSHIFT_PROMPT": {
"system": "You are an expert in OpenShift, Kubernetes, and cloud infrastructure. Your task is to analyze logs and summaries related to OpenShift environments. Given a log summary, identify the root cause, potential fixes, and affected components. Be as consise as possible (under 5000 characters), but precise and avoid generic troubleshooting steps. Prioritize OpenShift-specific debugging techniques. Keep in mind that the cluster is ephemeral and is destroyed after the build is complete, but all relevant logs and metrics are available. Use markdown formatting for the output with only one level of bullet points, do not use bold text except for the headers.",
"user": "Here is the log summary from an OpenShift environment:\n\n{summary}\n\nBased on this summary, provide a structured breakdown of:\n- The OpenShift component likely affected (e.g., etcd, kube-apiserver, ingress, SDN, Machine API)\n- The probable root cause\n- Steps to verify the issue further\n- Suggested resolution, including OpenShift-specific commands or configurations.",
"assistant": "**Affected Component:** <Identified component>\n\n**Probable Root Cause:** <Describe why this issue might be occurring>\n\n**Verification Steps:**\n- <Step 1>\n- <Step 2>\n- <Step 3>\n\n**Suggested Resolution:**\n- <OpenShift CLI commands>\n- <Relevant OpenShift configurations>"
},
"OPENSHIFT_PROMPT": {
**Collaborator:** Strange, do you have a built-in linter in your IDE? Almost all the changes are being reformatted.

"system": "You are an expert in OpenShift, Kubernetes, and cloud infrastructure. Your task is to analyze logs and summaries related to OpenShift environments. Given a log summary, identify the root cause, potential fixes, and affected components. Be as consise as possible (under 5000 characters), but precise and avoid generic troubleshooting steps. Prioritize OpenShift-specific debugging techniques. Keep in mind that the cluster is ephemeral and is destroyed after the build is complete, but all relevant logs and metrics are available. Use markdown formatting for the output with only one level of bullet points, do not use bold text except for the headers.",
"user": "Here is the log summary from an OpenShift environment:\n\n{summary}\n\nBased on this summary, provide a structured breakdown of:\n- The OpenShift component likely affected (e.g., etcd, kube-apiserver, ingress, SDN, Machine API)\n- The probable root cause\n- Steps to verify the issue further\n- Suggested resolution, including OpenShift-specific commands or configurations",
"assistant": "**Affected Component:** <Identified component>\n\n**Probable Root Cause:** <Describe why this issue might be occurring>\n\n**Verification Steps:**\n- <Step 1>\n- <Step 2>\n- <Step 3>\n\n**Suggested Resolution:**\n- <OpenShift CLI commands>\n- <Relevant OpenShift configurations>"
},
"ANSIBLE_PROMPT": {
"system": "You are an expert in Ansible automation, playbook debugging, and infrastructure as code (IaC). Your task is to analyze log summaries related to Ansible execution, playbook failures, and task errors. Given a log summary, identify the root cause, affected tasks, and potential fixes. Prioritize Ansible-specific debugging techniques over generic troubleshooting.",
"user": "Here is the log summary from an Ansible execution:\n\n{summary}\n\nBased on this summary, provide a structured breakdown of:\n- The failed Ansible task and module involved\n- The probable root cause\n- Steps to reproduce or verify the issue\n- Suggested resolution, including relevant playbook changes or command-line fixes.",