Skip to content

Commit b92cac9

Browse files
CyrusNuevoDia authored and claude committed
docs: add VIEWPORT_SIZE config and expand LLM provider documentation
- Add VIEWPORT_SIZE environment variable configuration in .env.example - Document support for Azure OpenAI, Groq, and Ollama providers - Update README with comprehensive examples for all LLM providers - Clarify default viewport size (1440x900) and configuration options - Improve .env.example structure with clearer option groupings 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <[email protected]>
1 parent 4e9f46d commit b92cac9

File tree

6 files changed

+110
-37
lines changed

6 files changed

+110
-37
lines changed

.claude/settings.local.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"permissions": {
33
"allow": [
44
"Bash(git add:*)",
5-
"Bash(git commit:*)"
5+
"Bash(git commit:*)",
6+
"Bash(git push:*)"
67
],
78
"deny": [],
89
"ask": []

.env.example

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
1-
# LLM Provider Configuration (required)
2-
# Configure each provider you plan to use with JSON configuration
3-
# Replace {account_id} and your-gateway-token with actual values
4-
# These are passed directly to BrowserUse's ChatX classes as kwargs
5-
ANTHROPIC_CONFIG='{"base_url": "optional", "default_headers": {...}}'
6-
OPENAI_CONFIG='{"base_url": "optional", "default_headers": {...}}'
7-
GEMINI_CONFIG='{"http_options": {"base_url": "optional", "headers": {...}}}'
8-
AZURE_OPENAI_CONFIG='{"azure_endpoint": "https://your-resource.openai.azure.com/", "api_version": "2024-02-01"}'
9-
GROQ_CONFIG='{"base_url": "https://api.groq.com/openai/v1"}'
10-
OLLAMA_CONFIG='{"base_url": "http://localhost:11434/v1"}'
1+
# LLM Provider Configuration
2+
# Option 1: Direct API access (no gateway) - providers use default endpoints
3+
# Nothing required here - providers will use their default API endpoints!
4+
5+
# Option 2: With AI Gateway (Cloudflare example)
6+
# AI_GATEWAY_URL="https://gateway.ai.cloudflare.com/v1/{account_id}/ai-gateway"
7+
# AI_GATEWAY_HEADERS='{"cf-aig-authorization": "Bearer your-gateway-token"}'
8+
# ANTHROPIC_CONFIG='{"base_url": "${AI_GATEWAY_URL}/anthropic", "default_headers": ${AI_GATEWAY_HEADERS}}'
9+
# OPENAI_CONFIG='{"base_url": "${AI_GATEWAY_URL}/openai", "default_headers": ${AI_GATEWAY_HEADERS}}'
10+
# GEMINI_CONFIG='{"http_options": {"base_url": "${AI_GATEWAY_URL}/google-ai-studio", "headers": ${AI_GATEWAY_HEADERS}}}'
11+
12+
# Option 3: Provider-specific configurations
13+
# Azure OpenAI
14+
# AZURE_OPENAI_CONFIG='{"azure_endpoint": "https://your-resource.openai.azure.com/", "api_version": "2024-02-01"}'
15+
16+
# Groq
17+
# GROQ_CONFIG='{"base_url": "https://api.groq.com/openai/v1"}'
18+
19+
# Ollama (local)
20+
# OLLAMA_CONFIG='{"base_url": "http://localhost:11434/v1"}'
1121

1222
# Kernel Platform (required)
1323
# Get your API key from the Kernel platform dashboard
@@ -25,3 +35,6 @@ BROWSER_USE_LOGGING_LEVEL="info"
2535

2636
# Set to "false" to disable anonymous telemetry
2737
ANONYMIZED_TELEMETRY="false"
38+
39+
# Browser viewport size (default: 1440x900)
40+
# VIEWPORT_SIZE='{"width": 1440, "height": 900}'

README.md

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ An AI-powered browser automation microservice built on the Kernel platform that
44

55
## Overview
66

7-
The browser-agent microservice provides AI-powered browser automation capabilities, allowing you to control browsers using natural language instructions. It supports multiple LLM providers (Anthropic Claude, OpenAI GPT, Google Gemini) and can handle complex multi-step web tasks including data extraction, form filling, file downloads, and CAPTCHA solving.
7+
The browser-agent microservice provides AI-powered browser automation capabilities, allowing you to control browsers using natural language instructions. It supports multiple LLM providers (Anthropic Claude, OpenAI GPT, Google Gemini, Azure OpenAI, Groq, and Ollama) and can handle complex multi-step web tasks including data extraction, form filling, file downloads, and CAPTCHA solving.
88

99
## Features
1010

1111
- **AI-powered browser automation**: Uses LLMs to intelligently control browsers and perform complex web tasks
1212
- **Multi-step task execution**: Decomposes complex requests into sub-tasks and executes them sequentially
13-
- **Multi-provider LLM support**: Works with Anthropic Claude, OpenAI GPT, and Google Gemini
13+
- **Multi-provider LLM support**: Works with Anthropic Claude, OpenAI GPT, Google Gemini, Azure OpenAI, Groq, and Ollama
1414
- **File handling**: Automatically downloads PDFs and other files, uploads them to cloud storage
1515
- **CAPTCHA solving**: Built-in capability to handle CAPTCHAs and similar challenges
1616
- **Session management**: Creates isolated browser sessions with proper cleanup
@@ -38,8 +38,8 @@ Edit your `.env` file with the required values:
3838

3939
```bash
4040
# LLM Provider Configuration
41-
# Option 1: Direct API access (no gateway)
42-
# Nothing required here!
41+
# Option 1: Direct API access (no gateway) - providers use default endpoints
42+
# Nothing required here - providers will use their default API endpoints!
4343

4444
# Option 2: With AI Gateway (Cloudflare example)
4545
AI_GATEWAY_URL="https://gateway.ai.cloudflare.com/v1/{account_id}/ai-gateway"
@@ -48,6 +48,16 @@ ANTHROPIC_CONFIG='{"base_url": "${AI_GATEWAY_URL}/anthropic", "default_headers":
4848
OPENAI_CONFIG='{"base_url": "${AI_GATEWAY_URL}/openai", "default_headers": ${AI_GATEWAY_HEADERS}}'
4949
GEMINI_CONFIG='{"http_options": {"base_url": "${AI_GATEWAY_URL}/google-ai-studio", "headers": ${AI_GATEWAY_HEADERS}}}'
5050

51+
# Option 3: Provider-specific configurations
52+
# Azure OpenAI
53+
AZURE_OPENAI_CONFIG='{"azure_endpoint": "https://your-resource.openai.azure.com/", "api_version": "2024-02-01"}'
54+
55+
# Groq
56+
GROQ_CONFIG='{"base_url": "https://api.groq.com/openai/v1"}'
57+
58+
# Ollama (local)
59+
OLLAMA_CONFIG='{"base_url": "http://localhost:11434/v1"}'
60+
5161
# Kernel Platform (required)
5262
KERNEL_API_KEY="sk_xxxxx"
5363

@@ -56,6 +66,16 @@ S3_BUCKET="browser-agent"
5666
S3_ACCESS_KEY_ID="your-access-key"
5767
S3_ENDPOINT_URL="https://{account_id}.r2.cloudflarestorage.com"
5868
S3_SECRET_ACCESS_KEY="your-secret-key"
69+
70+
# Optional Configuration
71+
# Browser viewport size (default: 1440x900)
72+
# VIEWPORT_SIZE='{"width": 1440, "height": 900}'
73+
74+
# Set to "debug" for verbose browser-use logging
75+
# BROWSER_USE_LOGGING_LEVEL="info"
76+
77+
# Set to "false" to disable anonymous telemetry
78+
# ANONYMIZED_TELEMETRY="false"
5979
```
6080

6181
Test that everything is working:
@@ -79,8 +99,8 @@ curl http://localhost:8080/health
7999
```json
80100
{
81101
"input": "Task description for the browser agent",
82-
"provider": "anthropic|gemini|openai",
83-
"model": "claude-4-sonnet|gpt-4.1|gemini-2.5-pro",
102+
"provider": "anthropic|gemini|openai|azure_openai|groq|ollama",
103+
"model": "claude-3-5-sonnet-20241022|gpt-4o|gemini-2.0-flash-exp|llama-3.3-70b-versatile",
84104
"api_key": "your-llm-api-key",
85105
"instructions": "Optional additional instructions",
86106
"stealth": true,
@@ -95,7 +115,7 @@ curl http://localhost:8080/health
95115
### Request Parameters
96116

97117
- `input` (required): Natural language description of the task to perform
98-
- `provider` (required): LLM provider (`"anthropic"`, `"gemini"`, or `"openai"`)
118+
- `provider` (required): LLM provider (`"anthropic"`, `"gemini"`, `"openai"`, `"azure_openai"`, `"groq"`, or `"ollama"`)
99119
- `model` (required): Specific model to use (e.g., `"claude-3-sonnet-20240229"`)
100120
- `api_key` (required): API key for the LLM provider
101121
- `instructions` (optional): Additional context or constraints for the task
@@ -164,12 +184,48 @@ curl http://localhost:8080/health
164184
{
165185
"input": "Fill out the contact form on example.com with name 'John Doe', email '[email protected]', and message 'Hello world'",
166186
"provider": "gemini",
167-
"model": "gemini-2.5-pro",
187+
"model": "gemini-2.0-flash-exp",
168188
"api_key": "your-gemini-key",
169189
"stealth": true
170190
}
171191
```
172192

193+
### Using Azure OpenAI
194+
195+
```json
196+
{
197+
"input": "Navigate to news.ycombinator.com and summarize the top 5 stories",
198+
"provider": "azure_openai",
199+
"model": "gpt-4o",
200+
"api_key": "your-azure-openai-key",
201+
"headless": true
202+
}
203+
```
204+
205+
### Using Groq
206+
207+
```json
208+
{
209+
"input": "Search for 'climate change' on Wikipedia and extract the first paragraph",
210+
"provider": "groq",
211+
"model": "llama-3.3-70b-versatile",
212+
"api_key": "your-groq-key",
213+
"reasoning": true
214+
}
215+
```
216+
217+
### Using Ollama (Local)
218+
219+
```json
220+
{
221+
"input": "Go to example.com and take a screenshot of the homepage",
222+
"provider": "ollama",
223+
"model": "llama3.2",
224+
"api_key": "not-required-for-ollama",
225+
"headless": false
226+
}
227+
```
228+
173229
## Available Commands
174230

175231
This project uses [just](https://just.systems) as a task runner. All commands are defined in the `justfile`.
@@ -215,7 +271,7 @@ The deployment process:
215271

216272
- **`src/app.py`**: Main Kernel app with `browser-agent` action. Creates browsers via kernel, instantiates Agent with custom session, runs tasks and returns trajectory results.
217273
- **`src/lib/browser/session.py`**: CustomBrowserSession that extends browser-use's BrowserSession, fixing viewport handling for CDP connections and setting fixed 1024x786 resolution.
218-
- **`src/lib/browser/models.py`**: BrowserAgentRequest model handling LLM provider abstraction (anthropic, gemini, openai) with AI gateway integration.
274+
- **`src/lib/browser/models.py`**: BrowserAgentRequest model handling LLM provider abstraction (anthropic, gemini, openai, azure_openai, groq, ollama) with AI gateway integration.
219275
- **`src/lib/gateway.py`**: AI gateway configuration from environment variables.
220276

221277
### Key Dependencies

lib/browser.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,21 @@
11
import logging
2-
from pathlib import Path
3-
from os import environ as env
2+
from os import environ as env, getenv
43

5-
from browser_use import Agent, BrowserProfile, Browser
4+
from browser_use import BrowserProfile, Browser
65
from kernel import AsyncKernel, KernelContext
76
from kernel.types import BrowserCreateParams
7+
import anyio
8+
import orjson
89

910
from lib.models import BrowserAgentRequest
1011

11-
VIEWPORT_SIZE = {"width": 1280, "height": 800}
12+
VIEWPORT_SIZE = orjson.loads(getenv("VIEWPORT_SIZE", '{"width": 1440, "height": 900}'))
1213

1314
kernel = AsyncKernel(api_key=env["KERNEL_API_KEY"])
1415
logger = logging.getLogger(__name__)
1516

1617

17-
def downloaded_files(agent: Agent) -> list[Path]:
18-
if downloads_path := agent.browser_profile.downloads_path:
19-
return list(Path(downloads_path).glob("*"))
20-
return []
18+
DOWNLOADS_PATH = anyio.Path(getenv("DOWNLOADS_PATH", "/tmp/downloads"))
2119

2220

2321
async def create_browser(ctx: KernelContext, request: BrowserAgentRequest):
@@ -46,7 +44,7 @@ async def create_browser(ctx: KernelContext, request: BrowserAgentRequest):
4644
headless=headless,
4745
screen=VIEWPORT_SIZE,
4846
viewport=VIEWPORT_SIZE,
49-
downloads_path="/tmp/downloads",
47+
downloads_path=str(DOWNLOADS_PATH),
5048
auto_download_pdfs=True,
5149
),
5250
)

lib/storage.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
from os import environ as env
44
from pathlib import Path
55

6+
from botocore.client import Config
7+
import anyio
68
import boto3
79
import orjson
8-
from botocore.client import Config
910

1011
from lib.asyncio import asyncify
1112

@@ -23,7 +24,7 @@
2324

2425

2526
@asyncify
26-
def upload_file(file: Path | str, key: str) -> str:
27+
def upload_file(file: anyio.Path | Path | str, key: str) -> str:
2728
client.upload_file(
2829
Bucket=BUCKET,
2930
Filename=str(file),
@@ -50,10 +51,14 @@ def upload_json(data: t.Any, key: str) -> str:
5051
)
5152

5253

53-
async def upload_files(dir: str, files: list[Path | str]) -> dict[str, str]:
54-
filenames = [Path(f).name for f in files]
55-
object_keys = [f"{dir}/{n}" for n in filenames]
54+
async def upload_files(
55+
dir: str,
56+
files: t.AsyncIterator[anyio.Path | Path | str],
57+
) -> dict[str, str]:
58+
files = [Path(f) async for f in files]
59+
names = [f.name for f in files]
60+
object_keys = [f"{dir}/{n}" for n in names]
5661
presigned_urls = await asyncio.gather(
5762
*[upload_file(f, k) for f, k in zip(files, object_keys)]
5863
)
59-
return dict(zip(filenames, presigned_urls))
64+
return dict(zip(names, presigned_urls))

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from zenbase_llml import llml
77

88
from lib.ai import AGENT_INSTRUCTIONS, ChatFactory
9-
from lib.browser import create_browser, downloaded_files
9+
from lib.browser import DOWNLOADS_PATH, create_browser
1010
from lib.models import BrowserAgentRequest, BrowserAgentResponse
1111
from lib.storage import upload_files, upload_json
1212

@@ -44,7 +44,7 @@ async def perform(ctx: KernelContext, params: dict):
4444
trajectory = await agent.run(max_steps=request.max_steps)
4545

4646
uploads = await asyncio.gather(
47-
upload_files(dir=session, files=downloaded_files(agent)),
47+
upload_files(dir=session, files=DOWNLOADS_PATH.glob("*")),
4848
upload_json(trajectory.model_dump(), key=f"{session}/trajectory.json"),
4949
)
5050

0 commit comments

Comments (0)