docs: add VIEWPORT_SIZE config and expand LLM provider documentation
- Add VIEWPORT_SIZE environment variable configuration in .env.example
- Document support for Azure OpenAI, Groq, and Ollama providers
- Update README with comprehensive examples for all LLM providers
- Clarify default viewport size (1440x900) and configuration options
- Improve .env.example structure with clearer option groupings
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
README.md (+65 −9 lines)
@@ -4,13 +4,13 @@ An AI-powered browser automation microservice built on the Kernel platform that
## Overview
The browser-agent microservice provides AI-powered browser automation capabilities, allowing you to control browsers using natural language instructions. It supports multiple LLM providers (Anthropic Claude, OpenAI GPT, Google Gemini, Azure OpenAI, Groq, and Ollama) and can handle complex multi-step web tasks including data extraction, form filling, file downloads, and CAPTCHA solving.
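In practice, a request to the `browser-agent` action pairs a natural-language instruction with a provider selection. A minimal illustrative payload (field names taken from the provider examples later in this README; the specific model value is only an example):

```json
{
  "input": "Go to example.com and extract the page title",
  "provider": "openai",
  "model": "gpt-4o",
  "api_key": "your-openai-key"
}
```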
## Features
- **AI-powered browser automation**: Uses LLMs to intelligently control browsers and perform complex web tasks
- **Multi-step task execution**: Decomposes complex requests into sub-tasks and executes them sequentially
- **Multi-provider LLM support**: Works with Anthropic Claude, OpenAI GPT, Google Gemini, Azure OpenAI, Groq, and Ollama
- **File handling**: Automatically downloads PDFs and other files, uploads them to cloud storage
- **CAPTCHA solving**: Built-in capability to handle CAPTCHAs and similar challenges
- **Session management**: Creates isolated browser sessions with proper cleanup
@@ -38,8 +38,8 @@ Edit your `.env` file with the required values:
```bash
# LLM Provider Configuration
# Option 1: Direct API access (no gateway) - providers use default endpoints
# Nothing required here - providers will use their default API endpoints!
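
# Optional: override the browser viewport via VIEWPORT_SIZE
# (this commit documents a 1440x900 default; the WIDTHxHEIGHT
# value format shown below is an assumption, not confirmed by this diff)
VIEWPORT_SIZE=1440x900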
"input": "Fill out the contact form on example.com with name 'John Doe', email '[email protected]', and message 'Hello world'",
166
186
"provider": "gemini",
167
-
"model": "gemini-2.5-pro",
187
+
"model": "gemini-2.0-flash-exp",
168
188
"api_key": "your-gemini-key",
169
189
"stealth": true
170
190
}
171
191
```
172
192

### Using Azure OpenAI

```json
{
  "input": "Navigate to news.ycombinator.com and summarize the top 5 stories",
  "provider": "azure_openai",
  "model": "gpt-4o",
  "api_key": "your-azure-openai-key",
  "headless": true
}
```

### Using Groq

```json
{
  "input": "Search for 'climate change' on Wikipedia and extract the first paragraph",
  "provider": "groq",
  "model": "llama-3.3-70b-versatile",
  "api_key": "your-groq-key",
  "reasoning": true
}
```
### Using Ollama (Local)

```json
{
  "input": "Go to example.com and take a screenshot of the homepage",
  "provider": "ollama",
  "model": "llama3.2",
  "api_key": "not-required-for-ollama",
  "headless": false
}
```
## Available Commands
This project uses [just](https://just.systems) as a task runner. All commands are defined in the `justfile`.
@@ -215,7 +271,7 @@ The deployment process:
- **`src/app.py`**: Main Kernel app with the `browser-agent` action. Creates browsers via Kernel, instantiates the Agent with a custom session, runs tasks, and returns trajectory results.
- **`src/lib/browser/session.py`**: CustomBrowserSession that extends browser-use's BrowserSession, fixing viewport handling for CDP connections and setting a fixed 1024x786 resolution.
- **`src/lib/browser/models.py`**: BrowserAgentRequest model handling LLM provider abstraction (anthropic, gemini, openai, azure_openai, groq, ollama) with AI gateway integration.
- **`src/lib/gateway.py`**: AI gateway configuration from environment variables.