Merge pull request #34 from meta-llama/update-facility-eval

heyjustinai · web-flow · commit dd9232fab0b1 · 2025-07-24T21:20:20.000+01:00
fix: update evals to use OpenRouter for the facility-support-analyzer use case
diff --git a/use-cases/facility-support-analyzer/eval.ipynb b/use-cases/facility-support-analyzer/eval.ipynb
@@ -19,6 +19,7 @@
    "source": [
     "from typing import Any, Dict\n",
     "import json\n",
+    "import re\n",
     "\n",
     "def parse_json(input_string: str):\n",
     "    \"\"\"\n",
@@ -88,14 +89,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "6a447f4a-5aac-4f85-8393-6f9bade1ce30",
    "metadata": {},
    "outputs": [],
    "source": [
     "import yaml\n",
     "\n",
-    "with open('facility_v2_train.json') as stream:\n",
+    "with open('dataset.json') as stream:\n",
     "    dataset = json.load(stream)\n",
     "\n",
     "with open('facility_prompt.yaml') as stream:\n",
@@ -104,58 +105,88 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "ba35209a-c778-4d9b-a575-a27bb9078caf",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "60"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "dataset_test = dataset[int(len(dataset)*0.7):]\n",
     "len(dataset_test)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "07f83eb5-a957-4f77-b6c9-fdd943f58cc2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# from openai import OpenAI\n",
-    "from gen_ai_hub.proxy.native.openai import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3f5e7904-2ca8-4784-9937-6fa824b3d109",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "client = OpenAI()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "e68c6fd3-b191-47e4-9d18-9fa1596ecb50",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/justinai/anaconda3/envs/sap-prompt-opt/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Processing batches:  92%|█████████▏| 11/12 [00:58<00:05,  5.46s/it]"
+     ]
+    }
+   ],
    "source": [
+    "import asyncio\n",
     "from tqdm.auto import tqdm\n",
+    "from openai import AsyncOpenAI\n",
+    "import os\n",
     "\n",
-    "result = []\n",
+    "# Configure OpenRouter client\n",
+    "client = AsyncOpenAI(\n",
+    "    base_url=\"https://openrouter.ai/api/v1\",\n",
+    "    api_key=os.getenv(\"OPENROUTER_API_KEY\"),  # Make sure to set this environment variable\n",
+    ")\n",
     "\n",
-    "for entry in tqdm(dataset_test):\n",
-    "    output = client.chat.completions.create(\n",
-    "        model=\"gpt-4o\",\n",
-    "        messages=[\n",
-    "            {\"role\": \"system\", \"content\": prompt[\"system\"]},\n",
-    "            {\"role\": \"user\", \"content\": prompt[\"user\"].format(**entry[\"fields\"])},\n",
-    "        ],\n",
-    "        temperature=0.\n",
-    "    )\n",
-    "    prediction = output.choices[0].message.content\n",
-    "    result.append(evaluate(entry[\"answer\"], prediction))\n",
+    "async def process_entry(entry):\n",
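+    "    \"\"\"Send one dataset entry to the model via OpenRouter and return its evaluation, or an {\"error\": ...} dict if the request or evaluation raises.\"\"\"\n",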
+    "    try:\n",
+    "        output = await client.chat.completions.create(\n",
+    "            model=\"meta-llama/llama-3.3-70b-instruct\",\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": prompt[\"system\"]},\n",
+    "                {\"role\": \"user\", \"content\": prompt[\"user\"].format(**entry[\"fields\"])},\n",
+    "            ],\n",
+    "            temperature=0.\n",
+    "        )\n",
+    "        prediction = output.choices[0].message.content\n",
+    "        return evaluate(entry[\"answer\"], prediction)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing entry: {e}\")\n",
+    "        return {\"error\": str(e)}\n",
+    "\n",
+    "async def process_batch(entries, batch_size=10):\n",
+    "    \"\"\"Process entries in batches of batch_size, running each batch concurrently, to avoid rate limits\"\"\"\n",
+    "    results = []\n",
+    "    \n",
+    "    for i in tqdm(range(0, len(entries), batch_size), desc=\"Processing batches\"):\n",
+    "        batch = entries[i:i + batch_size]\n",
+    "        batch_results = await asyncio.gather(*[process_entry(entry) for entry in batch])\n",
+    "        results.extend(batch_results)\n",
+    "        \n",
+    "        # Optional: Add a small delay between batches to be respectful to the API\n",
+    "        if i + batch_size < len(entries):\n",
+    "            await asyncio.sleep(0.1)\n",
+    "    \n",
+    "    return results\n",
+    "\n",
+    "# Run the batch processing\n",
+    "result = await process_batch(dataset_test, batch_size=24)  # Adjust batch_size as needed\n",
     "\n",
     "    "
    ]
@@ -174,25 +205,23 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "baf28a90-19a6-44c7-af1d-125b01cf21fa",
+   "id": "f751aad4-c534-4f9c-a3c2-fffc06a1c485",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# gpt-4o -> {'is_valid_json': 0.967, 'correct_categories': 0.895, 'correct_sentiment': 0.517, 'correct_urgency': 0.767, 'total': 0.726}"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f751aad4-c534-4f9c-a3c2-fffc06a1c485",
+   "id": "3336a411",
    "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "sap-prompt-opt",
    "language": "python",
    "name": "python3"
   },
@@ -206,7 +235,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.11"
+   "version": "3.9.21"
   }
  },
  "nbformat": 4,