From 9b59d2e35048db10d65c54724df95b7b9b1af1d0 Mon Sep 17 00:00:00 2001 From: mlcommons-bot Date: Fri, 20 Dec 2024 22:46:03 +0000 Subject: [PATCH 001/165] [Automated Commit] Format Codebase --- loadgen/issue_query_controller.cc | 12 ++++---- loadgen/logging.cc | 3 +- loadgen/logging.h | 51 ++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index c1abea9d14..4c5ca66f0c 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -459,8 +459,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Too many outstanding queries." << " issued " - << queries_issued_total << " outstanding " + << " Ending early: Too many outstanding queries." + << " issued " << queries_issued_total << " outstanding " << queries_outstanding; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else @@ -499,8 +499,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max query count reached." << " query_count " - << queries_issued; + << " Ending early: Max query count reached." + << " query_count " << queries_issued; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), @@ -519,8 +519,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max test duration reached." << " duration_ns " - << duration.count(); + << " Ending early: Max test duration reached." + << " duration_ns " << duration.count(); MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), diff --git a/loadgen/logging.cc b/loadgen/logging.cc index 807c1954a8..d7e83e54b9 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -812,7 +812,8 @@ void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) { if (max_entry_vector_size > kTlsLogReservedEntryCount) { #if USE_NEW_LOGGING_FORMAT std::stringstream msg; - msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid() + msg << "Logging allocation detected:" + << " tid: " << tls_logger->Tid() << " reserved_entries: " << kTlsLogReservedEntryCount << " max_entries: " << max_entry_vector_size; MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str()); diff --git a/loadgen/logging.h b/loadgen/logging.h index c5514562a5..a557e4b9a8 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -119,10 +119,13 @@ class ChromeTracer { void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid, PerfClock::time_point start, PerfClock::time_point end, const Args... 
args) { - *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\"," - << "\"pid\":" << pid << "," << "\"tid\":" << tid << "," + *out_ << "{\"name\":\"" << name << "\"," + << "\"ph\":\"X\"," + << "\"pid\":" << pid << "," + << "\"tid\":" << tid << "," << "\"ts\":" << Micros(start - origin_).count() << "," - << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{"; + << "\"dur\":" << Micros(end - start).count() << "," + << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; } @@ -130,9 +133,12 @@ class ChromeTracer { template void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," - << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id - << "," << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," + << "\"cat\":\"default\"," + << "\"ph\":\"b\"," + << "\"pid\":" << pid << "," + << "\"id\":" << id << "," + << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -141,9 +147,12 @@ class ChromeTracer { template void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," - << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id - << "," << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," + << "\"cat\":\"default\"," + << "\"ph\":\"n\"," + << "\"pid\":" << pid << "," + << "\"id\":" << id << "," + << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -152,15 +161,19 @@ class ChromeTracer { template void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time) { - *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," - << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id - << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n"; + *out_ << "{\"name\":\"" << name << "\"," + << "\"cat\":\"default\"," + << "\"ph\":\"e\", " + << "\"pid\":" << pid << "," + << "\"id\":" << id << "," + << "\"ts\":" << Micros(time - origin_).count() << "},\n"; } template void AddCounterEvent(const std::string& name, uint64_t pid, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\"," + *out_ << "{\"name\":\"" << name << "\"," + << "\"ph\": \"C\"," << "\"pid\":" << pid << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{ "; @@ -720,13 +733,15 @@ void AsyncLog::LogDetail(const std::string& key, const T& value, } auto time_ns = (log_detail_time_ - log_origin_).count(); for (auto os : detail_streams) { - *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", " + *os << ":::MLLOG {" + << "\"key\": " << ArgValueTransform(key) << ", " << "\"value\": " << ArgValueTransform(value) << ", " << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "." 
<< std::setfill('0') << std::setw(6) << ArgValueTransform(time_ns % 1000000ULL) << ", " << "\"namespace\": \"mlperf::logging\", " - << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {" + << "\"event_type\": \"POINT_IN_TIME\", " + << "\"metadata\": {" << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", " << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", " << "\"file\": \"" << file_name << "\", " @@ -755,9 +770,9 @@ void AsyncLog::LogDetail(const std::string& message, const Args... args) { detail_streams.pop_back(); } for (auto os : detail_streams) { - *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_ - << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count() - << "ns : "; + *os << "\"pid\": " << current_pid_ << ", " + << "\"tid\": " << current_tid_ << ", " + << "\"ts\": " << (log_detail_time_ - log_origin_).count() << "ns : "; if (error_flagged_) { *os << "ERROR : "; } else if (warning_flagged_) { From 1f2666cd44ba03b612744a4d76d6b7e7ee7b6ca0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Sep 2025 19:24:45 +0000 Subject: [PATCH 002/165] [Automated Commit] Format Codebase --- language/deepseek-r1/eval_accuracy.py | 2 +- language/llama3.1-8b/download_cnndm.py | 6 +-- loadgen/logging.h | 51 +++++++++----------------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index bf537e9d3a..9c103fdcba 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -773,7 +773,7 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } - + print("\nResults\n") print(results) diff --git a/language/llama3.1-8b/download_cnndm.py b/language/llama3.1-8b/download_cnndm.py index d8694be720..90c9ad8d7a 100644 --- a/language/llama3.1-8b/download_cnndm.py +++ b/language/llama3.1-8b/download_cnndm.py @@ -100,8 +100,8 @@ def preprocess_function(sample, padding="max_length"): # create list of samples inputs = [] - #print(f"Num samples: {len(sample[text_column])}") - #for i in range(0, len(sample[text_column])): + # print(f"Num samples: {len(sample[text_column])}") + # for i in range(0, len(sample[text_column])): x = dict() x["instruction"] = instruction_template x["input"] = sample[text_column] @@ -109,7 +109,7 @@ def preprocess_function(sample, padding="max_length"): instruction_template[instruction].format_map(x) ) x["output"] = sample[summary_column] - #inputs.append(x) + # inputs.append(x) model_inputs = dict() model_inputs["text"] = x diff --git a/loadgen/logging.h b/loadgen/logging.h index c74a7efba2..8f1a398e9d 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -119,13 +119,10 @@ class ChromeTracer { void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid, PerfClock::time_point start, PerfClock::time_point end, const Args... 
args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"ph\":\"X\"," - << "\"pid\":" << pid << "," - << "\"tid\":" << tid << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\"," + << "\"pid\":" << pid << "," << "\"tid\":" << tid << "," << "\"ts\":" << Micros(start - origin_).count() << "," - << "\"dur\":" << Micros(end - start).count() << "," - << "\"args\":{"; + << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; } @@ -133,12 +130,9 @@ class ChromeTracer { template void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"b\"," - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -147,12 +141,9 @@ class ChromeTracer { template void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"n\"," - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -161,19 +152,15 @@ class ChromeTracer { template void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"e\", " - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "},\n"; + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n"; } template void AddCounterEvent(const std::string& name, uint64_t pid, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"ph\": \"C\"," + *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\"," << "\"pid\":" << pid << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{ "; @@ -734,15 +721,13 @@ void AsyncLog::LogDetail(const std::string& key, const T& value, } auto time_ns = (log_detail_time_ - log_origin_).count(); for (auto os : detail_streams) { - *os << ":::MLLOG {" - << "\"key\": " << ArgValueTransform(key) << ", " + *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", " << "\"value\": " << ArgValueTransform(value) << ", " << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "." 
<< std::setfill('0') << std::setw(6) << ArgValueTransform(time_ns % 1000000ULL) << ", " << "\"namespace\": \"mlperf::logging\", " - << "\"event_type\": \"POINT_IN_TIME\", " - << "\"metadata\": {" + << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {" << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", " << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", " << "\"file\": \"" << file_name << "\", " @@ -771,9 +756,9 @@ void AsyncLog::LogDetail(const std::string& message, const Args... args) { detail_streams.pop_back(); } for (auto os : detail_streams) { - *os << "\"pid\": " << current_pid_ << ", " - << "\"tid\": " << current_tid_ << ", " - << "\"ts\": " << (log_detail_time_ - log_origin_).count() << "ns : "; + *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_ + << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count() + << "ns : "; if (error_flagged_) { *os << "ERROR : "; } else if (warning_flagged_) { From f9c4f615165c5d2cfa02c7569bf0bac1a3d85ec9 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 01:35:03 -0700 Subject: [PATCH 003/165] initial --- .gitmodules | 4 + language/gpt-oss/README.md | 18 +++ language/gpt-oss/README_send_requests.md | 58 +++++++ language/gpt-oss/deps/sglang | 1 + language/gpt-oss/requirements.txt | 2 + language/gpt-oss/send_requests.py | 195 +++++++++++++++++++++++ 6 files changed, 278 insertions(+) create mode 100644 language/gpt-oss/README.md create mode 100644 language/gpt-oss/README_send_requests.md create mode 160000 language/gpt-oss/deps/sglang create mode 100644 language/gpt-oss/requirements.txt create mode 100755 language/gpt-oss/send_requests.py diff --git a/.gitmodules b/.gitmodules index 7f0d5d302a..d177d01f7c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ [submodule "language/deepseek-r1/submodules/LiveCodeBench"] path = language/deepseek-r1/submodules/LiveCodeBench url = https://github.com/LiveCodeBench/LiveCodeBench +[submodule "language/gpt-oss/deps/sglang"] + path = language/gpt-oss/deps/sglang + url = git@github.com:sgl-project/sglang.git + branch = main diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md new file mode 100644 index 0000000000..3b2386c3d6 --- /dev/null +++ b/language/gpt-oss/README.md @@ -0,0 +1,18 @@ +# MLPerf Inference reference implementation for GPT-OSS-120B +This is the reference implementation for GPT-OSS-120B. This is a proposal and is a WIP. + +## Model and Dataset download + +* Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) +* Dataset: For now, we are re-using the `deepseek-r1` dataset. (TODO @shobhitv: Add instructions) + +## Harmony format ?? + +## Running the reference implementation: SGLang +[`SGLang`](https://github.com/sgl-project/sglang) is the framework of choice to run the reference implementation. + +### Fetch the docker image +SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1` + +### Enroot +TODO: Add steps \ No newline at end of file diff --git a/language/gpt-oss/README_send_requests.md b/language/gpt-oss/README_send_requests.md new file mode 100644 index 0000000000..0f2fabf5e7 --- /dev/null +++ b/language/gpt-oss/README_send_requests.md @@ -0,0 +1,58 @@ +# SGLang Request Sender + +This script sends preprocessed deepseek-r1 requests to a running SGLang server. 
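## Prerequisites

An SGLang server must already be running and serving the model before this script is used. A minimal launch command, matching the one suggested by the script's connection-failure message (8-way tensor parallelism assumed; adjust to your hardware), might look like:

```bash
# Hypothetical example: flags other than --model-path depend on your GPU setup
python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8
```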
+ +## Usage + +```bash +python send_requests.py --server-url http://localhost:30000 +``` + +### Process only first 100 samples: +```bash +python send_requests.py --max-samples 100 --auto-detect +``` + +### Custom output file and max tokens: +```bash +python send_requests.py --output my_responses.jsonl --max-tokens 50 --auto-detect +``` + +## Arguments + +- `--data-dir`: Directory containing preprocessed data (default: `/home/mlperf_inference_storage/preprocessed_data/deepseek-r1/`) +- `--server-url`: SGLang server URL (e.g., `http://localhost:30000`) +- `--max-samples`: Maximum number of samples to process (default: all 4388 samples) +- `--max-tokens`: Maximum tokens to generate per request (default: 100) +- `--output`: Output file for responses (default: `responses.jsonl`) +- `--auto-detect`: Auto-detect server port + +## Output Format + +The script outputs a JSONL file where each line contains: +```json +{ + "sample_id": 0, + "input_length": 1283, + "input_tokens": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "response": { + "choices": [...], + "usage": {...} + }, + "timestamp": 1695821234.567 +} +``` + +## Requirements + +Install dependencies: +```bash +pip install -r requirements.txt +``` + +## Notes + +- The script automatically trims padding from input sequences based on actual lengths +- It tries multiple request formats to ensure compatibility with SGLang +- Responses are saved incrementally to avoid data loss +- Progress is logged every 10 samples diff --git a/language/gpt-oss/deps/sglang b/language/gpt-oss/deps/sglang new file mode 160000 index 0000000000..172bcf0152 --- /dev/null +++ b/language/gpt-oss/deps/sglang @@ -0,0 +1 @@ +Subproject commit 172bcf0152d0d4e4fdd12d7b5c47195ea2aa28ac diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt new file mode 100644 index 0000000000..8a404a0ad1 --- /dev/null +++ b/language/gpt-oss/requirements.txt @@ -0,0 +1,2 @@ +numpy>=1.21.0 +requests>=2.25.0 diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py new file mode 100755 index 0000000000..a5acd0bf12 --- /dev/null +++ b/language/gpt-oss/send_requests.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Script to send preprocessed deepseek-r1 requests to SGLang server. +""" + +import numpy as np +import requests +import json +import time +import argparse +from typing import List, Dict, Any +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class SGLangClient: + def __init__(self, base_url: str = "http://localhost:30000"): + self.base_url = base_url + self.session = requests.Session() + + def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: + """Send a single request to the SGLang server.""" + # Try different payload formats for SGLang + payloads_to_try = [ + # Format 1: Direct token IDs in messages + { + "model": "gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": input_ids + } + ], + "max_tokens": max_tokens, + "temperature": 0.0, + "stream": False + }, + # Format 2: Text-based content + { + "model": "gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": f"Token IDs: {input_ids[:10]}..." 
# Truncate for display + } + ], + "max_tokens": max_tokens, + "temperature": 0.0, + "stream": False + }, + # Format 3: SGLang specific format + { + "text": input_ids, + "sampling_params": { + "max_new_tokens": max_tokens, + "temperature": 0.0 + } + } + ] + + endpoints_to_try = [ + "/v1/chat/completions", + "/generate", + "/v1/completions" + ] + + for payload in payloads_to_try: + for endpoint in endpoints_to_try: + try: + response = self.session.post( + f"{self.base_url}{endpoint}", + json=payload, + timeout=60 + ) + if response.status_code == 200: + return response.json() + else: + logger.debug(f"Endpoint {endpoint} returned {response.status_code}") + except requests.exceptions.RequestException as e: + logger.debug(f"Request to {endpoint} failed: {e}") + continue + + return {"error": "All request formats failed"} + +def load_preprocessed_data(data_dir: str) -> tuple: + """Load the preprocessed data files.""" + input_ids_path = f"{data_dir}/input_ids_padded.npy" + input_lens_path = f"{data_dir}/input_lens.npy" + + logger.info(f"Loading data from {data_dir}") + input_ids = np.load(input_ids_path) + input_lens = np.load(input_lens_path) + + logger.info(f"Loaded {len(input_ids)} samples") + logger.info(f"Input shape: {input_ids.shape}") + logger.info(f"Lengths range: {input_lens.min()} - {input_lens.max()}") + + return input_ids, input_lens + +def trim_padding(input_ids: np.ndarray, actual_length: int) -> List[int]: + """Trim padding from input_ids based on actual length.""" + return input_ids[:actual_length].tolist() + +def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.ndarray, + max_samples: int = None, max_tokens: int = 100, + output_file: str = "responses.jsonl") -> None: + """Send requests to SGLang server and save responses.""" + + num_samples = min(len(input_ids), max_samples) if max_samples else len(input_ids) + logger.info(f"Sending {num_samples} requests") + + responses = [] + start_time = time.time() + + with open(output_file, 'w') as f: + for i in range(num_samples): + # Trim padding based on actual length + actual_length = input_lens[i] + trimmed_input = trim_padding(input_ids[i], actual_length) + + logger.info(f"Processing sample {i+1}/{num_samples} (length: {actual_length})") + + # Send request + response = client.send_request(trimmed_input, max_tokens=max_tokens) + + # Prepare response data + response_data = { + "sample_id": i, + "input_length": actual_length, + "input_tokens": trimmed_input[:10], # First 10 tokens for reference + "response": response, + "timestamp": time.time() + } + + # Save to file immediately + f.write(json.dumps(response_data) + '\n') + f.flush() + + responses.append(response_data) + + # Log progress + if (i + 1) % 10 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + logger.info(f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") + + total_time = time.time() - start_time + logger.info(f"Completed {num_samples} requests in {total_time:.2f} seconds") + logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") + logger.info(f"Responses saved to {output_file}") + +def main(): + parser = argparse.ArgumentParser(description="Send preprocessed requests to SGLang server") + parser.add_argument("--data-dir", default="/home/mlperf_inference_storage/preprocessed_data/deepseek-r1/", + help="Directory containing preprocessed data") + parser.add_argument("--server-url", default="http://localhost:30000", + help="SGLang server URL (default: http://localhost:30000)") + 
parser.add_argument("--max-samples", type=int, default=None, + help="Maximum number of samples to process (default: all)") + parser.add_argument("--max-tokens", type=int, default=100, + help="Maximum tokens to generate per request") + parser.add_argument("--output", default="responses.jsonl", + help="Output file for responses") + + args = parser.parse_args() + + # Determine server URL + server_url = args.server_url + + # Load data + input_ids, input_lens = load_preprocessed_data(args.data_dir) + + # Create client + client = SGLangClient(server_url) + + # Test connection + logger.info(f"Testing server connection to {server_url}...") + test_response = client.send_request([1, 2, 3], max_tokens=5) + if "error" in test_response: + logger.error(f"Server connection failed: {test_response['error']}") + logger.error("Make sure your SGLang server is running. Try:") + logger.error(" python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") + return + logger.info("Server connection successful") + + # Send requests + send_requests(client, input_ids, input_lens, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + output_file=args.output) + +if __name__ == "__main__": + main() From db9d25e5c00f453527509d17836f0ded89bd56c1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Sep 2025 08:35:41 +0000 Subject: [PATCH 004/165] [Automated Commit] Format Codebase --- language/gpt-oss/send_requests.py | 118 +++++++++++++++++------------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index a5acd0bf12..5ddc567df8 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -12,15 +12,19 @@ import logging # Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) + class SGLangClient: def __init__(self, base_url: str = "http://localhost:30000"): self.base_url = base_url self.session = requests.Session() - - def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: + + def send_request( + self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: """Send a single request to the SGLang server.""" # Try different payload formats for SGLang payloads_to_try = [ @@ -29,7 +33,7 @@ def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, "model": "gpt-oss-120b", "messages": [ { - "role": "user", + "role": "user", "content": input_ids } ], @@ -42,8 +46,9 @@ def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, "model": "gpt-oss-120b", "messages": [ { - "role": "user", - "content": f"Token IDs: {input_ids[:10]}..." # Truncate for display + "role": "user", + # Truncate for display + "content": f"Token IDs: {input_ids[:10]}..." 
} ], "max_tokens": max_tokens, @@ -59,13 +64,13 @@ def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, } } ] - + endpoints_to_try = [ "/v1/chat/completions", "/generate", "/v1/completions" ] - + for payload in payloads_to_try: for endpoint in endpoints_to_try: try: @@ -77,119 +82,134 @@ def send_request(self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, if response.status_code == 200: return response.json() else: - logger.debug(f"Endpoint {endpoint} returned {response.status_code}") + logger.debug( + f"Endpoint {endpoint} returned {response.status_code}") except requests.exceptions.RequestException as e: logger.debug(f"Request to {endpoint} failed: {e}") continue - + return {"error": "All request formats failed"} + def load_preprocessed_data(data_dir: str) -> tuple: """Load the preprocessed data files.""" input_ids_path = f"{data_dir}/input_ids_padded.npy" input_lens_path = f"{data_dir}/input_lens.npy" - + logger.info(f"Loading data from {data_dir}") input_ids = np.load(input_ids_path) input_lens = np.load(input_lens_path) - + logger.info(f"Loaded {len(input_ids)} samples") logger.info(f"Input shape: {input_ids.shape}") logger.info(f"Lengths range: {input_lens.min()} - {input_lens.max()}") - + return input_ids, input_lens + def trim_padding(input_ids: np.ndarray, actual_length: int) -> List[int]: """Trim padding from input_ids based on actual length.""" return input_ids[:actual_length].tolist() -def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.ndarray, - max_samples: int = None, max_tokens: int = 100, - output_file: str = "responses.jsonl") -> None: + +def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.ndarray, + max_samples: int = None, max_tokens: int = 100, + output_file: str = "responses.jsonl") -> None: """Send requests to SGLang server and save responses.""" - - num_samples = min(len(input_ids), max_samples) if max_samples else len(input_ids) + + num_samples = min( + len(input_ids), + max_samples) if max_samples else len(input_ids) logger.info(f"Sending {num_samples} requests") - + responses = [] start_time = time.time() - + with open(output_file, 'w') as f: for i in range(num_samples): # Trim padding based on actual length actual_length = input_lens[i] trimmed_input = trim_padding(input_ids[i], actual_length) - - logger.info(f"Processing sample {i+1}/{num_samples} (length: {actual_length})") - + + logger.info( + f"Processing sample {i+1}/{num_samples} (length: {actual_length})") + # Send request - response = client.send_request(trimmed_input, max_tokens=max_tokens) - + response = client.send_request( + trimmed_input, max_tokens=max_tokens) + # Prepare response data response_data = { "sample_id": i, "input_length": actual_length, - "input_tokens": trimmed_input[:10], # First 10 tokens for reference + # First 10 tokens for reference + "input_tokens": trimmed_input[:10], "response": response, "timestamp": time.time() } - + # Save to file immediately f.write(json.dumps(response_data) + '\n') f.flush() - + responses.append(response_data) - + # Log progress if (i + 1) % 10 == 0: elapsed = time.time() - start_time rate = (i + 1) / elapsed - logger.info(f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") - + logger.info( + f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") + total_time = time.time() - start_time - logger.info(f"Completed {num_samples} requests in {total_time:.2f} seconds") + logger.info( + f"Completed {num_samples} requests in {total_time:.2f} 
seconds") logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") logger.info(f"Responses saved to {output_file}") + def main(): - parser = argparse.ArgumentParser(description="Send preprocessed requests to SGLang server") + parser = argparse.ArgumentParser( + description="Send preprocessed requests to SGLang server") parser.add_argument("--data-dir", default="/home/mlperf_inference_storage/preprocessed_data/deepseek-r1/", - help="Directory containing preprocessed data") + help="Directory containing preprocessed data") parser.add_argument("--server-url", default="http://localhost:30000", - help="SGLang server URL (default: http://localhost:30000)") + help="SGLang server URL (default: http://localhost:30000)") parser.add_argument("--max-samples", type=int, default=None, - help="Maximum number of samples to process (default: all)") + help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, - help="Maximum tokens to generate per request") + help="Maximum tokens to generate per request") parser.add_argument("--output", default="responses.jsonl", - help="Output file for responses") - + help="Output file for responses") + args = parser.parse_args() - + # Determine server URL server_url = args.server_url - + # Load data input_ids, input_lens = load_preprocessed_data(args.data_dir) - + # Create client client = SGLangClient(server_url) - + # Test connection logger.info(f"Testing server connection to {server_url}...") test_response = client.send_request([1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") logger.error("Make sure your SGLang server is running. Try:") - logger.error(" python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") + logger.error( + " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") return logger.info("Server connection successful") - + # Send requests - send_requests(client, input_ids, input_lens, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - output_file=args.output) + send_requests(client, input_ids, input_lens, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + output_file=args.output) + if __name__ == "__main__": main() From 9daa72c9ca662be111f0f335bb4bc784fb157f0d Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 02:56:01 -0700 Subject: [PATCH 005/165] json fixes --- language/gpt-oss/send_requests.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 5ddc567df8..e947bd7194 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -109,7 +109,7 @@ def load_preprocessed_data(data_dir: str) -> tuple: def trim_padding(input_ids: np.ndarray, actual_length: int) -> List[int]: """Trim padding from input_ids based on actual length.""" - return input_ids[:actual_length].tolist() + return input_ids[:actual_length].astype(int).tolist() def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.ndarray, @@ -140,12 +140,11 @@ def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.nd # Prepare response data response_data = { - "sample_id": i, - "input_length": actual_length, - # First 10 tokens for reference - "input_tokens": trimmed_input[:10], + "sample_id": int(i), + "input_length": int(actual_length), + "input_tokens": 
trimmed_input[:10], # First 10 tokens for reference "response": response, - "timestamp": time.time() + "timestamp": float(time.time()) } # Save to file immediately From 2d0a179795ea0ac01c3e84f1ec9f337cdd75ef2d Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 03:37:44 -0700 Subject: [PATCH 006/165] updates, tokenizer --- language/gpt-oss/README_send_requests.md | 37 ++- language/gpt-oss/requirements.txt | 3 + language/gpt-oss/send_requests.py | 307 ++++++++++++----------- 3 files changed, 192 insertions(+), 155 deletions(-) diff --git a/language/gpt-oss/README_send_requests.md b/language/gpt-oss/README_send_requests.md index 0f2fabf5e7..4181e607e2 100644 --- a/language/gpt-oss/README_send_requests.md +++ b/language/gpt-oss/README_send_requests.md @@ -1,31 +1,37 @@ # SGLang Request Sender -This script sends preprocessed deepseek-r1 requests to a running SGLang server. +This script loads text data from a pickle file, tokenizes it using a specified model's tokenizer, sends requests to a running SGLang server, and converts responses back to text. ## Usage +### Basic usage: ```bash -python send_requests.py --server-url http://localhost:30000 +python send_requests.py --model-name openai/gpt-oss-120b ``` ### Process only first 100 samples: ```bash -python send_requests.py --max-samples 100 --auto-detect +python send_requests.py --model-name openai/gpt-oss-120b --max-samples 100 ``` -### Custom output file and max tokens: +### Custom data file and output: ```bash -python send_requests.py --output my_responses.jsonl --max-tokens 50 --auto-detect +python send_requests.py --model-name openai/gpt-oss-120b --data-file /path/to/data.pkl --output my_responses.jsonl +``` + +### Custom max tokens and server URL: +```bash +python send_requests.py --model-name openai/gpt-oss-120b --max-tokens 50 --server-url http://localhost:8000 ``` ## Arguments -- `--data-dir`: Directory containing preprocessed data (default: `/home/mlperf_inference_storage/preprocessed_data/deepseek-r1/`) -- `--server-url`: SGLang server URL (e.g., `http://localhost:30000`) -- `--max-samples`: Maximum number of samples to process (default: all 4388 samples) +- `--data-file`: Path to pickle file containing text data (default: `/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl`) +- `--model-name`: Model name for tokenizer (required, e.g., `openai/gpt-oss-120b`) +- `--server-url`: SGLang server URL (default: `http://localhost:30000`) +- `--max-samples`: Maximum number of samples to process (default: all) - `--max-tokens`: Maximum tokens to generate per request (default: 100) - `--output`: Output file for responses (default: `responses.jsonl`) -- `--auto-detect`: Auto-detect server port ## Output Format @@ -33,12 +39,15 @@ The script outputs a JSONL file where each line contains: ```json { "sample_id": 0, + "text_input": "Here are some example problems...", "input_length": 1283, + "token_length": 512, "input_tokens": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "response": { - "choices": [...], + "generated_text": [1, 2, 3, ...], "usage": {...} }, + "response_text": "The answer is 6...", "timestamp": 1695821234.567 } ``` @@ -52,7 +61,9 @@ pip install -r requirements.txt ## Notes -- The script automatically trims padding from input sequences based on actual lengths -- It tries multiple request formats to ensure compatibility with SGLang +- The script loads text data from a pandas DataFrame in the pickle file +- It uses the specified model's tokenizer to convert text to tokens +- Sends tokenized 
input to SGLang server via `/generate` endpoint +- Converts response tokens back to text using the same tokenizer - Responses are saved incrementally to avoid data loss -- Progress is logged every 10 samples +- Progress is logged every 10 samples \ No newline at end of file diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index 8a404a0ad1..3096b113f6 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -1,2 +1,5 @@ numpy>=1.21.0 +pandas>=1.3.0 requests>=2.25.0 +transformers>=4.20.0 +torch>=1.12.0 diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index e947bd7194..0f830e2bef 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -1,15 +1,18 @@ #!/usr/bin/env python3 """ -Script to send preprocessed deepseek-r1 requests to SGLang server. +Script to send text requests to SGLang server with tokenization. """ import numpy as np +import pandas as pd +import pickle import requests import json import time import argparse from typing import List, Dict, Any import logging +from transformers import AutoTokenizer # Set up logging logging.basicConfig( @@ -26,152 +29,172 @@ def __init__(self, base_url: str = "http://localhost:30000"): def send_request( self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: """Send a single request to the SGLang server.""" - # Try different payload formats for SGLang - payloads_to_try = [ - # Format 1: Direct token IDs in messages - { - "model": "gpt-oss-120b", - "messages": [ - { - "role": "user", - "content": input_ids - } - ], - "max_tokens": max_tokens, - "temperature": 0.0, - "stream": False - }, - # Format 2: Text-based content - { - "model": "gpt-oss-120b", - "messages": [ - { - "role": "user", - # Truncate for display - "content": f"Token IDs: {input_ids[:10]}..." 
- } - ], - "max_tokens": max_tokens, - "temperature": 0.0, - "stream": False - }, - # Format 3: SGLang specific format - { - "text": input_ids, - "sampling_params": { - "max_new_tokens": max_tokens, - "temperature": 0.0 - } + # SGLang format with input_ids + payload = { + "input_ids": input_ids, + "sampling_params": { + "max_new_tokens": max_tokens, + "temperature": 0.0 } - ] - - endpoints_to_try = [ - "/v1/chat/completions", - "/generate", - "/v1/completions" - ] - - for payload in payloads_to_try: - for endpoint in endpoints_to_try: - try: - response = self.session.post( - f"{self.base_url}{endpoint}", - json=payload, - timeout=60 - ) - if response.status_code == 200: - return response.json() - else: - logger.debug( - f"Endpoint {endpoint} returned {response.status_code}") - except requests.exceptions.RequestException as e: - logger.debug(f"Request to {endpoint} failed: {e}") - continue - - return {"error": "All request formats failed"} - - -def load_preprocessed_data(data_dir: str) -> tuple: - """Load the preprocessed data files.""" - input_ids_path = f"{data_dir}/input_ids_padded.npy" - input_lens_path = f"{data_dir}/input_lens.npy" - - logger.info(f"Loading data from {data_dir}") - input_ids = np.load(input_ids_path) - input_lens = np.load(input_lens_path) - - logger.info(f"Loaded {len(input_ids)} samples") - logger.info(f"Input shape: {input_ids.shape}") - logger.info(f"Lengths range: {input_lens.min()} - {input_lens.max()}") - - return input_ids, input_lens - - -def trim_padding(input_ids: np.ndarray, actual_length: int) -> List[int]: - """Trim padding from input_ids based on actual length.""" - return input_ids[:actual_length].astype(int).tolist() - - -def send_requests(client: SGLangClient, input_ids: np.ndarray, input_lens: np.ndarray, - max_samples: int = None, max_tokens: int = 100, - output_file: str = "responses.jsonl") -> None: - """Send requests to SGLang server and save responses.""" - - num_samples = min( - len(input_ids), - max_samples) if max_samples else len(input_ids) - logger.info(f"Sending {num_samples} requests") - + } + + try: + response = self.session.post( + f"{self.base_url}/generate", + json=payload, + timeout=60 + ) + if response.status_code == 200: + return response.json() + else: + logger.error(f"Request failed with status {response.status_code}: {response.text}") + return {"error": f"HTTP {response.status_code}: {response.text}"} + except requests.exceptions.RequestException as e: + logger.error(f"Request failed: {e}") + return {"error": str(e)} + + +def load_text_data(data_file: str) -> pd.DataFrame: + """Load the text data from pickle file.""" + logger.info(f"Loading data from {data_file}") + with open(data_file, 'rb') as f: + data = pickle.load(f) + + logger.info(f"Loaded {len(data)} samples") + logger.info(f"Columns: {list(data.columns)}") + logger.info(f"First text input length: {len(data.iloc[0]['text_input'])}") + + return data + + +def load_tokenizer(model_name: str): + """Load tokenizer for the specified model.""" + logger.info(f"Loading tokenizer for {model_name}") + try: + tokenizer = AutoTokenizer.from_pretrained(model_name) + logger.info(f"Tokenizer loaded successfully") + return tokenizer + except Exception as e: + logger.error(f"Failed to load tokenizer: {e}") + raise + + +def tokenize_all_inputs(data: pd.DataFrame, tokenizer, max_samples: int = None): + """Tokenize all text inputs at once.""" + num_samples = min(len(data), max_samples) if max_samples else len(data) + logger.info(f"Tokenizing {num_samples} text inputs...") + + text_inputs = 
data['text_input'].tolist()[:num_samples] + + # Tokenize all texts at once + tokenized = tokenizer(text_inputs, return_tensors="pt", padding=False, truncation=True) + input_ids_list = [tokenized['input_ids'][i].tolist() for i in range(num_samples)] + + logger.info(f"Tokenization complete. Token lengths: {[len(ids) for ids in input_ids_list[:5]]}...") + return input_ids_list, text_inputs + + +def send_requests_batch(client: SGLangClient, input_ids_list: List[List[int]], + text_inputs: List[str], max_tokens: int = 100) -> List[Dict[str, Any]]: + """Send all requests to SGLang server.""" + num_samples = len(input_ids_list) + logger.info(f"Sending {num_samples} requests to server...") + responses = [] start_time = time.time() - + + for i, input_ids in enumerate(input_ids_list): + logger.info(f"Processing sample {i+1}/{num_samples} (token length: {len(input_ids)})") + + # Send request + response = client.send_request(input_ids, max_tokens=max_tokens) + responses.append(response) + + # Log progress + if (i + 1) % 10 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + logger.info(f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") + + total_time = time.time() - start_time + logger.info(f"Completed {num_samples} requests in {total_time:.2f} seconds") + logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") + + return responses + + +def detokenize_all_responses(responses: List[Dict[str, Any]], input_ids_list: List[List[int]], + tokenizer) -> List[str]: + """Detokenize all responses at once.""" + logger.info("Detokenizing responses...") + + response_texts = [] + for i, (response, input_ids) in enumerate(zip(responses, input_ids_list)): + response_text = "" + if "error" not in response and "generated_text" in response: + try: + # Extract generated tokens (excluding input tokens) + generated_tokens = response["generated_text"][len(input_ids):] + response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) + except Exception as e: + logger.warning(f"Failed to decode response for sample {i+1}: {e}") + response_texts.append(response_text) + + logger.info("Detokenization complete") + return response_texts + + +def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], + text_inputs: List[str], input_ids_list: List[List[int]], + output_file: str) -> None: + """Save all responses to file.""" + logger.info(f"Saving responses to {output_file}...") + with open(output_file, 'w') as f: - for i in range(num_samples): - # Trim padding based on actual length - actual_length = input_lens[i] - trimmed_input = trim_padding(input_ids[i], actual_length) - - logger.info( - f"Processing sample {i+1}/{num_samples} (length: {actual_length})") - - # Send request - response = client.send_request( - trimmed_input, max_tokens=max_tokens) - - # Prepare response data + for i, (response, response_text, text_input, input_ids) in enumerate( + zip(responses, response_texts, text_inputs, input_ids_list)): + response_data = { "sample_id": int(i), - "input_length": int(actual_length), - "input_tokens": trimmed_input[:10], # First 10 tokens for reference + "text_input": text_input[:200] + "..." 
if len(text_input) > 200 else text_input, + "input_length": len(text_input), + "token_length": len(input_ids), + "input_tokens": input_ids[:10], # First 10 tokens for reference "response": response, + "response_text": response_text, "timestamp": float(time.time()) } - - # Save to file immediately + f.write(json.dumps(response_data) + '\n') - f.flush() - - responses.append(response_data) + + logger.info(f"Responses saved to {output_file}") - # Log progress - if (i + 1) % 10 == 0: - elapsed = time.time() - start_time - rate = (i + 1) / elapsed - logger.info( - f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") - total_time = time.time() - start_time - logger.info( - f"Completed {num_samples} requests in {total_time:.2f} seconds") - logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") - logger.info(f"Responses saved to {output_file}") +def process_requests(client: SGLangClient, data: pd.DataFrame, tokenizer, + max_samples: int = None, max_tokens: int = 100, + output_file: str = "responses.jsonl") -> None: + """Main processing function that handles tokenization, requests, and detokenization.""" + + # Step 1: Tokenize all inputs + input_ids_list, text_inputs = tokenize_all_inputs(data, tokenizer, max_samples) + + # Step 2: Send all requests + responses = send_requests_batch(client, input_ids_list, text_inputs, max_tokens) + + # Step 3: Detokenize all responses + response_texts = detokenize_all_responses(responses, input_ids_list, tokenizer) + + # Step 4: Save all results + save_responses(responses, response_texts, text_inputs, input_ids_list, output_file) def main(): parser = argparse.ArgumentParser( - description="Send preprocessed requests to SGLang server") - parser.add_argument("--data-dir", default="/home/mlperf_inference_storage/preprocessed_data/deepseek-r1/", - help="Directory containing preprocessed data") + description="Send text requests to SGLang server with tokenization") + parser.add_argument("--data-file", default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl", + help="Path to pickle file containing text data") + parser.add_argument("--model-name", required=True, + help="Model name for tokenizer (e.g., openai/gpt-oss-120b)") parser.add_argument("--server-url", default="http://localhost:30000", help="SGLang server URL (default: http://localhost:30000)") parser.add_argument("--max-samples", type=int, default=None, @@ -183,17 +206,17 @@ def main(): args = parser.parse_args() - # Determine server URL - server_url = args.server_url - # Load data - input_ids, input_lens = load_preprocessed_data(args.data_dir) + data = load_text_data(args.data_file) + + # Load tokenizer + tokenizer = load_tokenizer(args.model_name) # Create client - client = SGLangClient(server_url) + client = SGLangClient(args.server_url) # Test connection - logger.info(f"Testing server connection to {server_url}...") + logger.info(f"Testing server connection to {args.server_url}...") test_response = client.send_request([1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") @@ -203,11 +226,11 @@ def main(): return logger.info("Server connection successful") - # Send requests - send_requests(client, input_ids, input_lens, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - output_file=args.output) + # Process all requests in batches + process_requests(client, data, tokenizer, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + 
output_file=args.output) if __name__ == "__main__": From c8e679dd01f12404756c2b2d4d2d46ba8cefa1f6 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 03:43:47 -0700 Subject: [PATCH 007/165] fix padding --- language/gpt-oss/send_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 0f830e2bef..b11e1a23c6 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -87,7 +87,7 @@ def tokenize_all_inputs(data: pd.DataFrame, tokenizer, max_samples: int = None): text_inputs = data['text_input'].tolist()[:num_samples] # Tokenize all texts at once - tokenized = tokenizer(text_inputs, return_tensors="pt", padding=False, truncation=True) + tokenized = tokenizer(text_inputs, return_tensors="pt", padding=True, truncation=True) input_ids_list = [tokenized['input_ids'][i].tolist() for i in range(num_samples)] logger.info(f"Tokenization complete. Token lengths: {[len(ids) for ids in input_ids_list[:5]]}...") From 50453c60075a01f684f5e991fcb3165b34084d74 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 03:57:16 -0700 Subject: [PATCH 008/165] concurrent requests --- language/gpt-oss/README_send_requests.md | 14 ++++- language/gpt-oss/send_requests.py | 73 +++++++++++++++--------- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/language/gpt-oss/README_send_requests.md b/language/gpt-oss/README_send_requests.md index 4181e607e2..115ece573f 100644 --- a/language/gpt-oss/README_send_requests.md +++ b/language/gpt-oss/README_send_requests.md @@ -24,6 +24,11 @@ python send_requests.py --model-name openai/gpt-oss-120b --data-file /path/to/da python send_requests.py --model-name openai/gpt-oss-120b --max-tokens 50 --server-url http://localhost:8000 ``` +### Custom concurrency level: +```bash +python send_requests.py --model-name openai/gpt-oss-120b --max-concurrency 64 +``` + ## Arguments - `--data-file`: Path to pickle file containing text data (default: `/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl`) @@ -31,6 +36,7 @@ python send_requests.py --model-name openai/gpt-oss-120b --max-tokens 50 --serve - `--server-url`: SGLang server URL (default: `http://localhost:30000`) - `--max-samples`: Maximum number of samples to process (default: all) - `--max-tokens`: Maximum tokens to generate per request (default: 100) +- `--max-concurrency`: Maximum number of concurrent requests (default: 128) - `--output`: Output file for responses (default: `responses.jsonl`) ## Output Format @@ -63,7 +69,9 @@ pip install -r requirements.txt - The script loads text data from a pandas DataFrame in the pickle file - It uses the specified model's tokenizer to convert text to tokens -- Sends tokenized input to SGLang server via `/generate` endpoint +- Sends tokenized input to SGLang server via `/generate` endpoint in parallel using multiprocessing - Converts response tokens back to text using the same tokenizer -- Responses are saved incrementally to avoid data loss -- Progress is logged every 10 samples \ No newline at end of file +- Uses configurable concurrency (default: 128 concurrent requests) +- Each process creates its own HTTP client to avoid connection issues +- Results are maintained in order despite parallel processing +- Progress is logged during processing \ No newline at end of file diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index b11e1a23c6..493a201c56 100755 
--- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -13,6 +13,8 @@ from typing import List, Dict, Any import logging from transformers import AutoTokenizer +from multiprocessing import Pool +from functools import partial # Set up logging logging.basicConfig( @@ -94,27 +96,43 @@ def tokenize_all_inputs(data: pd.DataFrame, tokenizer, max_samples: int = None): return input_ids_list, text_inputs -def send_requests_batch(client: SGLangClient, input_ids_list: List[List[int]], - text_inputs: List[str], max_tokens: int = 100) -> List[Dict[str, Any]]: - """Send all requests to SGLang server.""" +def send_single_request(args_tuple): + """Send a single request - used by multiprocessing pool.""" + input_ids, max_tokens, server_url, sample_id = args_tuple + + # Create a new client for this process + client = SGLangClient(server_url) + + try: + response = client.send_request(input_ids, max_tokens=max_tokens) + return sample_id, response + except Exception as e: + logger.error(f"Request {sample_id} failed: {e}") + return sample_id, {"error": str(e)} + + +def send_requests_parallel(input_ids_list: List[List[int]], server_url: str, + max_tokens: int = 100, max_concurrency: int = 128) -> List[Dict[str, Any]]: + """Send all requests to SGLang server in parallel using multiprocessing.""" num_samples = len(input_ids_list) - logger.info(f"Sending {num_samples} requests to server...") + logger.info(f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") + + # Prepare arguments for multiprocessing + args_list = [ + (input_ids, max_tokens, server_url, i) + for i, input_ids in enumerate(input_ids_list) + ] - responses = [] start_time = time.time() - for i, input_ids in enumerate(input_ids_list): - logger.info(f"Processing sample {i+1}/{num_samples} (token length: {len(input_ids)})") - - # Send request - response = client.send_request(input_ids, max_tokens=max_tokens) - responses.append(response) - - # Log progress - if (i + 1) % 10 == 0: - elapsed = time.time() - start_time - rate = (i + 1) / elapsed - logger.info(f"Processed {i+1}/{num_samples} samples ({rate:.2f} samples/sec)") + # Use multiprocessing pool + with Pool(processes=min(max_concurrency, num_samples)) as pool: + # Map the function to all arguments + results = pool.map(send_single_request, args_list) + + # Sort results by sample_id to maintain order + results.sort(key=lambda x: x[0]) + responses = [result[1] for result in results] total_time = time.time() - start_time logger.info(f"Completed {num_samples} requests in {total_time:.2f} seconds") @@ -170,16 +188,16 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], logger.info(f"Responses saved to {output_file}") -def process_requests(client: SGLangClient, data: pd.DataFrame, tokenizer, +def process_requests(data: pd.DataFrame, tokenizer, server_url: str, max_samples: int = None, max_tokens: int = 100, - output_file: str = "responses.jsonl") -> None: + max_concurrency: int = 128, output_file: str = "responses.jsonl") -> None: """Main processing function that handles tokenization, requests, and detokenization.""" # Step 1: Tokenize all inputs input_ids_list, text_inputs = tokenize_all_inputs(data, tokenizer, max_samples) - # Step 2: Send all requests - responses = send_requests_batch(client, input_ids_list, text_inputs, max_tokens) + # Step 2: Send all requests in parallel + responses = send_requests_parallel(input_ids_list, server_url, max_tokens, max_concurrency) # Step 3: Detokenize all responses 
response_texts = detokenize_all_responses(responses, input_ids_list, tokenizer) @@ -201,6 +219,8 @@ def main(): help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate per request") + parser.add_argument("--max-concurrency", type=int, default=128, + help="Maximum number of concurrent requests (default: 128)") parser.add_argument("--output", default="responses.jsonl", help="Output file for responses") @@ -212,12 +232,10 @@ def main(): # Load tokenizer tokenizer = load_tokenizer(args.model_name) - # Create client - client = SGLangClient(args.server_url) - # Test connection logger.info(f"Testing server connection to {args.server_url}...") - test_response = client.send_request([1, 2, 3], max_tokens=5) + test_client = SGLangClient(args.server_url) + test_response = test_client.send_request([1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") logger.error("Make sure your SGLang server is running. Try:") @@ -226,10 +244,11 @@ def main(): return logger.info("Server connection successful") - # Process all requests in batches - process_requests(client, data, tokenizer, + # Process all requests in parallel + process_requests(data, tokenizer, args.server_url, max_samples=args.max_samples, max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, output_file=args.output) From e76b68db11270f06fb724954919210f574d6bb10 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 04:01:17 -0700 Subject: [PATCH 009/165] increase timeout --- language/gpt-oss/send_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 493a201c56..db537d934a 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -44,7 +44,7 @@ def send_request( response = self.session.post( f"{self.base_url}/generate", json=payload, - timeout=60 + timeout=1200 ) if response.status_code == 200: return response.json() From b6d5671b673306cf2b9e2b43e0b33d60a54e8677 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Sep 2025 11:01:42 +0000 Subject: [PATCH 010/165] [Automated Commit] Format Codebase --- language/gpt-oss/send_requests.py | 128 ++++++++++++++++++------------ 1 file changed, 76 insertions(+), 52 deletions(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index db537d934a..429a9e7ee8 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -49,7 +49,8 @@ def send_request( if response.status_code == 200: return response.json() else: - logger.error(f"Request failed with status {response.status_code}: {response.text}") + logger.error( + f"Request failed with status {response.status_code}: {response.text}") return {"error": f"HTTP {response.status_code}: {response.text}"} except requests.exceptions.RequestException as e: logger.error(f"Request failed: {e}") @@ -61,11 +62,11 @@ def load_text_data(data_file: str) -> pd.DataFrame: logger.info(f"Loading data from {data_file}") with open(data_file, 'rb') as f: data = pickle.load(f) - + logger.info(f"Loaded {len(data)} samples") logger.info(f"Columns: {list(data.columns)}") logger.info(f"First text input length: {len(data.iloc[0]['text_input'])}") - + return data @@ -81,28 +82,35 @@ def load_tokenizer(model_name: str): raise -def tokenize_all_inputs(data: pd.DataFrame, tokenizer, max_samples: int = 
None): +def tokenize_all_inputs(data: pd.DataFrame, tokenizer, + max_samples: int = None): """Tokenize all text inputs at once.""" num_samples = min(len(data), max_samples) if max_samples else len(data) logger.info(f"Tokenizing {num_samples} text inputs...") - + text_inputs = data['text_input'].tolist()[:num_samples] - + # Tokenize all texts at once - tokenized = tokenizer(text_inputs, return_tensors="pt", padding=True, truncation=True) - input_ids_list = [tokenized['input_ids'][i].tolist() for i in range(num_samples)] - - logger.info(f"Tokenization complete. Token lengths: {[len(ids) for ids in input_ids_list[:5]]}...") + tokenized = tokenizer( + text_inputs, + return_tensors="pt", + padding=True, + truncation=True) + input_ids_list = [tokenized['input_ids'][i].tolist() + for i in range(num_samples)] + + logger.info( + f"Tokenization complete. Token lengths: {[len(ids) for ids in input_ids_list[:5]]}...") return input_ids_list, text_inputs def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" input_ids, max_tokens, server_url, sample_id = args_tuple - + # Create a new client for this process client = SGLangClient(server_url) - + try: response = client.send_request(input_ids, max_tokens=max_tokens) return sample_id, response @@ -112,40 +120,42 @@ def send_single_request(args_tuple): def send_requests_parallel(input_ids_list: List[List[int]], server_url: str, - max_tokens: int = 100, max_concurrency: int = 128) -> List[Dict[str, Any]]: + max_tokens: int = 100, max_concurrency: int = 128) -> List[Dict[str, Any]]: """Send all requests to SGLang server in parallel using multiprocessing.""" num_samples = len(input_ids_list) - logger.info(f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") - + logger.info( + f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") + # Prepare arguments for multiprocessing args_list = [ - (input_ids, max_tokens, server_url, i) + (input_ids, max_tokens, server_url, i) for i, input_ids in enumerate(input_ids_list) ] - + start_time = time.time() - + # Use multiprocessing pool with Pool(processes=min(max_concurrency, num_samples)) as pool: # Map the function to all arguments results = pool.map(send_single_request, args_list) - + # Sort results by sample_id to maintain order results.sort(key=lambda x: x[0]) responses = [result[1] for result in results] - + total_time = time.time() - start_time - logger.info(f"Completed {num_samples} requests in {total_time:.2f} seconds") + logger.info( + f"Completed {num_samples} requests in {total_time:.2f} seconds") logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") - + return responses -def detokenize_all_responses(responses: List[Dict[str, Any]], input_ids_list: List[List[int]], - tokenizer) -> List[str]: +def detokenize_all_responses(responses: List[Dict[str, Any]], input_ids_list: List[List[int]], + tokenizer) -> List[str]: """Detokenize all responses at once.""" logger.info("Detokenizing responses...") - + response_texts = [] for i, (response, input_ids) in enumerate(zip(responses, input_ids_list)): response_text = "" @@ -153,57 +163,71 @@ def detokenize_all_responses(responses: List[Dict[str, Any]], input_ids_list: Li try: # Extract generated tokens (excluding input tokens) generated_tokens = response["generated_text"][len(input_ids):] - response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) + response_text = tokenizer.decode( + generated_tokens, skip_special_tokens=True) 
except Exception as e: - logger.warning(f"Failed to decode response for sample {i+1}: {e}") + logger.warning( + f"Failed to decode response for sample {i+1}: {e}") response_texts.append(response_text) - + logger.info("Detokenization complete") return response_texts -def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], - text_inputs: List[str], input_ids_list: List[List[int]], - output_file: str) -> None: +def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], + text_inputs: List[str], input_ids_list: List[List[int]], + output_file: str) -> None: """Save all responses to file.""" logger.info(f"Saving responses to {output_file}...") - + with open(output_file, 'w') as f: for i, (response, response_text, text_input, input_ids) in enumerate( - zip(responses, response_texts, text_inputs, input_ids_list)): - + zip(responses, response_texts, text_inputs, input_ids_list)): + response_data = { "sample_id": int(i), "text_input": text_input[:200] + "..." if len(text_input) > 200 else text_input, "input_length": len(text_input), "token_length": len(input_ids), - "input_tokens": input_ids[:10], # First 10 tokens for reference + # First 10 tokens for reference + "input_tokens": input_ids[:10], "response": response, "response_text": response_text, "timestamp": float(time.time()) } - + f.write(json.dumps(response_data) + '\n') - + logger.info(f"Responses saved to {output_file}") def process_requests(data: pd.DataFrame, tokenizer, server_url: str, - max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 128, output_file: str = "responses.jsonl") -> None: + max_samples: int = None, max_tokens: int = 100, + max_concurrency: int = 128, output_file: str = "responses.jsonl") -> None: """Main processing function that handles tokenization, requests, and detokenization.""" - + # Step 1: Tokenize all inputs - input_ids_list, text_inputs = tokenize_all_inputs(data, tokenizer, max_samples) - + input_ids_list, text_inputs = tokenize_all_inputs( + data, tokenizer, max_samples) + # Step 2: Send all requests in parallel - responses = send_requests_parallel(input_ids_list, server_url, max_tokens, max_concurrency) - + responses = send_requests_parallel( + input_ids_list, + server_url, + max_tokens, + max_concurrency) + # Step 3: Detokenize all responses - response_texts = detokenize_all_responses(responses, input_ids_list, tokenizer) - + response_texts = detokenize_all_responses( + responses, input_ids_list, tokenizer) + # Step 4: Save all results - save_responses(responses, response_texts, text_inputs, input_ids_list, output_file) + save_responses( + responses, + response_texts, + text_inputs, + input_ids_list, + output_file) def main(): @@ -228,7 +252,7 @@ def main(): # Load data data = load_text_data(args.data_file) - + # Load tokenizer tokenizer = load_tokenizer(args.model_name) @@ -246,10 +270,10 @@ def main(): # Process all requests in parallel process_requests(data, tokenizer, args.server_url, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - max_concurrency=args.max_concurrency, - output_file=args.output) + max_samples=args.max_samples, + max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, + output_file=args.output) if __name__ == "__main__": From 4fd4f56b28bc670f5b8f5fde021687ffb2a27d27 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 12:30:50 -0700 Subject: [PATCH 011/165] add refactor changes --- language/gpt-oss/send_requests.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff 
--git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 429a9e7ee8..7bf809492a 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -250,12 +250,6 @@ def main(): args = parser.parse_args() - # Load data - data = load_text_data(args.data_file) - - # Load tokenizer - tokenizer = load_tokenizer(args.model_name) - # Test connection logger.info(f"Testing server connection to {args.server_url}...") test_client = SGLangClient(args.server_url) @@ -268,7 +262,10 @@ def main(): return logger.info("Server connection successful") - # Process all requests in parallel + + data = load_text_data(args.data_file) + tokenizer = load_tokenizer(args.model_name) + process_requests(data, tokenizer, args.server_url, max_samples=args.max_samples, max_tokens=args.max_tokens, From 1df0885944fb88af554b86025c845c2152970207 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Sep 2025 19:31:16 +0000 Subject: [PATCH 012/165] [Automated Commit] Format Codebase --- language/gpt-oss/send_requests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 7bf809492a..4e33b90563 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -262,7 +262,6 @@ def main(): return logger.info("Server connection successful") - data = load_text_data(args.data_file) tokenizer = load_tokenizer(args.model_name) From eb2f48ccc727e2210b5053dd8d6e525e4d25e79a Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 13:09:33 -0700 Subject: [PATCH 013/165] rm truncation --- language/gpt-oss/send_requests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 4e33b90563..90c823b7c4 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -186,11 +186,11 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], response_data = { "sample_id": int(i), - "text_input": text_input[:200] + "..." 
if len(text_input) > 200 else text_input, + "text_input": text_input, "input_length": len(text_input), "token_length": len(input_ids), # First 10 tokens for reference - "input_tokens": input_ids[:10], + "input_tokens": input_ids, "response": response, "response_text": response_text, "timestamp": float(time.time()) From 4f35b8aa04b9ce688f0bf854c68e7c54a04226af Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 13:26:48 -0700 Subject: [PATCH 014/165] rm truncation, wait for server ready --- language/gpt-oss/run_ref_bench.sh | 106 ++++++++++++++++++++++++++++++ language/gpt-oss/send_requests.py | 5 +- 2 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 language/gpt-oss/run_ref_bench.sh diff --git a/language/gpt-oss/run_ref_bench.sh b/language/gpt-oss/run_ref_bench.sh new file mode 100644 index 0000000000..e585915608 --- /dev/null +++ b/language/gpt-oss/run_ref_bench.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +model_name="openai/gpt-oss-120b" +data_file="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl" +output_file="responses.jsonl" +max_samples=4388 +max_token_osl=20000 +max_concurrency=256 +tep_size=8 +mlperf_storage="/home/mlperf_inference_storage" + +out_folder=output_$SLURM_JOB_ID +mkdir -p $out_folder + +while [[ $# -gt 0 ]]; do + case $1 in + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --model-name: Set the model name" + echo " --data-file: Set the data file" + echo " --output-file: Set the output file" + echo " --max-samples: Set the maximum number of samples" + echo " --max-token-osl: Set the maximum token output length" + echo " --max-concurrency: Set the maximum concurrency for requests sent" + echo " --tep-size: Set tp, ep size for server" + echo " --mlperf-storage: Set the mlperf storage directory" + echo " --help|-h: Show this help message" + exit 0 + ;; + --model-name) + model_name=$2 + shift 2 + ;; + --data-file) + data_file=$2 + shift 2 + ;; + --output-file) + output_file=$2 + shift 2 + ;; + --max-samples) + max_samples=$2 + shift 2 + ;; + --max-tokens) + max_tokens=$2 + shift 2 + ;; + --max-concurrency) + max_concurrency=$2 + shift 2 + ;; + --tep-size) + tep_size=$2 + shift 2 + ;; + --mlperf-storage) + mlperf_storage=$2 + shift 2 + ;; + esac +done + +srun --nodes=1 \ + --output=$out_folder/server.log \ + --container-image=images/lmsysorg+sglang+v0.5.3rc1.sqsh \ + --container-mounts=$(pwd):/$(pwd),$mlperf_storage:/home/mlperf_inference_storage, + --container-name=sglang_server_$SLURM_JOB_ID \ + python3 -m sglang.launch_server \ + --model-path $model_name \ + --host 0.0.0.0 \ + --port 30000 \ + --trust-remote-code \ + --tp-size=$tep_size \ + --data-parallel-size=1 \ + --cuda-graph-max-bs $max_concurrency \ + --max-running-requests $max_concurrency \ + --mem-fraction-static 0.85 \ + --kv-cache-dtype fp8_e4m3 \ + --chunked-prefill-size 16384 \ + --ep-size $tep_size \ + --quantization modelopt_fp4 \ + --enable-flashinfer-allreduce-fusion \ + --enable-symm-mem \ + --disable-radix-cache \ + --attention-backend trtllm_mla \ + --moe-runner-backend flashinfer_trtllm \ + --stream-interval 10 & + +echo "Waiting for server to start on port 30000..." +while ! srun --nodes=1 --overlap netstat -tulnp 2>/dev/null | grep -q ":30000"; do + echo "Server not ready yet, waiting..." + sleep 5 +done +echo "Server is ready on port 30000!" 
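For comparison, the port poll above can be expressed in Python with a plain TCP connect. This is an illustrative sketch only, not something the patch adds; the port and sleep interval mirror the script, and the timeout is an added safeguard:

```python
# Illustrative readiness probe: retry a TCP connect to the SGLang port until
# it accepts connections or the timeout expires.
import socket
import time


def wait_for_port(host="localhost", port=30000, timeout_s=1200, interval_s=5):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True  # port is accepting connections
        except OSError:
            time.sleep(interval_s)  # not ready yet, try again
    return False
```

A stricter check could reuse the same `/generate` smoke request that `send_requests.py` already issues before processing.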
+ +srun --nodes=1 --overlap --container-name sglang_client_$SLURM_JOB_ID --output=$out_folder/client.log \ + python3 send_requests.py \ + --model-name $model_name \ + --data-file $data_file \ + --output $out_folder/responses.jsonl \ + --max-samples $max_samples \ + --max-tokens $max_token_osl \ + --max-concurrency $max_concurrency \ No newline at end of file diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 90c823b7c4..7bf809492a 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -186,11 +186,11 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], response_data = { "sample_id": int(i), - "text_input": text_input, + "text_input": text_input[:200] + "..." if len(text_input) > 200 else text_input, "input_length": len(text_input), "token_length": len(input_ids), # First 10 tokens for reference - "input_tokens": input_ids, + "input_tokens": input_ids[:10], "response": response, "response_text": response_text, "timestamp": float(time.time()) @@ -262,6 +262,7 @@ def main(): return logger.info("Server connection successful") + data = load_text_data(args.data_file) tokenizer = load_tokenizer(args.model_name) From 354cb62c95020932ca2e4b88dca6407c6ceadd96 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 13:28:01 -0700 Subject: [PATCH 015/165] left padding --- language/gpt-oss/send_requests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 7bf809492a..3a074eb5f2 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -95,7 +95,8 @@ def tokenize_all_inputs(data: pd.DataFrame, tokenizer, text_inputs, return_tensors="pt", padding=True, - truncation=True) + truncation=True, + padding_side="left") input_ids_list = [tokenized['input_ids'][i].tolist() for i in range(num_samples)] @@ -186,11 +187,10 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], response_data = { "sample_id": int(i), - "text_input": text_input[:200] + "..." 
if len(text_input) > 200 else text_input, + "text_input": text_input, "input_length": len(text_input), "token_length": len(input_ids), - # First 10 tokens for reference - "input_tokens": input_ids[:10], + "input_tokens": input_ids, "response": response, "response_text": response_text, "timestamp": float(time.time()) From 75f4307e36b92ae3fb5610667d0514dc7259d4a1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Sep 2025 20:28:32 +0000 Subject: [PATCH 016/165] [Automated Commit] Format Codebase --- language/gpt-oss/send_requests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 3a074eb5f2..983625a56c 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -262,7 +262,6 @@ def main(): return logger.info("Server connection successful") - data = load_text_data(args.data_file) tokenizer = load_tokenizer(args.model_name) From 065bf7cfe298f8300f6a76667f8e17133607a6f4 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 13:38:04 -0700 Subject: [PATCH 017/165] fixes --- language/gpt-oss/run_ref_bench.sh | 6 ++++-- language/gpt-oss/send_requests.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/language/gpt-oss/run_ref_bench.sh b/language/gpt-oss/run_ref_bench.sh index e585915608..4440478d5b 100644 --- a/language/gpt-oss/run_ref_bench.sh +++ b/language/gpt-oss/run_ref_bench.sh @@ -63,10 +63,11 @@ while [[ $# -gt 0 ]]; do esac done +set -x; srun --nodes=1 \ --output=$out_folder/server.log \ --container-image=images/lmsysorg+sglang+v0.5.3rc1.sqsh \ - --container-mounts=$(pwd):/$(pwd),$mlperf_storage:/home/mlperf_inference_storage, + --container-mounts=$(pwd):/$(pwd),$mlperf_storage:/home/mlperf_inference_storage \ --container-name=sglang_server_$SLURM_JOB_ID \ python3 -m sglang.launch_server \ --model-path $model_name \ @@ -88,6 +89,7 @@ srun --nodes=1 \ --attention-backend trtllm_mla \ --moe-runner-backend flashinfer_trtllm \ --stream-interval 10 & +set +x; echo "Waiting for server to start on port 30000..." while ! srun --nodes=1 --overlap netstat -tulnp 2>/dev/null | grep -q ":30000"; do @@ -103,4 +105,4 @@ srun --nodes=1 --overlap --container-name sglang_client_$SLURM_JOB_ID --output=$ --output $out_folder/responses.jsonl \ --max-samples $max_samples \ --max-tokens $max_token_osl \ - --max-concurrency $max_concurrency \ No newline at end of file + --max-concurrency $max_concurrency diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/send_requests.py index 983625a56c..60e92b1960 100755 --- a/language/gpt-oss/send_requests.py +++ b/language/gpt-oss/send_requests.py @@ -96,7 +96,7 @@ def tokenize_all_inputs(data: pd.DataFrame, tokenizer, return_tensors="pt", padding=True, truncation=True, - padding_side="left") + padding_side="right") input_ids_list = [tokenized['input_ids'][i].tolist() for i in range(num_samples)] From 36aa581179553b8efd184f56f8658853212298ff Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 13:44:24 -0700 Subject: [PATCH 018/165] add failure check --- language/gpt-oss/run_ref_bench.sh | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/run_ref_bench.sh b/language/gpt-oss/run_ref_bench.sh index 4440478d5b..e4f5006b4b 100644 --- a/language/gpt-oss/run_ref_bench.sh +++ b/language/gpt-oss/run_ref_bench.sh @@ -91,10 +91,34 @@ srun --nodes=1 \ --stream-interval 10 & set +x; +SERVER_PID=$! 
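One detail behind the `padding_side` back-and-forth in the send_requests.py hunks above: because the prompts are tokenized as a single padded batch, every shorter prompt picks up pad tokens that then travel into its per-sample `input_ids` list (at the front with left padding, at the end with right padding). An illustrative way to keep the payloads pad-free, assuming the tokenizer defines a pad token as the script already relies on, is to mask the pads out:

```python
# Illustrative sketch (not part of the patch): batch-tokenize, then use the
# attention mask to drop pad positions before building per-sample token lists.
from transformers import AutoTokenizer


def unpadded_token_lists(texts, model_name="openai/gpt-oss-120b"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    return [
        ids[mask.bool()].tolist()  # keep only real (non-pad) positions
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]
```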
+echo "Server launched with PID: $SERVER_PID" + echo "Waiting for server to start on port 30000..." +TIMEOUT=300 # 5 minutes timeout +ELAPSED=0 while ! srun --nodes=1 --overlap netstat -tulnp 2>/dev/null | grep -q ":30000"; do - echo "Server not ready yet, waiting..." + # Check if server process is still running + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "ERROR: Server process has died. Checking server logs..." + echo "Last 20 lines of server log:" + tail -20 $out_folder/server.log + echo "Server launch failed. Exiting." + exit 1 + fi + + # Check for timeout + if [ $ELAPSED -ge $TIMEOUT ]; then + echo "ERROR: Server failed to start within $TIMEOUT seconds. Checking server logs..." + echo "Last 20 lines of server log:" + tail -20 $out_folder/server.log + echo "Timeout reached. Exiting." + exit 1 + fi + + echo "Server not ready yet, waiting... (${ELAPSED}s/${TIMEOUT}s)" sleep 5 + ELAPSED=$((ELAPSED + 5)) done echo "Server is ready on port 30000!" From c75e629a686ee796cfee7e75c888fd946e71e47c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 27 Sep 2025 16:31:33 -0700 Subject: [PATCH 019/165] change opts --- language/gpt-oss/run_ref_bench.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 language/gpt-oss/run_ref_bench.sh diff --git a/language/gpt-oss/run_ref_bench.sh b/language/gpt-oss/run_ref_bench.sh old mode 100644 new mode 100755 index e4f5006b4b..300da9aa87 --- a/language/gpt-oss/run_ref_bench.sh +++ b/language/gpt-oss/run_ref_bench.sh @@ -66,7 +66,7 @@ done set -x; srun --nodes=1 \ --output=$out_folder/server.log \ - --container-image=images/lmsysorg+sglang+v0.5.3rc1.sqsh \ + --container-image=./images/lmsysorg+sglang+v0.5.3rc1.sqsh \ --container-mounts=$(pwd):/$(pwd),$mlperf_storage:/home/mlperf_inference_storage \ --container-name=sglang_server_$SLURM_JOB_ID \ python3 -m sglang.launch_server \ @@ -82,11 +82,11 @@ srun --nodes=1 \ --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $tep_size \ - --quantization modelopt_fp4 \ + --quantization mxfp4 \ --enable-flashinfer-allreduce-fusion \ --enable-symm-mem \ --disable-radix-cache \ - --attention-backend trtllm_mla \ + --attention-backend trtllm_mha \ --moe-runner-backend flashinfer_trtllm \ --stream-interval 10 & set +x; @@ -95,7 +95,7 @@ SERVER_PID=$! echo "Server launched with PID: $SERVER_PID" echo "Waiting for server to start on port 30000..." -TIMEOUT=300 # 5 minutes timeout +TIMEOUT=1200 # 20 minutes timeout ELAPSED=0 while ! 
srun --nodes=1 --overlap netstat -tulnp 2>/dev/null | grep -q ":30000"; do # Check if server process is still running From 040e98645a331e0ca4ed45d259f97dc047efc9d8 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 30 Sep 2025 23:21:24 -0700 Subject: [PATCH 020/165] organize files --- language/gpt-oss/README_send_requests.md | 77 ------------------- .../{ => infer-sglang}/run_ref_bench.sh | 0 .../{ => infer-sglang}/send_requests.py | 0 3 files changed, 77 deletions(-) delete mode 100644 language/gpt-oss/README_send_requests.md rename language/gpt-oss/{ => infer-sglang}/run_ref_bench.sh (100%) rename language/gpt-oss/{ => infer-sglang}/send_requests.py (100%) diff --git a/language/gpt-oss/README_send_requests.md b/language/gpt-oss/README_send_requests.md deleted file mode 100644 index 115ece573f..0000000000 --- a/language/gpt-oss/README_send_requests.md +++ /dev/null @@ -1,77 +0,0 @@ -# SGLang Request Sender - -This script loads text data from a pickle file, tokenizes it using a specified model's tokenizer, sends requests to a running SGLang server, and converts responses back to text. - -## Usage - -### Basic usage: -```bash -python send_requests.py --model-name openai/gpt-oss-120b -``` - -### Process only first 100 samples: -```bash -python send_requests.py --model-name openai/gpt-oss-120b --max-samples 100 -``` - -### Custom data file and output: -```bash -python send_requests.py --model-name openai/gpt-oss-120b --data-file /path/to/data.pkl --output my_responses.jsonl -``` - -### Custom max tokens and server URL: -```bash -python send_requests.py --model-name openai/gpt-oss-120b --max-tokens 50 --server-url http://localhost:8000 -``` - -### Custom concurrency level: -```bash -python send_requests.py --model-name openai/gpt-oss-120b --max-concurrency 64 -``` - -## Arguments - -- `--data-file`: Path to pickle file containing text data (default: `/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl`) -- `--model-name`: Model name for tokenizer (required, e.g., `openai/gpt-oss-120b`) -- `--server-url`: SGLang server URL (default: `http://localhost:30000`) -- `--max-samples`: Maximum number of samples to process (default: all) -- `--max-tokens`: Maximum tokens to generate per request (default: 100) -- `--max-concurrency`: Maximum number of concurrent requests (default: 128) -- `--output`: Output file for responses (default: `responses.jsonl`) - -## Output Format - -The script outputs a JSONL file where each line contains: -```json -{ - "sample_id": 0, - "text_input": "Here are some example problems...", - "input_length": 1283, - "token_length": 512, - "input_tokens": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "response": { - "generated_text": [1, 2, 3, ...], - "usage": {...} - }, - "response_text": "The answer is 6...", - "timestamp": 1695821234.567 -} -``` - -## Requirements - -Install dependencies: -```bash -pip install -r requirements.txt -``` - -## Notes - -- The script loads text data from a pandas DataFrame in the pickle file -- It uses the specified model's tokenizer to convert text to tokens -- Sends tokenized input to SGLang server via `/generate` endpoint in parallel using multiprocessing -- Converts response tokens back to text using the same tokenizer -- Uses configurable concurrency (default: 128 concurrent requests) -- Each process creates its own HTTP client to avoid connection issues -- Results are maintained in order despite parallel processing -- Progress is logged during processing \ No newline at end of file diff --git 
a/language/gpt-oss/run_ref_bench.sh b/language/gpt-oss/infer-sglang/run_ref_bench.sh similarity index 100% rename from language/gpt-oss/run_ref_bench.sh rename to language/gpt-oss/infer-sglang/run_ref_bench.sh diff --git a/language/gpt-oss/send_requests.py b/language/gpt-oss/infer-sglang/send_requests.py similarity index 100% rename from language/gpt-oss/send_requests.py rename to language/gpt-oss/infer-sglang/send_requests.py From 991ff7e98c6927cf1af81367444bf406495ae107 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 30 Sep 2025 23:29:08 -0700 Subject: [PATCH 021/165] rm submodule --- .gitmodules | 4 ---- language/gpt-oss/deps/sglang | 1 - 2 files changed, 5 deletions(-) delete mode 160000 language/gpt-oss/deps/sglang diff --git a/.gitmodules b/.gitmodules index d177d01f7c..7f0d5d302a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,7 +10,3 @@ [submodule "language/deepseek-r1/submodules/LiveCodeBench"] path = language/deepseek-r1/submodules/LiveCodeBench url = https://github.com/LiveCodeBench/LiveCodeBench -[submodule "language/gpt-oss/deps/sglang"] - path = language/gpt-oss/deps/sglang - url = git@github.com:sgl-project/sglang.git - branch = main diff --git a/language/gpt-oss/deps/sglang b/language/gpt-oss/deps/sglang deleted file mode 160000 index 172bcf0152..0000000000 --- a/language/gpt-oss/deps/sglang +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 172bcf0152d0d4e4fdd12d7b5c47195ea2aa28ac From 492847a412312ea4538425961f91ec49f22c3fcc Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:29:04 +0000 Subject: [PATCH 022/165] add infer stuff --- .../gpt-oss/infer-sglang/run_ref_bench.sh | 132 --------- language/gpt-oss/infer-sglang/run_server.sh | 26 ++ .../gpt-oss/infer-sglang/send_requests.py | 274 ++++++++++-------- 3 files changed, 174 insertions(+), 258 deletions(-) delete mode 100755 language/gpt-oss/infer-sglang/run_ref_bench.sh create mode 100755 language/gpt-oss/infer-sglang/run_server.sh diff --git a/language/gpt-oss/infer-sglang/run_ref_bench.sh b/language/gpt-oss/infer-sglang/run_ref_bench.sh deleted file mode 100755 index 300da9aa87..0000000000 --- a/language/gpt-oss/infer-sglang/run_ref_bench.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -model_name="openai/gpt-oss-120b" -data_file="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl" -output_file="responses.jsonl" -max_samples=4388 -max_token_osl=20000 -max_concurrency=256 -tep_size=8 -mlperf_storage="/home/mlperf_inference_storage" - -out_folder=output_$SLURM_JOB_ID -mkdir -p $out_folder - -while [[ $# -gt 0 ]]; do - case $1 in - --help|-h) - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --model-name: Set the model name" - echo " --data-file: Set the data file" - echo " --output-file: Set the output file" - echo " --max-samples: Set the maximum number of samples" - echo " --max-token-osl: Set the maximum token output length" - echo " --max-concurrency: Set the maximum concurrency for requests sent" - echo " --tep-size: Set tp, ep size for server" - echo " --mlperf-storage: Set the mlperf storage directory" - echo " --help|-h: Show this help message" - exit 0 - ;; - --model-name) - model_name=$2 - shift 2 - ;; - --data-file) - data_file=$2 - shift 2 - ;; - --output-file) - output_file=$2 - shift 2 - ;; - --max-samples) - max_samples=$2 - shift 2 - ;; - --max-tokens) - max_tokens=$2 - shift 2 - ;; - --max-concurrency) - max_concurrency=$2 - shift 2 - ;; - --tep-size) - tep_size=$2 - shift 2 - ;; - --mlperf-storage) - mlperf_storage=$2 - shift 2 - 
;; - esac -done - -set -x; -srun --nodes=1 \ - --output=$out_folder/server.log \ - --container-image=./images/lmsysorg+sglang+v0.5.3rc1.sqsh \ - --container-mounts=$(pwd):/$(pwd),$mlperf_storage:/home/mlperf_inference_storage \ - --container-name=sglang_server_$SLURM_JOB_ID \ - python3 -m sglang.launch_server \ - --model-path $model_name \ - --host 0.0.0.0 \ - --port 30000 \ - --trust-remote-code \ - --tp-size=$tep_size \ - --data-parallel-size=1 \ - --cuda-graph-max-bs $max_concurrency \ - --max-running-requests $max_concurrency \ - --mem-fraction-static 0.85 \ - --kv-cache-dtype fp8_e4m3 \ - --chunked-prefill-size 16384 \ - --ep-size $tep_size \ - --quantization mxfp4 \ - --enable-flashinfer-allreduce-fusion \ - --enable-symm-mem \ - --disable-radix-cache \ - --attention-backend trtllm_mha \ - --moe-runner-backend flashinfer_trtllm \ - --stream-interval 10 & -set +x; - -SERVER_PID=$! -echo "Server launched with PID: $SERVER_PID" - -echo "Waiting for server to start on port 30000..." -TIMEOUT=1200 # 20 minutes timeout -ELAPSED=0 -while ! srun --nodes=1 --overlap netstat -tulnp 2>/dev/null | grep -q ":30000"; do - # Check if server process is still running - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "ERROR: Server process has died. Checking server logs..." - echo "Last 20 lines of server log:" - tail -20 $out_folder/server.log - echo "Server launch failed. Exiting." - exit 1 - fi - - # Check for timeout - if [ $ELAPSED -ge $TIMEOUT ]; then - echo "ERROR: Server failed to start within $TIMEOUT seconds. Checking server logs..." - echo "Last 20 lines of server log:" - tail -20 $out_folder/server.log - echo "Timeout reached. Exiting." - exit 1 - fi - - echo "Server not ready yet, waiting... (${ELAPSED}s/${TIMEOUT}s)" - sleep 5 - ELAPSED=$((ELAPSED + 5)) -done -echo "Server is ready on port 30000!" - -srun --nodes=1 --overlap --container-name sglang_client_$SLURM_JOB_ID --output=$out_folder/client.log \ - python3 send_requests.py \ - --model-name $model_name \ - --data-file $data_file \ - --output $out_folder/responses.jsonl \ - --max-samples $max_samples \ - --max-tokens $max_token_osl \ - --max-concurrency $max_concurrency diff --git a/language/gpt-oss/infer-sglang/run_server.sh b/language/gpt-oss/infer-sglang/run_server.sh new file mode 100755 index 0000000000..0ec2e3050c --- /dev/null +++ b/language/gpt-oss/infer-sglang/run_server.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +dp=1 + +while [[ $# -gt 0 ]]; do + case $1 in + --dp) + dp=$2 + shift 2 + ;; + esac +done + +set -x; +python3 -m sglang.launch_server \ + --model-path openai/gpt-oss-120b \ + --host 0.0.0.0 \ + --port 30000 \ + --tp-size=1 \ + --data-parallel-size=$dp \ + --max-running-requests 256 \ + --mem-fraction-static 0.85 \ + --chunked-prefill-size 16384 \ + --ep-size=1 \ + --quantization mxfp4 \ + --stream-interval 50 diff --git a/language/gpt-oss/infer-sglang/send_requests.py b/language/gpt-oss/infer-sglang/send_requests.py index 60e92b1960..367e69003d 100755 --- a/language/gpt-oss/infer-sglang/send_requests.py +++ b/language/gpt-oss/infer-sglang/send_requests.py @@ -1,20 +1,28 @@ #!/usr/bin/env python3 """ -Script to send text requests to SGLang server with tokenization. +Script to send pre-tokenized requests to SGLang server. 
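For orientation on the rewritten interface (an illustrative note, not part of the patch): the new script consumes a pandas DataFrame pickle whose `tok_input` column holds one token-id list per row, as emitted by harmony-tokens.py, and its summary logging also reads `tok_input_len`. A minimal stand-in file for smoke testing could be built like this, with made-up token ids:

```python
# Hypothetical smoke-test input for the rewritten send_requests.py: a pickle
# containing a DataFrame with the columns the script expects to read.
import pandas as pd

df = pd.DataFrame({
    "tok_input": [[1, 2, 3], [4, 5, 6, 7]],  # pre-rendered prompt token ids
    "tok_input_len": [3, 4],                 # lengths used by the summary log
})
df.to_pickle("tokenized_data.pkl")

# python send_requests.py --input-tokens tokenized_data.pkl --max-tokens 16
```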
+ +Usage: + python send_requests.py --input-tokens tokenized_data.pkl [options] + +Arguments: + --input-tokens Path to pickle file containing pre-tokenized data from harmony-tokens.py + --server-url SGLang server URL (default: http://umbriel-b200-145:30000) + --max-samples Maximum number of samples to process (default: all) + --max-tokens Maximum tokens to generate per request (default: 100) + --max-concurrency Maximum number of concurrent requests (default: 128) + --output Output pickle file for responses (optional) """ -import numpy as np -import pandas as pd -import pickle import requests import json import time import argparse from typing import List, Dict, Any import logging -from transformers import AutoTokenizer from multiprocessing import Pool -from functools import partial +import pandas as pd +from tqdm import tqdm # Set up logging logging.basicConfig( @@ -57,52 +65,38 @@ def send_request( return {"error": str(e)} -def load_text_data(data_file: str) -> pd.DataFrame: - """Load the text data from pickle file.""" - logger.info(f"Loading data from {data_file}") - with open(data_file, 'rb') as f: - data = pickle.load(f) - - logger.info(f"Loaded {len(data)} samples") - logger.info(f"Columns: {list(data.columns)}") - logger.info(f"First text input length: {len(data.iloc[0]['text_input'])}") - - return data - - -def load_tokenizer(model_name: str): - """Load tokenizer for the specified model.""" - logger.info(f"Loading tokenizer for {model_name}") - try: - tokenizer = AutoTokenizer.from_pretrained(model_name) - logger.info(f"Tokenizer loaded successfully") - return tokenizer - except Exception as e: - logger.error(f"Failed to load tokenizer: {e}") - raise - - -def tokenize_all_inputs(data: pd.DataFrame, tokenizer, - max_samples: int = None): - """Tokenize all text inputs at once.""" - num_samples = min(len(data), max_samples) if max_samples else len(data) - logger.info(f"Tokenizing {num_samples} text inputs...") - - text_inputs = data['text_input'].tolist()[:num_samples] - - # Tokenize all texts at once - tokenized = tokenizer( - text_inputs, - return_tensors="pt", - padding=True, - truncation=True, - padding_side="right") - input_ids_list = [tokenized['input_ids'][i].tolist() - for i in range(num_samples)] - - logger.info( - f"Tokenization complete. 
Token lengths: {[len(ids) for ids in input_ids_list[:5]]}...") - return input_ids_list, text_inputs +def load_tokenized_data(data_file: str) -> pd.DataFrame: + """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" + logger.info(f"Loading tokenized data from {data_file}") + + # Load DataFrame from pickle + df = pd.read_pickle(data_file) + logger.info(f"Loaded DataFrame with shape: {df.shape}") + + # Check if tok_input column exists and has valid data + if 'tok_input' in df.columns: + # Check for any None values in tok_input (indicating failed tokenization) + failed_mask = df['tok_input'].isna() + failed_count = failed_mask.sum() + + if failed_count > 0: + failed_indices = df[failed_mask].index.unique() + error_msg = f"Found {failed_count} failed tokenized samples at indices: {failed_indices.tolist()}" + logger.error(error_msg) + raise AssertionError(error_msg) + + # Check first sample + first_tokens = df.iloc[0]['tok_input'] + if isinstance(first_tokens, list): + logger.info(f"First sample token length: {len(first_tokens)}") + else: + logger.warning("tok_input column exists but first sample is not a list") + + logger.info(f"All {len(df)} samples were successfully tokenized") + else: + logger.warning("No 'tok_input' column found in DataFrame") + + return df def send_single_request(args_tuple): @@ -120,25 +114,30 @@ def send_single_request(args_tuple): return sample_id, {"error": str(e)} -def send_requests_parallel(input_ids_list: List[List[int]], server_url: str, +def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, max_tokens: int = 100, max_concurrency: int = 128) -> List[Dict[str, Any]]: """Send all requests to SGLang server in parallel using multiprocessing.""" - num_samples = len(input_ids_list) + num_samples = len(tokenized_df) logger.info( f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") # Prepare arguments for multiprocessing args_list = [ - (input_ids, max_tokens, server_url, i) - for i, input_ids in enumerate(input_ids_list) + (row['tok_input'], max_tokens, server_url, idx) + for idx, row in tokenized_df.iterrows() ] start_time = time.time() - # Use multiprocessing pool + # Use multiprocessing pool with progress bar with Pool(processes=min(max_concurrency, num_samples)) as pool: - # Map the function to all arguments - results = pool.map(send_single_request, args_list) + # Map the function to all arguments with progress bar + results = list(tqdm( + pool.imap(send_single_request, args_list), + total=len(args_list), + desc="Sending requests", + unit="request" + )) # Sort results by sample_id to maintain order results.sort(key=lambda x: x[0]) @@ -152,101 +151,113 @@ def send_requests_parallel(input_ids_list: List[List[int]], server_url: str, return responses -def detokenize_all_responses(responses: List[Dict[str, Any]], input_ids_list: List[List[int]], - tokenizer) -> List[str]: - """Detokenize all responses at once.""" - logger.info("Detokenizing responses...") +def extract_response_texts(responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[str]: + """Extract response texts from SGLang responses.""" + logger.info("Extracting response texts...") response_texts = [] - for i, (response, input_ids) in enumerate(zip(responses, input_ids_list)): + for i, (response, (_, row)) in enumerate(tqdm(zip(responses, tokenized_df.iterrows()), + total=len(responses), + desc="Extracting responses", + unit="response")): response_text = "" - if "error" not in response and "generated_text" in response: + if 
"error" not in response and "text" in response: try: - # Extract generated tokens (excluding input tokens) - generated_tokens = response["generated_text"][len(input_ids):] - response_text = tokenizer.decode( - generated_tokens, skip_special_tokens=True) + # SGLang returns the generated text directly in the 'text' field + response_text = response["text"] except Exception as e: logger.warning( - f"Failed to decode response for sample {i+1}: {e}") + f"Failed to extract response for sample {i+1}: {e}") response_texts.append(response_text) - logger.info("Detokenization complete") + logger.info("Response text extraction complete") return response_texts def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], - text_inputs: List[str], input_ids_list: List[List[int]], - output_file: str) -> None: - """Save all responses to file.""" - logger.info(f"Saving responses to {output_file}...") - - with open(output_file, 'w') as f: - for i, (response, response_text, text_input, input_ids) in enumerate( - zip(responses, response_texts, text_inputs, input_ids_list)): - - response_data = { - "sample_id": int(i), - "text_input": text_input, - "input_length": len(text_input), - "token_length": len(input_ids), - "input_tokens": input_ids, - "response": response, - "response_text": response_text, - "timestamp": float(time.time()) - } - - f.write(json.dumps(response_data) + '\n') - - logger.info(f"Responses saved to {output_file}") - - -def process_requests(data: pd.DataFrame, tokenizer, server_url: str, + tokenized_df: pd.DataFrame, output_file: str = None) -> pd.DataFrame: + """Save all responses to DataFrame and optionally to pickle file.""" + logger.info("Processing responses and updating DataFrame...") + + # Work with the original DataFrame + result_df = tokenized_df.copy() + + # Overwrite existing columns with server response data + result_df['ref_output'] = response_texts + result_df['tok_ref_output'] = response_texts # Same as ref_output for now + result_df['tok_ref_output_len'] = [len(text) for text in response_texts] + + # Calculate output token lengths for logging + output_token_lengths = [] + for i, (response, response_text) in enumerate(zip(responses, response_texts)): + if "error" not in response and "meta_info" in response: + try: + # Use the completion_tokens from meta_info + output_token_lengths.append(response["meta_info"]["completion_tokens"]) + except Exception as e: + logger.warning(f"Failed to calculate output tokens for sample {i+1}: {e}") + output_token_lengths.append(0) + else: + output_token_lengths.append(0) + + logger.info(f"Updated DataFrame with shape: {result_df.shape}") + logger.info(f"Updated columns: ref_output, tok_ref_output, tok_ref_output_len") + logger.info(f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") + + # Save to pickle file if output_file is provided + if output_file: + logger.info(f"Saving responses to {output_file}...") + result_df.to_pickle(output_file) + logger.info(f"Responses saved to {output_file}") + + return result_df + + +def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 128, output_file: str = "responses.jsonl") -> None: - """Main processing function that handles tokenization, requests, and detokenization.""" + max_concurrency: int = 128, output_file: str = None) -> pd.DataFrame: + """Main processing function that handles requests and response extraction.""" - # Step 1: Tokenize all inputs - input_ids_list, 
text_inputs = tokenize_all_inputs( - data, tokenizer, max_samples) + # Step 1: Limit samples if specified + if max_samples is not None: + tokenized_df = tokenized_df.head(max_samples) + logger.info(f"Limited to first {max_samples} samples") # Step 2: Send all requests in parallel responses = send_requests_parallel( - input_ids_list, + tokenized_df, server_url, max_tokens, max_concurrency) - # Step 3: Detokenize all responses - response_texts = detokenize_all_responses( - responses, input_ids_list, tokenizer) + # Step 3: Extract response texts + response_texts = extract_response_texts(responses, tokenized_df) - # Step 4: Save all results - save_responses( + # Step 4: Save all results and return DataFrame + result_df = save_responses( responses, response_texts, - text_inputs, - input_ids_list, + tokenized_df, output_file) + + return result_df def main(): parser = argparse.ArgumentParser( - description="Send text requests to SGLang server with tokenization") - parser.add_argument("--data-file", default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl", - help="Path to pickle file containing text data") - parser.add_argument("--model-name", required=True, - help="Model name for tokenizer (e.g., openai/gpt-oss-120b)") + description="Send pre-tokenized requests to SGLang server") + parser.add_argument("--input-tokens", required=True, + help="Path to pickle file containing pre-tokenized data from harmony-tokens.py") parser.add_argument("--server-url", default="http://localhost:30000", help="SGLang server URL (default: http://localhost:30000)") parser.add_argument("--max-samples", type=int, default=None, help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate per request") - parser.add_argument("--max-concurrency", type=int, default=128, + parser.add_argument("--max-concurrency", type=int, default=256, help="Maximum number of concurrent requests (default: 128)") - parser.add_argument("--output", default="responses.jsonl", - help="Output file for responses") + parser.add_argument("--output", default=None, + help="Output pickle file for responses (optional)") args = parser.parse_args() @@ -262,14 +273,25 @@ def main(): return logger.info("Server connection successful") - data = load_text_data(args.data_file) - tokenizer = load_tokenizer(args.model_name) - - process_requests(data, tokenizer, args.server_url, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - max_concurrency=args.max_concurrency, - output_file=args.output) + # Load pre-tokenized data + tokenized_df = load_tokenized_data(args.input_tokens) + + # Process requests and get result DataFrame + result_df = process_requests(tokenized_df, args.server_url, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, + output_file=args.output) + + # Print summary + logger.info(f"\nProcessing completed:") + logger.info(f" - Total samples processed: {len(result_df)}") + logger.info(f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") + logger.info(f" - Average output text length: {result_df['tok_ref_output_len'].mean():.1f}") + if args.output: + logger.info(f" - Results saved to: {args.output}") + else: + logger.info(" - Results returned as DataFrame (not saved to file)") if __name__ == "__main__": From f3a328264bb9dec111e458c70f02001a9255bc99 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:29:28 +0000 Subject: 
[PATCH 023/165] add harmonize-tokens.py --- .../gpt-oss/prepare-inputs/harmony-tokens.py | 539 ++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 language/gpt-oss/prepare-inputs/harmony-tokens.py diff --git a/language/gpt-oss/prepare-inputs/harmony-tokens.py b/language/gpt-oss/prepare-inputs/harmony-tokens.py new file mode 100644 index 0000000000..af41ec64da --- /dev/null +++ b/language/gpt-oss/prepare-inputs/harmony-tokens.py @@ -0,0 +1,539 @@ +#!/usr/bin/env python3 +""" +Multi-Shot Prompting with OpenAI Harmony Format + +This example demonstrates how to construct and tokenize a zero/multi-shot prompt using the +openai-harmony Python package for math500, aime1983, livecodebench, mmlu, gpqa with gpt-oss model series. +""" + +import argparse +import json +import os +import pandas as pd +from multiprocessing import Pool, cpu_count +from tqdm import tqdm +from openai_harmony import ( + load_harmony_encoding, + HarmonyEncodingName, + Role, + Message, + Conversation, + SystemContent, + DeveloperContent, + ReasoningEffort +) + +def create_math500_prompt(user_query): + """ + Creates a multi-shot prompt for mathematical problem solving using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Create the system message with required channels + system_message = ( + SystemContent.new() + .with_reasoning_effort(ReasoningEffort.HIGH) + .with_conversation_start_date("2025-09-30") + .with_required_channels(["analysis", "commentary", "final"]) + ) + + # Create the developer message with instructions + developer_message = ( + DeveloperContent.new() + .with_instructions( + "You are a math expert that solves problems step-by-step. " + "Always show your work clearly and put your final answer in \\boxed{answer} format. " + "Follow the format shown in the examples below." + ) + ) + + # Create the conversation with multi-shot examples + messages = [ + # System and developer setup + Message.from_role_and_content(Role.SYSTEM, system_message), + Message.from_role_and_content(Role.DEVELOPER, developer_message), + + # Example 1: Square areas and side lengths + Message.from_role_and_content( + Role.USER, + "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\\n\\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" + ).with_channel("final"), + + # Example 2: Floor function equation + Message.from_role_and_content( + Role.USER, + "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. 
Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\\n\\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\\n\\[2n = n + 3,\\]which means $n = 3.$\\n\\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\\n\\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" + ).with_channel("final"), + + # Example 3: Sequences and differences + Message.from_role_and_content( + Role.USER, + "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\\n\\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\\n\\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$" + ).with_channel("final"), + + # Example 4: Probability and Deal or No Deal + Message.from_role_and_content( + Role.USER, + "In the game Deal or No Deal, participants choose a box at random from a set of $26,$ one containing each of the following values: \\begin{tabular}{|c|c|}\\hline\\$.01&\\$1,000\\\\\\hline\\$1&\\$5,000\\\\\\hline\\$5&\\$10,000\\\\\\hline\\$10&\\$25,000\\\\\\hline\\$25&\\$50,000\\\\\\hline\\$50&\\$75,000\\\\\\hline\\$75&\\$100,000\\\\\\hline\\$100&\\$200,000\\\\\\hline\\$200&\\$300,000\\\\\\hline\\$300&\\$400,000\\\\\\hline\\$400&\\$500,000\\\\\\hline\\$500&\\$750,000\\\\\\hline\\$750&\\$1,000,000\\\\\\hline\\end{tabular} After choosing a box, participants eliminate other boxes by opening them, showing the amount of money in the box to the crowd, and then removing that box (and its money!) from the game. What is the minimum number of boxes a participant needs to eliminate in order to have a half chance of holding at least $\\$100,\\!000$ as his or her chosen box?" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Seven of the boxes contain at least this amount. If a participant is going to be holding one of these boxes with a probability of $1/2,$ there can be at most $7$ other boxes left. This means that at least $26-7-7=\\boxed{12}$ boxes must be eliminated." 
+ ).with_channel("final"), + + # Example 5: Domain of composite function + Message.from_role_and_content( + Role.USER, + "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\\n\\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" + ).with_channel("final"), + + # The actual problem to solve + Message.from_role_and_content( + Role.USER, + user_query, + ) + ] + + # Create the conversation + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + +def create_aime1983_prompt(user_query): + """ + Creates a zero-shot prompt for mathematical problem solving using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Create the system message with required channels + system_message = ( + SystemContent.new() + .with_reasoning_effort(ReasoningEffort.HIGH) + .with_conversation_start_date("2025-09-30") + .with_required_channels(["analysis", "commentary", "final"]) + ) + + # Create the developer message with instructions + developer_message = ( + DeveloperContent.new() + .with_instructions( + "You are a math expert that solves problems step-by-step. " + "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}." + ) + ) + + messages = [ + # System and developer setup + Message.from_role_and_content(Role.SYSTEM, system_message), + Message.from_role_and_content(Role.DEVELOPER, developer_message), + Message.from_role_and_content(Role.USER, user_query), + ] + + # Create the conversation + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + +def create_livecodebench_prompt(user_query): + """ + Creates a zero-shot prompt for livecodebench problem solving using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Create the system message with required channels + system_message = ( + SystemContent.new() + .with_reasoning_effort(ReasoningEffort.HIGH) + .with_conversation_start_date("2025-09-30") + .with_required_channels(["analysis", "commentary", "final"]) + ) + + # Create the developer message with instructions + developer_message = ( + DeveloperContent.new() + .with_instructions( + "You are a python coding expert that solves problems step-by-step. " + "You must provide the reasoning to arriving at your solution and the code to solve the problem." + "The code should be enclosed within ```python delimiters." 
+ ) + ) + + messages = [ + # System and developer setup + Message.from_role_and_content(Role.SYSTEM, system_message), + Message.from_role_and_content(Role.DEVELOPER, developer_message), + Message.from_role_and_content(Role.USER, user_query), + ] + + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + +def create_mmlu_prompt(user_query): + """ + Creates a multi-shot prompt for multiple choice question answering using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Create the system message with required channels + system_message = ( + SystemContent.new() + .with_reasoning_effort(ReasoningEffort.HIGH) + .with_conversation_start_date("2025-09-30") + .with_required_channels(["analysis", "commentary", "final"]) + ) + + # Create the developer message with instructions + developer_message = ( + DeveloperContent.new() + .with_instructions( + "You are an expert test-taker that answers multiple choice questions accurately. " + "Follow the format shown in the examples below. " + "The last line of your response should be in the following format: 'Answer: A/B/C/...' (e.g. 'Answer: A')." + ) + ) + + # Create the conversation with multi-shot examples + messages = [ + # System and developer setup + Message.from_role_and_content(Role.SYSTEM, system_message), + Message.from_role_and_content(Role.DEVELOPER, developer_message), + + # Example 1: Abstract Algebra - Ring Theory + Message.from_role_and_content( + Role.USER, + "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\\nFind the characteristic of the ring 2Z.\\nA) 0\\nB) 30\\nC) 3\\nD) 10\\nE) 12\\nF) 50\\nG) 2\\nH) 100\\nI) 20\\nJ) 5" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "The characteristic of a ring R is the smallest positive integer n such that n·1_R = 0, or 0 if no such positive integer exists.\\n\\nFor the ring 2Z (even integers under addition and multiplication), we need to find the smallest positive integer n such that n·2 = 0 in 2Z. However, since 2Z contains all even integers, and n·2 is always even for any positive integer n, we have n·2 ≠ 0 for any positive n in 2Z.\\n\\nTherefore, the characteristic of 2Z is 0.\\n\\nAnswer: A" + ).with_channel("final"), + + # Example 2: Linear Algebra - Transformations + Message.from_role_and_content( + Role.USER, + "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). 
Which of the following is true?\\nA) ST + TS is the identity map of V onto itself.\\nB) TS = 0\\nC) ST = 1\\nD) ST - TS = 0\\nE) ST = T\\nF) ST = 0\\nG) ST = TS\\nH) ST - TS is the identity map of V onto itself.\\nI) TS = T\\nJ) ST = S" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Let's compute ST and TS for a general polynomial p(x).\\n\\nFor ST: (ST)(p(x)) = S(T(p(x))) = S(xp(x)) = d/dx(xp(x)) = p(x) + xp'(x)\\n\\nFor TS: (TS)(p(x)) = T(S(p(x))) = T(p'(x)) = xp'(x)\\n\\nNow let's find ST - TS:\\n(ST - TS)(p(x)) = (p(x) + xp'(x)) - xp'(x) = p(x)\\n\\nSince (ST - TS)(p(x)) = p(x) for any polynomial p(x), we have ST - TS = I, where I is the identity transformation.\\n\\nAnswer: H" + ).with_channel("final"), + + # Example 3: Number Theory - Diophantine Equations + Message.from_role_and_content( + Role.USER, + "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\\nA) -5\\nB) 0\\nC) -3\\nD) -7\\nE) -4\\nF) -6\\nG) -1\\nH) -2\\nI) -9\\nJ) N/A" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "First, I need to find the general solution to 7m + 12n = 22.\\n\\nUsing the extended Euclidean algorithm: gcd(7,12) = 1, so solutions exist.\\nA particular solution is m₀ = 10, n₀ = -4 (since 7(10) + 12(-4) = 70 - 48 = 22).\\n\\nThe general solution is:\\nm = 10 + 12t\\nn = -4 - 7t\\nfor integer t.\\n\\nSo m + n = (10 + 12t) + (-4 - 7t) = 6 + 5t\\n\\nFor different values of t:\\nt = 0: m + n = 6\\nt = -1: m + n = 1\\nt = -2: m + n = -4\\nt = -3: m + n = -9\\nt = 1: m + n = 11\\n\\nThe greatest negative number is -4.\\n\\nAnswer: E" + ).with_channel("final"), + + # Example 4: Differential Equations - Salt Tank Problem + Message.from_role_and_content( + Role.USER, + "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\\nA) 3 + e^-2\\nB) 2 - e^-4\\nC) 2 - e^-2\\nD) 3 + e^-4\\nE) 2 + e^-3\\nF) 2 - e^-3\\nG) 3 - e^-2\\nH) 2 + e^-2\\nI) 2 + e^-4\\nJ) 2" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Let S(t) be the amount of salt at time t. The tank volume remains constant at 100L since inflow = outflow = 4 L/min.\\n\\nSalt enters at rate: 4 L/min × 0.02 g/L = 0.08 g/min\\nSalt leaves at rate: 4 L/min × S(t)/100 g/L = 0.04S(t) g/min\\n\\nDifferential equation: dS/dt = 0.08 - 0.04S(t)\\n\\nRearranging: dS/dt + 0.04S = 0.08\\n\\nThis is first-order linear. The integrating factor is e^(0.04t).\\n\\nSolution: S(t) = 2 + Ce^(-0.04t)\\n\\nUsing initial condition S(0) = 3:\\n3 = 2 + C, so C = 1\\n\\nTherefore: S(t) = 2 + e^(-0.04t)\\n\\nAt t = 100: S(100) = 2 + e^(-4) = 2 + e^-4\\n\\nAnswer: I" + ).with_channel("final"), + + # Example 5: Basic Arithmetic - Division + Message.from_role_and_content( + Role.USER, + "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. 
Which statement correctly explains how to find the number of teams needed?\\nA) Multiply 5 by 5 to find 25 teams.\\nB) Divide 30 by 5 to find 6 teams.\\nC) Add 5 to 30 to find 35 teams.\\nD) Subtract 30 from 5 to find -25 teams.\\nE) Divide 5 by 30 to find 0.1667 teams.\\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\\nG) N/A\\nH) N/A\\nI) N/A\\nJ) N/A" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "To find the number of teams when we know the total number of players and the number of players per team, we need to divide the total number of players by the number of players per team.\\n\\nTotal players = 30\\nPlayers per team = 5\\n\\nNumber of teams = Total players ÷ Players per team = 30 ÷ 5 = 6\\n\\nTherefore, we divide 30 by 5 to find 6 teams.\\n\\nAnswer: B" + ).with_channel("final"), + + # The actual problem to solve + Message.from_role_and_content( + Role.USER, + user_query, + ) + ] + + # Create the conversation + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + +def create_gpqa_prompt(user_query): + """ + Creates a multi-shot prompt for organic chemistry and biochemistry question answering using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Create the system message with required channels + system_message = ( + SystemContent.new() + .with_reasoning_effort(ReasoningEffort.HIGH) + .with_conversation_start_date("2025-09-30") + .with_required_channels(["analysis", "commentary", "final"]) + ) + + # Create the developer message with instructions + developer_message = ( + DeveloperContent.new() + .with_instructions( + "You are an expert in organic chemistry and biochemistry who answers scientific questions accurately. " + "Provide detailed explanations showing your reasoning, then conclude with the answer. " + "The last line of your response should be in the following format: 'Answer: A/B/C/D' (e.g. 'Answer: A')." + ) + ) + + # Create the conversation with multi-shot examples + messages = [ + # System and developer setup + Message.from_role_and_content(Role.SYSTEM, system_message), + Message.from_role_and_content(Role.DEVELOPER, developer_message), + + # Example 1: Molecular Biology - Gene Therapy + Message.from_role_and_content( + Role.USER, + "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\\nA) polyA tail\\nB) lariat\\nC) antisense\\nD) R-loops" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Explanation: The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. 
Let me analyze each option:\\n\\n- Morpholino is bound to the pre-mRNA in an antisense orientation (C is involved)\\n- Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded (B is involved)\\n- The spliced RNA is polyadenylated at the 3' end with a polyA tail (A is involved)\\n- R-loops are triple helix structures of DNA and pre-mRNA that are a consequence of RNA transcription, not splicing and RNA maturation\\n\\nR-loops are not involved in the proposed exon-skipping therapy mechanism.\\n\\nAnswer: D" + ).with_channel("final"), + + # Example 2: Stereochemistry - Optical Activity + Message.from_role_and_content( + Role.USER, + "How many of the following compounds exhibit optical activity?\\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\\n2,3,3,3-tetrafluoroprop-1-ene\\ndi(cyclohex-2-en-1-ylidene)methane\\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\\n[1,1'-biphenyl]-3,3'-diol\\n8,8-dichlorobicyclo[4.2.0]octan-7-one\\ncyclopent-2-en-1-one\\nA) 6\\nB) 5\\nC) 4\\nD) 3" + ), + Message.from_role_and_content( + Role.ASSISTANT, + "Explanation: To exhibit optical activity, a compound must be chiral (lack a mirror plane of symmetry). Let me analyze each compound:\\n\\nOptically active (chiral) compounds:\\n- 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene: has chiral center\\n- di(cyclohex-2-en-1-ylidene)methane: no plane of symmetry\\n- 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene: chiral\\n- 8,8-dichlorobicyclo[4.2.0]octan-7-one: chiral bicyclic structure\\n\\nOptically inactive (achiral) compounds:\\n- 2,3,3,3-tetrafluoroprop-1-ene: has mirror plane\\n- 5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene: has mirror plane\\n- [1,1'-biphenyl]-3,3'-diol: has mirror plane\\n- cyclopent-2-en-1-one: has mirror plane\\n\\nTherefore, 4 compounds exhibit optical activity.\\n\\nAnswer: C" + ).with_channel("final"), + + # The actual problem to solve + Message.from_role_and_content( + Role.USER, + user_query, + ) + ] + + # Create the conversation + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + + +def process_row(args): + """ + Worker function to process a single row from the dataframe. 
+ + Args: + args: tuple of (index, row, dataset_function_map) + + Returns: + tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) + """ + index, row, dataset_function_map = args + dataset_name = row["dataset"] + user_query = row["question"] + + try: + # Get the appropriate function based on dataset type + if dataset_name in dataset_function_map: + create_prompt_func = dataset_function_map[dataset_name] + convo, tokens = create_prompt_func(user_query) + return (index, convo, tokens, dataset_name) + else: + error_msg = f"Unknown dataset '{dataset_name}' at index {index}" + return (index, None, None, dataset_name, error_msg) + except Exception as e: + error_msg = f"Error processing {dataset_name} at index {index}: {str(e)}" + return (index, None, None, dataset_name, error_msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data-file", type=str, default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl") + parser.add_argument("--num-processes", type=int, default=cpu_count(), + help="Number of processes to use for parallel processing (default: number of CPU cores)") + parser.add_argument("--max-rows", type=int, default=None, + help="Maximum number of rows to process (default: process all rows)") + parser.add_argument("--output-file", type=str, required=True, + help="Output pickle file path to save the processed data as pandas DataFrame") + args = parser.parse_args() + df = pd.read_pickle(args.data_file) + + # Apply row limit if specified + if args.max_rows is not None: + df = df.head(args.max_rows) + print(f"Limited to first {args.max_rows} rows") + + # Create mapping from dataset names to prompt creation functions + dataset_function_map = { + 'aime1983': create_aime1983_prompt, + 'gpqa': create_gpqa_prompt, + 'livecodebench': create_livecodebench_prompt, + 'math500': create_math500_prompt, + 'mmlu_pro': create_mmlu_prompt + } + + # Prepare data for parallel processing + process_args = [(index, row, dataset_function_map) for index, row in df.iterrows()] + + # Don't use more processes than we have rows + num_processes = min(args.num_processes, len(df)) + + print(f"Processing {len(df)} queries using {num_processes} processes...") + + # Process rows in parallel with progress bar + with Pool(processes=num_processes) as pool: + results = list(tqdm( + pool.imap(process_row, process_args), + total=len(process_args), + desc="Processing queries", + unit="query" + )) + + # Sort results by index to preserve original order + results.sort(key=lambda x: x[0]) + + # Ensure output directory exists + output_dir = os.path.dirname(args.output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Process results and modify original DataFrame + successful_count = 0 + error_count = 0 + + # Set reference columns to None + df['ref_accuracy'] = None + df['ref_extracted_answer'] = None + df['ref_output'] = None + df['tok_ref_output'] = None + df['tok_ref_output_len'] = None + + # Set templated text input to None (if it exists) + if 'templated_text_input' in df.columns: + df['templated_text_input'] = None + + # Initialize columns for harmony tokenized input + df['tok_input'] = None + df['tok_input_len'] = None + df['text_input'] = None + + for result in tqdm(results, desc="Processing results", unit="entry"): + if len(result) == 5: # Error case + index, convo, tokens, dataset_name, error_msg = result + error_count += 1 + + # Update the original DataFrame with 
error data + df.at[index, 'tok_input'] = None + df.at[index, 'tok_input_len'] = None + df.at[index, 'text_input'] = None + else: # Success case + index, convo, tokens, dataset_name = result + successful_count += 1 + + # Convert conversation to string format + conversation_parts = [] + for message in convo.messages: + # Get role from message.author.role + role = message.author.role.value if hasattr(message.author.role, 'value') else str(message.author.role) + + # Get content from message.content (which is a list) + content_parts = [] + for content_item in message.content: + if hasattr(content_item, 'text'): + content_parts.append(content_item.text) + else: + content_parts.append(str(content_item)) + content = ' '.join(content_parts) + + # Format as "Role: content" + conversation_parts.append(f"{role}: {content}") + + conversation_string = '\n'.join(conversation_parts) + + # Update the original DataFrame with successful data + df.at[index, 'tok_input'] = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens) + df.at[index, 'tok_input_len'] = len(tokens) if hasattr(tokens, '__len__') else 0 + df.at[index, 'text_input'] = conversation_string + + # Verify input and output have identical column lists + input_columns = list(pd.read_pickle(args.data_file).columns) + output_columns = list(df.columns) + if input_columns != output_columns: + print(f"WARNING: Column lists differ!") + print(f"Input columns: {input_columns}") + print(f"Output columns: {output_columns}") + + # Save the modified DataFrame as pickle + print("Saving modified DataFrame to pickle...") + df.to_pickle(args.output_file) + + print(f"\nProcessing completed:") + print(f" - Successfully processed: {successful_count} queries") + print(f" - Errors: {error_count} queries") + print(f" - Output saved to: {args.output_file} (pickle format)") + print(f" - Total processed: {successful_count + error_count} queries") + print(f" - Modified DataFrame shape: {df.shape}") + print(f" - Updated columns: tok_input, tok_input_len, text_input") + print(f" - Set to None: ref_accuracy, ref_extracted_answer, ref_output, tok_ref_output, tok_ref_output_len, templated_text_input") \ No newline at end of file From 8596b13307c671b4862dca3a27eb36e6f52f3d23 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:30:29 +0000 Subject: [PATCH 024/165] move things --- .../{prepare-inputs/harmony-tokens.py => harmonize-inputs.py} | 0 language/gpt-oss/{infer-sglang/send_requests.py => run_infer.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{prepare-inputs/harmony-tokens.py => harmonize-inputs.py} (100%) rename language/gpt-oss/{infer-sglang/send_requests.py => run_infer.py} (100%) diff --git a/language/gpt-oss/prepare-inputs/harmony-tokens.py b/language/gpt-oss/harmonize-inputs.py similarity index 100% rename from language/gpt-oss/prepare-inputs/harmony-tokens.py rename to language/gpt-oss/harmonize-inputs.py diff --git a/language/gpt-oss/infer-sglang/send_requests.py b/language/gpt-oss/run_infer.py similarity index 100% rename from language/gpt-oss/infer-sglang/send_requests.py rename to language/gpt-oss/run_infer.py From 1db2f96d926ba47e1bb2a15c169e68731d8cc729 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 1 Oct 2025 11:30:55 +0000 Subject: [PATCH 025/165] [Automated Commit] Format Codebase --- language/gpt-oss/harmonize-inputs.py | 181 ++++++++++++++------------- language/gpt-oss/run_infer.py | 75 ++++++----- 2 files changed, 140 insertions(+), 116 deletions(-) diff --git 
a/language/gpt-oss/harmonize-inputs.py b/language/gpt-oss/harmonize-inputs.py index af41ec64da..93e064662b 100644 --- a/language/gpt-oss/harmonize-inputs.py +++ b/language/gpt-oss/harmonize-inputs.py @@ -2,7 +2,7 @@ """ Multi-Shot Prompting with OpenAI Harmony Format -This example demonstrates how to construct and tokenize a zero/multi-shot prompt using the +This example demonstrates how to construct and tokenize a zero/multi-shot prompt using the openai-harmony Python package for math500, aime1983, livecodebench, mmlu, gpqa with gpt-oss model series. """ @@ -23,6 +23,7 @@ ReasoningEffort ) + def create_math500_prompt(user_query): """ Creates a multi-shot prompt for mathematical problem solving using Harmony format. @@ -30,10 +31,10 @@ def create_math500_prompt(user_query): Returns: tuple: (conversation_object, token_list) ready for model completion """ - + # Load the Harmony encoding for gpt-oss models enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - + # Create the system message with required channels system_message = ( SystemContent.new() @@ -41,7 +42,7 @@ def create_math500_prompt(user_query): .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) - + # Create the developer message with instructions developer_message = ( DeveloperContent.new() @@ -51,23 +52,23 @@ def create_math500_prompt(user_query): "Follow the format shown in the examples below." ) ) - + # Create the conversation with multi-shot examples messages = [ # System and developer setup Message.from_role_and_content(Role.SYSTEM, system_message), Message.from_role_and_content(Role.DEVELOPER, developer_message), - + # Example 1: Square areas and side lengths Message.from_role_and_content( - Role.USER, + Role.USER, "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?" ), Message.from_role_and_content( Role.ASSISTANT, "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\\n\\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" ).with_channel("final"), - + # Example 2: Floor function equation Message.from_role_and_content( Role.USER, @@ -77,7 +78,7 @@ def create_math500_prompt(user_query): Role.ASSISTANT, "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. 
Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\\n\\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\\n\\[2n = n + 3,\\]which means $n = 3.$\\n\\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\\n\\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" ).with_channel("final"), - + # Example 3: Sequences and differences Message.from_role_and_content( Role.USER, @@ -87,7 +88,7 @@ def create_math500_prompt(user_query): Role.ASSISTANT, "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$" ).with_channel("final"), - + # Example 4: Probability and Deal or No Deal Message.from_role_and_content( Role.USER, @@ -97,7 +98,7 @@ def create_math500_prompt(user_query): Role.ASSISTANT, "Seven of the boxes contain at least this amount. If a participant is going to be holding one of these boxes with a probability of $1/2,$ there can be at most $7$ other boxes left. This means that at least $26-7-7=\\boxed{12}$ boxes must be eliminated." ).with_channel("final"), - + # Example 5: Domain of composite function Message.from_role_and_content( Role.USER, @@ -107,33 +108,34 @@ def create_math500_prompt(user_query): Role.ASSISTANT, "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\\n\\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" ).with_channel("final"), - + # The actual problem to solve Message.from_role_and_content( Role.USER, user_query, ) ] - + # Create the conversation convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens + def create_aime1983_prompt(user_query): """ Creates a zero-shot prompt for mathematical problem solving using Harmony format. 
- + Returns: tuple: (conversation_object, token_list) ready for model completion """ - + # Load the Harmony encoding for gpt-oss models enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - + # Create the system message with required channels system_message = ( SystemContent.new() @@ -141,7 +143,7 @@ def create_aime1983_prompt(user_query): .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) - + # Create the developer message with instructions developer_message = ( DeveloperContent.new() @@ -160,16 +162,17 @@ def create_aime1983_prompt(user_query): # Create the conversation convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens + def create_livecodebench_prompt(user_query): """ Creates a zero-shot prompt for livecodebench problem solving using Harmony format. - + Returns: tuple: (conversation_object, token_list) ready for model completion """ @@ -203,23 +206,24 @@ def create_livecodebench_prompt(user_query): ] convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens + def create_mmlu_prompt(user_query): """ Creates a multi-shot prompt for multiple choice question answering using Harmony format. - + Returns: tuple: (conversation_object, token_list) ready for model completion """ - + # Load the Harmony encoding for gpt-oss models enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - + # Create the system message with required channels system_message = ( SystemContent.new() @@ -227,7 +231,7 @@ def create_mmlu_prompt(user_query): .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) - + # Create the developer message with instructions developer_message = ( DeveloperContent.new() @@ -237,23 +241,23 @@ def create_mmlu_prompt(user_query): "The last line of your response should be in the following format: 'Answer: A/B/C/...' (e.g. 'Answer: A')." ) ) - + # Create the conversation with multi-shot examples messages = [ # System and developer setup Message.from_role_and_content(Role.SYSTEM, system_message), Message.from_role_and_content(Role.DEVELOPER, developer_message), - + # Example 1: Abstract Algebra - Ring Theory Message.from_role_and_content( - Role.USER, + Role.USER, "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\\nFind the characteristic of the ring 2Z.\\nA) 0\\nB) 30\\nC) 3\\nD) 10\\nE) 12\\nF) 50\\nG) 2\\nH) 100\\nI) 20\\nJ) 5" ), Message.from_role_and_content( Role.ASSISTANT, "The characteristic of a ring R is the smallest positive integer n such that n·1_R = 0, or 0 if no such positive integer exists.\\n\\nFor the ring 2Z (even integers under addition and multiplication), we need to find the smallest positive integer n such that n·2 = 0 in 2Z. 
However, since 2Z contains all even integers, and n·2 is always even for any positive integer n, we have n·2 ≠ 0 for any positive n in 2Z.\\n\\nTherefore, the characteristic of 2Z is 0.\\n\\nAnswer: A" ).with_channel("final"), - + # Example 2: Linear Algebra - Transformations Message.from_role_and_content( Role.USER, @@ -263,7 +267,7 @@ def create_mmlu_prompt(user_query): Role.ASSISTANT, "Let's compute ST and TS for a general polynomial p(x).\\n\\nFor ST: (ST)(p(x)) = S(T(p(x))) = S(xp(x)) = d/dx(xp(x)) = p(x) + xp'(x)\\n\\nFor TS: (TS)(p(x)) = T(S(p(x))) = T(p'(x)) = xp'(x)\\n\\nNow let's find ST - TS:\\n(ST - TS)(p(x)) = (p(x) + xp'(x)) - xp'(x) = p(x)\\n\\nSince (ST - TS)(p(x)) = p(x) for any polynomial p(x), we have ST - TS = I, where I is the identity transformation.\\n\\nAnswer: H" ).with_channel("final"), - + # Example 3: Number Theory - Diophantine Equations Message.from_role_and_content( Role.USER, @@ -273,7 +277,7 @@ def create_mmlu_prompt(user_query): Role.ASSISTANT, "First, I need to find the general solution to 7m + 12n = 22.\\n\\nUsing the extended Euclidean algorithm: gcd(7,12) = 1, so solutions exist.\\nA particular solution is m₀ = 10, n₀ = -4 (since 7(10) + 12(-4) = 70 - 48 = 22).\\n\\nThe general solution is:\\nm = 10 + 12t\\nn = -4 - 7t\\nfor integer t.\\n\\nSo m + n = (10 + 12t) + (-4 - 7t) = 6 + 5t\\n\\nFor different values of t:\\nt = 0: m + n = 6\\nt = -1: m + n = 1\\nt = -2: m + n = -4\\nt = -3: m + n = -9\\nt = 1: m + n = 11\\n\\nThe greatest negative number is -4.\\n\\nAnswer: E" ).with_channel("final"), - + # Example 4: Differential Equations - Salt Tank Problem Message.from_role_and_content( Role.USER, @@ -283,7 +287,7 @@ def create_mmlu_prompt(user_query): Role.ASSISTANT, "Let S(t) be the amount of salt at time t. The tank volume remains constant at 100L since inflow = outflow = 4 L/min.\\n\\nSalt enters at rate: 4 L/min × 0.02 g/L = 0.08 g/min\\nSalt leaves at rate: 4 L/min × S(t)/100 g/L = 0.04S(t) g/min\\n\\nDifferential equation: dS/dt = 0.08 - 0.04S(t)\\n\\nRearranging: dS/dt + 0.04S = 0.08\\n\\nThis is first-order linear. The integrating factor is e^(0.04t).\\n\\nSolution: S(t) = 2 + Ce^(-0.04t)\\n\\nUsing initial condition S(0) = 3:\\n3 = 2 + C, so C = 1\\n\\nTherefore: S(t) = 2 + e^(-0.04t)\\n\\nAt t = 100: S(100) = 2 + e^(-4) = 2 + e^-4\\n\\nAnswer: I" ).with_channel("final"), - + # Example 5: Basic Arithmetic - Division Message.from_role_and_content( Role.USER, @@ -293,33 +297,34 @@ def create_mmlu_prompt(user_query): Role.ASSISTANT, "To find the number of teams when we know the total number of players and the number of players per team, we need to divide the total number of players by the number of players per team.\\n\\nTotal players = 30\\nPlayers per team = 5\\n\\nNumber of teams = Total players ÷ Players per team = 30 ÷ 5 = 6\\n\\nTherefore, we divide 30 by 5 to find 6 teams.\\n\\nAnswer: B" ).with_channel("final"), - + # The actual problem to solve Message.from_role_and_content( Role.USER, user_query, ) ] - + # Create the conversation convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens + def create_gpqa_prompt(user_query): """ Creates a multi-shot prompt for organic chemistry and biochemistry question answering using Harmony format. 
- + Returns: tuple: (conversation_object, token_list) ready for model completion """ - + # Load the Harmony encoding for gpt-oss models enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - + # Create the system message with required channels system_message = ( SystemContent.new() @@ -327,7 +332,7 @@ def create_gpqa_prompt(user_query): .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) - + # Create the developer message with instructions developer_message = ( DeveloperContent.new() @@ -337,23 +342,23 @@ def create_gpqa_prompt(user_query): "The last line of your response should be in the following format: 'Answer: A/B/C/D' (e.g. 'Answer: A')." ) ) - + # Create the conversation with multi-shot examples messages = [ # System and developer setup Message.from_role_and_content(Role.SYSTEM, system_message), Message.from_role_and_content(Role.DEVELOPER, developer_message), - + # Example 1: Molecular Biology - Gene Therapy Message.from_role_and_content( - Role.USER, + Role.USER, "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\\nA) polyA tail\\nB) lariat\\nC) antisense\\nD) R-loops" ), Message.from_role_and_content( Role.ASSISTANT, "Explanation: The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Let me analyze each option:\\n\\n- Morpholino is bound to the pre-mRNA in an antisense orientation (C is involved)\\n- Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded (B is involved)\\n- The spliced RNA is polyadenylated at the 3' end with a polyA tail (A is involved)\\n- R-loops are triple helix structures of DNA and pre-mRNA that are a consequence of RNA transcription, not splicing and RNA maturation\\n\\nR-loops are not involved in the proposed exon-skipping therapy mechanism.\\n\\nAnswer: D" ).with_channel("final"), - + # Example 2: Stereochemistry - Optical Activity Message.from_role_and_content( Role.USER, @@ -363,37 +368,37 @@ def create_gpqa_prompt(user_query): Role.ASSISTANT, "Explanation: To exhibit optical activity, a compound must be chiral (lack a mirror plane of symmetry). 
Let me analyze each compound:\\n\\nOptically active (chiral) compounds:\\n- 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene: has chiral center\\n- di(cyclohex-2-en-1-ylidene)methane: no plane of symmetry\\n- 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene: chiral\\n- 8,8-dichlorobicyclo[4.2.0]octan-7-one: chiral bicyclic structure\\n\\nOptically inactive (achiral) compounds:\\n- 2,3,3,3-tetrafluoroprop-1-ene: has mirror plane\\n- 5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene: has mirror plane\\n- [1,1'-biphenyl]-3,3'-diol: has mirror plane\\n- cyclopent-2-en-1-one: has mirror plane\\n\\nTherefore, 4 compounds exhibit optical activity.\\n\\nAnswer: C" ).with_channel("final"), - + # The actual problem to solve Message.from_role_and_content( Role.USER, user_query, ) ] - + # Create the conversation convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens def process_row(args): """ Worker function to process a single row from the dataframe. - + Args: args: tuple of (index, row, dataset_function_map) - + Returns: tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) """ index, row, dataset_function_map = args dataset_name = row["dataset"] user_query = row["question"] - + try: # Get the appropriate function based on dataset type if dataset_name in dataset_function_map: @@ -410,21 +415,24 @@ def process_row(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--data-file", type=str, default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl") - parser.add_argument("--num-processes", type=int, default=cpu_count(), - help="Number of processes to use for parallel processing (default: number of CPU cores)") + parser.add_argument( + "--data-file", + type=str, + default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl") + parser.add_argument("--num-processes", type=int, default=cpu_count(), + help="Number of processes to use for parallel processing (default: number of CPU cores)") parser.add_argument("--max-rows", type=int, default=None, - help="Maximum number of rows to process (default: process all rows)") + help="Maximum number of rows to process (default: process all rows)") parser.add_argument("--output-file", type=str, required=True, - help="Output pickle file path to save the processed data as pandas DataFrame") + help="Output pickle file path to save the processed data as pandas DataFrame") args = parser.parse_args() df = pd.read_pickle(args.data_file) - + # Apply row limit if specified if args.max_rows is not None: df = df.head(args.max_rows) print(f"Limited to first {args.max_rows} rows") - + # Create mapping from dataset names to prompt creation functions dataset_function_map = { 'aime1983': create_aime1983_prompt, @@ -433,15 +441,16 @@ def process_row(args): 'math500': create_math500_prompt, 'mmlu_pro': create_mmlu_prompt } - + # Prepare data for parallel processing - process_args = [(index, row, dataset_function_map) for index, row in df.iterrows()] + process_args = [(index, row, dataset_function_map) + for index, row in df.iterrows()] # Don't use more processes than we have rows num_processes = min(args.num_processes, len(df)) - + print(f"Processing {len(df)} queries using {num_processes} processes...") - + # Process rows in parallel with progress bar with 
Pool(processes=num_processes) as pool: results = list(tqdm( @@ -450,40 +459,40 @@ def process_row(args): desc="Processing queries", unit="query" )) - + # Sort results by index to preserve original order results.sort(key=lambda x: x[0]) - + # Ensure output directory exists output_dir = os.path.dirname(args.output_file) if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir) - + # Process results and modify original DataFrame successful_count = 0 error_count = 0 - + # Set reference columns to None df['ref_accuracy'] = None df['ref_extracted_answer'] = None df['ref_output'] = None df['tok_ref_output'] = None df['tok_ref_output_len'] = None - + # Set templated text input to None (if it exists) if 'templated_text_input' in df.columns: df['templated_text_input'] = None - + # Initialize columns for harmony tokenized input df['tok_input'] = None df['tok_input_len'] = None df['text_input'] = None - + for result in tqdm(results, desc="Processing results", unit="entry"): if len(result) == 5: # Error case index, convo, tokens, dataset_name, error_msg = result error_count += 1 - + # Update the original DataFrame with error data df.at[index, 'tok_input'] = None df.at[index, 'tok_input_len'] = None @@ -491,13 +500,15 @@ def process_row(args): else: # Success case index, convo, tokens, dataset_name = result successful_count += 1 - + # Convert conversation to string format conversation_parts = [] for message in convo.messages: # Get role from message.author.role - role = message.author.role.value if hasattr(message.author.role, 'value') else str(message.author.role) - + role = message.author.role.value if hasattr( + message.author.role, 'value') else str( + message.author.role) + # Get content from message.content (which is a list) content_parts = [] for content_item in message.content: @@ -506,17 +517,19 @@ def process_row(args): else: content_parts.append(str(content_item)) content = ' '.join(content_parts) - + # Format as "Role: content" conversation_parts.append(f"{role}: {content}") - + conversation_string = '\n'.join(conversation_parts) - + # Update the original DataFrame with successful data - df.at[index, 'tok_input'] = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens) - df.at[index, 'tok_input_len'] = len(tokens) if hasattr(tokens, '__len__') else 0 + df.at[index, 'tok_input'] = tokens.tolist() if hasattr( + tokens, 'tolist') else list(tokens) + df.at[index, 'tok_input_len'] = len( + tokens) if hasattr(tokens, '__len__') else 0 df.at[index, 'text_input'] = conversation_string - + # Verify input and output have identical column lists input_columns = list(pd.read_pickle(args.data_file).columns) output_columns = list(df.columns) @@ -524,7 +537,7 @@ def process_row(args): print(f"WARNING: Column lists differ!") print(f"Input columns: {input_columns}") print(f"Output columns: {output_columns}") - + # Save the modified DataFrame as pickle print("Saving modified DataFrame to pickle...") df.to_pickle(args.output_file) @@ -536,4 +549,4 @@ def process_row(args): print(f" - Total processed: {successful_count + error_count} queries") print(f" - Modified DataFrame shape: {df.shape}") print(f" - Updated columns: tok_input, tok_input_len, text_input") - print(f" - Set to None: ref_accuracy, ref_extracted_answer, ref_output, tok_ref_output, tok_ref_output_len, templated_text_input") \ No newline at end of file + print(f" - Set to None: ref_accuracy, ref_extracted_answer, ref_output, tok_ref_output, tok_ref_output_len, templated_text_input") diff --git a/language/gpt-oss/run_infer.py 
b/language/gpt-oss/run_infer.py index 367e69003d..216b4a025d 100755 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -68,34 +68,36 @@ def send_request( def load_tokenized_data(data_file: str) -> pd.DataFrame: """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" logger.info(f"Loading tokenized data from {data_file}") - + # Load DataFrame from pickle df = pd.read_pickle(data_file) logger.info(f"Loaded DataFrame with shape: {df.shape}") - + # Check if tok_input column exists and has valid data if 'tok_input' in df.columns: - # Check for any None values in tok_input (indicating failed tokenization) + # Check for any None values in tok_input (indicating failed + # tokenization) failed_mask = df['tok_input'].isna() failed_count = failed_mask.sum() - + if failed_count > 0: failed_indices = df[failed_mask].index.unique() error_msg = f"Found {failed_count} failed tokenized samples at indices: {failed_indices.tolist()}" logger.error(error_msg) raise AssertionError(error_msg) - + # Check first sample first_tokens = df.iloc[0]['tok_input'] if isinstance(first_tokens, list): logger.info(f"First sample token length: {len(first_tokens)}") else: - logger.warning("tok_input column exists but first sample is not a list") - + logger.warning( + "tok_input column exists but first sample is not a list") + logger.info(f"All {len(df)} samples were successfully tokenized") else: logger.warning("No 'tok_input' column found in DataFrame") - + return df @@ -151,19 +153,21 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, return responses -def extract_response_texts(responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[str]: +def extract_response_texts( + responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[str]: """Extract response texts from SGLang responses.""" logger.info("Extracting response texts...") response_texts = [] - for i, (response, (_, row)) in enumerate(tqdm(zip(responses, tokenized_df.iterrows()), - total=len(responses), - desc="Extracting responses", - unit="response")): + for i, (response, (_, row)) in enumerate(tqdm(zip(responses, tokenized_df.iterrows()), + total=len(responses), + desc="Extracting responses", + unit="response")): response_text = "" if "error" not in response and "text" in response: try: - # SGLang returns the generated text directly in the 'text' field + # SGLang returns the generated text directly in the 'text' + # field response_text = response["text"] except Exception as e: logger.warning( @@ -181,35 +185,40 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], # Work with the original DataFrame result_df = tokenized_df.copy() - + # Overwrite existing columns with server response data result_df['ref_output'] = response_texts result_df['tok_ref_output'] = response_texts # Same as ref_output for now result_df['tok_ref_output_len'] = [len(text) for text in response_texts] - + # Calculate output token lengths for logging output_token_lengths = [] - for i, (response, response_text) in enumerate(zip(responses, response_texts)): + for i, (response, response_text) in enumerate( + zip(responses, response_texts)): if "error" not in response and "meta_info" in response: try: # Use the completion_tokens from meta_info - output_token_lengths.append(response["meta_info"]["completion_tokens"]) + output_token_lengths.append( + response["meta_info"]["completion_tokens"]) except Exception as e: - logger.warning(f"Failed to calculate output tokens for sample 
{i+1}: {e}") + logger.warning( + f"Failed to calculate output tokens for sample {i+1}: {e}") output_token_lengths.append(0) else: output_token_lengths.append(0) - + logger.info(f"Updated DataFrame with shape: {result_df.shape}") - logger.info(f"Updated columns: ref_output, tok_ref_output, tok_ref_output_len") - logger.info(f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") - + logger.info( + f"Updated columns: ref_output, tok_ref_output, tok_ref_output_len") + logger.info( + f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") + # Save to pickle file if output_file is provided if output_file: logger.info(f"Saving responses to {output_file}...") result_df.to_pickle(output_file) logger.info(f"Responses saved to {output_file}") - + return result_df @@ -239,7 +248,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, response_texts, tokenized_df, output_file) - + return result_df @@ -278,16 +287,18 @@ def main(): # Process requests and get result DataFrame result_df = process_requests(tokenized_df, args.server_url, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - max_concurrency=args.max_concurrency, - output_file=args.output) - + max_samples=args.max_samples, + max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, + output_file=args.output) + # Print summary logger.info(f"\nProcessing completed:") logger.info(f" - Total samples processed: {len(result_df)}") - logger.info(f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - logger.info(f" - Average output text length: {result_df['tok_ref_output_len'].mean():.1f}") + logger.info( + f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") + logger.info( + f" - Average output text length: {result_df['tok_ref_output_len'].mean():.1f}") if args.output: logger.info(f" - Results saved to: {args.output}") else: From 8c087787a46da473091b268d3756e326207f8cde Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:33:45 +0000 Subject: [PATCH 026/165] add README --- language/gpt-oss/README.md | 81 +++++++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 5 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 3b2386c3d6..db15ad4e28 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -4,15 +4,86 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is ## Model and Dataset download * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) -* Dataset: For now, we are re-using the `deepseek-r1` dataset. (TODO @shobhitv: Add instructions) +* Dataset: For now, we are re-using the `deepseek-r1` dataset. -## Harmony format ?? +## Preprocessing considerations +* For all other LLMs in MLPerf, tokenization has not been part of the benchmark and has been assumed a static preprocessing step +* With the introduction of OpenAI's [harmony format](https://github.com/openai/harmony/tree/main) - the format must be agreed upon for consistent benchmarking. +* Knobs: + - reasoning_effort: HIGH + +An input to the `gpt-oss` model is first formatted into a `conversation` - an ordered list of messages. 
+Each message has: +* `role`: The sender of the message +* `content` +* `channel`: (choices `final/analysis/commentary`, we use `final` only in prompts) + + +### Preamble: +Each converstation starts with a message from `System` and `Developer` respectively. +```json + "messages": [ + { + "role": "system", + "content": "model_identity=' ... + reasoning_effort= + channel_config=ChannelConfig( + valid_channels=['analysis', 'commentary', 'final', + channel_required=True + ) + tools=None + ...." + }, + { + "role": "developer", + "content": "system_prompt" + }, +``` + +### Multi-shot examples +Some queries may have multi-shot examples. For these, the `User` and `Assistant` roles are assigned. +```json + { + "role": "user", + "content": "example_question" + }, + { + "role": "assistant", + "content": "example_answer", + "channel": "final" + }, +``` + +### Lastly, user query +```json + { + "role": "user", + "content": "actual question" + } +``` ## Running the reference implementation: SGLang [`SGLang`](https://github.com/sgl-project/sglang) is the framework of choice to run the reference implementation. ### Fetch the docker image -SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1` +SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1`. Steps below are to be run in an environment from this image + +### Preprocess the dataset + + +### Run the server +```bash +python3 -m sglang.launch_server \ + --model-path openai/gpt-oss-120b \ + --host 0.0.0.0 \ + --port 30000 \ + --tp-size=1 \ + --data-parallel-size=$dp \ + --max-running-requests 256 \ + --mem-fraction-static 0.85 \ + --chunked-prefill-size 16384 \ + --ep-size=1 \ + --quantization mxfp4 \ + --stream-interval 50 +``` -### Enroot -TODO: Add steps \ No newline at end of file From 168f210517877575ea08b94c34c4ec748571612f Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:35:27 +0000 Subject: [PATCH 027/165] fix name --- language/gpt-oss/{harmonize-inputs.py => harmonize_inputs.py} | 0 language/gpt-oss/run_infer.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{harmonize-inputs.py => harmonize_inputs.py} (100%) mode change 100755 => 100644 language/gpt-oss/run_infer.py diff --git a/language/gpt-oss/harmonize-inputs.py b/language/gpt-oss/harmonize_inputs.py similarity index 100% rename from language/gpt-oss/harmonize-inputs.py rename to language/gpt-oss/harmonize_inputs.py diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py old mode 100755 new mode 100644 From 13775b11ab6fdcdaffa2e6a2a46d08c77cf8a47e Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 11:41:18 +0000 Subject: [PATCH 028/165] add commands --- language/gpt-oss/README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index db15ad4e28..b2d738c5a1 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -69,7 +69,12 @@ Some queries may have multi-shot examples. For these, the `User` and `Assistant` SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1`. 
Steps below are to be run in an environment from this image ### Preprocess the dataset - +```bash +python3 harmonize_inputs.py \ + --data-file mlperf_dsr1_fp8_ref_eval.pkl \ + --num-processes 32 \ + --output-file out/mlperf_gptoss_inputs.pkl +``` ### Run the server ```bash @@ -87,3 +92,11 @@ python3 -m sglang.launch_server \ --stream-interval 50 ``` +### Run the inference +```bash +python3 run_infer.py \ + --input-tokens out/mlperf_gptoss_inputs.pkl \ + --max-tokens 20480 \ + --max-concurrency 4096 \ + --output out/mlperf_gptoss_inferred.pkl +``` \ No newline at end of file From 37c646d5baf7b4981433cb7946bcca815ed07092 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 13:29:29 +0000 Subject: [PATCH 029/165] update README --- language/gpt-oss/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index b2d738c5a1..aec929261b 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -99,4 +99,14 @@ python3 run_infer.py \ --max-tokens 20480 \ --max-concurrency 4096 \ --output out/mlperf_gptoss_inferred.pkl +``` + +### Evaluate the responses +We use the `deepseek-r1` evaluation environment to evaluate the model responses and calculate accuracy + +```bash +cd $repo_root/language/deepseek-r1 +./launch_docker.sh --backend sglang +setup.sh +(.venv_sglang) $ python3 eval_accuracy.py --input-file mlperf_gptoss_inferred.pkl ``` \ No newline at end of file From 9cd0c616ffcc50b6e184b14f4f6d6b9efc7b2670 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 13:51:11 +0000 Subject: [PATCH 030/165] accepts output_ids and detokenize --- language/gpt-oss/run_infer.py | 89 +++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 25 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index 216b4a025d..d26deb3eff 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -23,6 +23,7 @@ from multiprocessing import Pool import pandas as pd from tqdm import tqdm +from transformers import AutoTokenizer # Set up logging logging.basicConfig( @@ -30,6 +31,19 @@ format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Initialize tokenizer +MODEL_NAME = "openai/gpt-oss-120b" +tokenizer = None + +def get_tokenizer(): + """Get or initialize the tokenizer.""" + global tokenizer + if tokenizer is None: + logger.info(f"Loading tokenizer for {MODEL_NAME}...") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + logger.info("Tokenizer loaded successfully") + return tokenizer + class SGLangClient: def __init__(self, base_url: str = "http://localhost:30000"): @@ -153,33 +167,54 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, return responses -def extract_response_texts( - responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[str]: - """Extract response texts from SGLang responses.""" - logger.info("Extracting response texts...") +def extract_response_ids( + responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[List[int]]: + """Extract response output_ids from SGLang responses.""" + logger.info("Extracting response output_ids...") - response_texts = [] + response_ids = [] for i, (response, (_, row)) in enumerate(tqdm(zip(responses, tokenized_df.iterrows()), total=len(responses), desc="Extracting responses", unit="response")): - response_text = "" - if "error" not in response and "text" in response: + response_id = [] + if "error" not in response 
and "output_ids" in response: try: - # SGLang returns the generated text directly in the 'text' + # SGLang returns the generated token IDs in the 'output_ids' # field - response_text = response["text"] + response_id = response["output_ids"] except Exception as e: logger.warning( f"Failed to extract response for sample {i+1}: {e}") - response_texts.append(response_text) + response_ids.append(response_id) - logger.info("Response text extraction complete") - return response_texts + logger.info("Response output_ids extraction complete") + return response_ids -def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], - tokenized_df: pd.DataFrame, output_file: str = None) -> pd.DataFrame: +def detokenize_output_ids(response_ids: List[List[int]]) -> List[str]: + """Detokenize output_ids back to text using AutoTokenizer.""" + logger.info("Detokenizing output_ids to text...") + + tokenizer = get_tokenizer() + detokenized_texts = [] + + for i, token_ids in enumerate(tqdm(response_ids, desc="Detokenizing outputs", unit="output")): + try: + # Detokenize the token IDs back to text + text = tokenizer.decode(token_ids, skip_special_tokens=True) + detokenized_texts.append(text) + except Exception as e: + logger.warning(f"Failed to detokenize output for sample {i+1}: {e}") + detokenized_texts.append("") + + logger.info("Output detokenization complete") + return detokenized_texts + + +def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int]], + detokenized_texts: List[str], tokenized_df: pd.DataFrame, + output_file: str = None) -> pd.DataFrame: """Save all responses to DataFrame and optionally to pickle file.""" logger.info("Processing responses and updating DataFrame...") @@ -187,14 +222,14 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], result_df = tokenized_df.copy() # Overwrite existing columns with server response data - result_df['ref_output'] = response_texts - result_df['tok_ref_output'] = response_texts # Same as ref_output for now - result_df['tok_ref_output_len'] = [len(text) for text in response_texts] + result_df['ref_output'] = detokenized_texts # Detokenized text output + result_df['tok_ref_output'] = response_ids # Original output_ids from SGLang + result_df['tok_ref_output_len'] = [len(token_ids) for token_ids in response_ids] # Length of output_ids # Calculate output token lengths for logging output_token_lengths = [] - for i, (response, response_text) in enumerate( - zip(responses, response_texts)): + for i, (response, response_ids) in enumerate( + zip(responses, response_ids)): if "error" not in response and "meta_info" in response: try: # Use the completion_tokens from meta_info @@ -203,9 +238,9 @@ def save_responses(responses: List[Dict[str, Any]], response_texts: List[str], except Exception as e: logger.warning( f"Failed to calculate output tokens for sample {i+1}: {e}") - output_token_lengths.append(0) + output_token_lengths.append(len(response_ids)) else: - output_token_lengths.append(0) + output_token_lengths.append(len(response_ids)) logger.info(f"Updated DataFrame with shape: {result_df.shape}") logger.info( @@ -239,13 +274,17 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_tokens, max_concurrency) - # Step 3: Extract response texts - response_texts = extract_response_texts(responses, tokenized_df) + # Step 3: Extract response output_ids + response_ids = extract_response_ids(responses, tokenized_df) + + # Step 4: Detokenize output_ids to text for ref_output + detokenized_texts 
= detokenize_output_ids(response_ids) - # Step 4: Save all results and return DataFrame + # Step 5: Save all results and return DataFrame result_df = save_responses( responses, - response_texts, + response_ids, + detokenized_texts, tokenized_df, output_file) From 6abf8fa9ea571da296688cfdff0b7f59f1dd9172 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 1 Oct 2025 13:51:45 +0000 Subject: [PATCH 031/165] [Automated Commit] Format Codebase --- language/gpt-oss/run_infer.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index d26deb3eff..3d2bc49a59 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -35,6 +35,7 @@ MODEL_NAME = "openai/gpt-oss-120b" tokenizer = None + def get_tokenizer(): """Get or initialize the tokenizer.""" global tokenizer @@ -195,25 +196,27 @@ def extract_response_ids( def detokenize_output_ids(response_ids: List[List[int]]) -> List[str]: """Detokenize output_ids back to text using AutoTokenizer.""" logger.info("Detokenizing output_ids to text...") - + tokenizer = get_tokenizer() detokenized_texts = [] - - for i, token_ids in enumerate(tqdm(response_ids, desc="Detokenizing outputs", unit="output")): + + for i, token_ids in enumerate( + tqdm(response_ids, desc="Detokenizing outputs", unit="output")): try: # Detokenize the token IDs back to text text = tokenizer.decode(token_ids, skip_special_tokens=True) detokenized_texts.append(text) except Exception as e: - logger.warning(f"Failed to detokenize output for sample {i+1}: {e}") + logger.warning( + f"Failed to detokenize output for sample {i+1}: {e}") detokenized_texts.append("") - + logger.info("Output detokenization complete") return detokenized_texts def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int]], - detokenized_texts: List[str], tokenized_df: pd.DataFrame, + detokenized_texts: List[str], tokenized_df: pd.DataFrame, output_file: str = None) -> pd.DataFrame: """Save all responses to DataFrame and optionally to pickle file.""" logger.info("Processing responses and updating DataFrame...") @@ -223,8 +226,10 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] # Overwrite existing columns with server response data result_df['ref_output'] = detokenized_texts # Detokenized text output - result_df['tok_ref_output'] = response_ids # Original output_ids from SGLang - result_df['tok_ref_output_len'] = [len(token_ids) for token_ids in response_ids] # Length of output_ids + # Original output_ids from SGLang + result_df['tok_ref_output'] = response_ids + result_df['tok_ref_output_len'] = [ + len(token_ids) for token_ids in response_ids] # Length of output_ids # Calculate output token lengths for logging output_token_lengths = [] @@ -276,7 +281,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, # Step 3: Extract response output_ids response_ids = extract_response_ids(responses, tokenized_df) - + # Step 4: Detokenize output_ids to text for ref_output detokenized_texts = detokenize_output_ids(response_ids) From 8fe8712cb763b80a0a49abbbb7fc46ac312555c5 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 14:30:15 +0000 Subject: [PATCH 032/165] add plotter --- language/gpt-oss/plot.py | 183 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 language/gpt-oss/plot.py diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py new file mode 100644 index 
0000000000..0c3c8a9b57 --- /dev/null +++ b/language/gpt-oss/plot.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Histogram analysis of token reference output length (OSL) across datasets. +Creates individual histograms per dataset and a master histogram. +""" + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +from pathlib import Path +import argparse + +def load_data(pkl_path): + """Load the pickle file and return the DataFrame.""" + print(f"Loading data from {pkl_path}...") + df = pd.read_pickle(pkl_path) + print(f"Loaded {len(df)} rows with columns: {list(df.columns)}") + return df + +def create_histograms(df, output_dir="histograms"): + """Create histograms of tok_ref_output_len for each dataset and a master histogram.""" + + # Create output directory + Path(output_dir).mkdir(exist_ok=True) + + # Create histograms for all data + _create_histogram_plots(df, output_dir, "individual_datasets_histograms.png", "Token Reference Output Length (OSL) - All Data") + + # Create histograms for data with prompt_accuracy = 100 + df_100_accuracy = df[df['prompt_accuracy'] == 100] + if len(df_100_accuracy) > 0: + print(f"\nFiltering for prompt_accuracy = 100: {len(df_100_accuracy)} rows out of {len(df)} total") + _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_histograms_100_accuracy.png", "Token Reference Output Length (OSL) - 100% Accuracy Only") + else: + print("\nNo rows found with prompt_accuracy = 100") + +def _create_histogram_plots(df, output_dir, filename, title_prefix): + """Helper function to create histogram plots.""" + + # Get unique datasets + datasets = df['dataset'].unique() + print(f"Found {len(datasets)} unique datasets: {datasets}") + + # Set up the plotting style + plt.style.use('default') + sns.set_palette("husl") + + # Create individual histograms for each dataset + fig, axes = plt.subplots(2, 3, figsize=(18, 12)) + axes = axes.flatten() + + for i, dataset in enumerate(datasets): + if i >= 6: # Limit to 6 subplots for readability + break + + ax = axes[i] + dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + + # Create histogram + ax.hist(dataset_data, bins=30, alpha=0.7, edgecolor='black', linewidth=0.5) + ax.set_title(f'Dataset: {dataset}\n(n={len(dataset_data)})', fontsize=12, fontweight='bold') + ax.set_xlabel('Token Reference Output Length (OSL)', fontsize=10) + ax.set_ylabel('Frequency', fontsize=10) + ax.grid(True, alpha=0.3) + + # Add statistics text + mean_val = dataset_data.mean() + median_val = dataset_data.median() + std_val = dataset_data.std() + ax.text(0.02, 0.98, f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}', + transform=ax.transAxes, verticalalignment='top', + bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) + + # Hide unused subplots + for i in range(len(datasets), 6): + axes[i].set_visible(False) + + plt.tight_layout() + plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') + plt.show() + +def create_master_histograms(df, output_dir="histograms"): + """Create master histogram and overlaid histograms.""" + + # Create master histogram with all datasets + plt.figure(figsize=(12, 8)) + + # Create histogram for all data + plt.hist(df['tok_ref_output_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='skyblue') + plt.title('Master Histogram: Token Reference Output Length (OSL) - All Datasets', + fontsize=14, fontweight='bold') + plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) + 
plt.ylabel('Frequency', fontsize=12) + plt.grid(True, alpha=0.3) + + # Add overall statistics + mean_val = df['tok_ref_output_len'].mean() + median_val = df['tok_ref_output_len'].median() + std_val = df['tok_ref_output_len'].std() + min_val = df['tok_ref_output_len'].min() + max_val = df['tok_ref_output_len'].max() + + stats_text = f'Total samples: {len(df)}\n' + stats_text += f'Mean: {mean_val:.1f}\n' + stats_text += f'Median: {median_val:.1f}\n' + stats_text += f'Std: {std_val:.1f}\n' + stats_text += f'Min: {min_val}\n' + stats_text += f'Max: {max_val}' + + plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, + verticalalignment='top', fontsize=10, + bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8)) + + plt.tight_layout() + plt.savefig(f'{output_dir}/master_histogram.png', dpi=300, bbox_inches='tight') + plt.show() + + # Create a detailed comparison plot with all datasets overlaid + datasets = df['dataset'].unique() + plt.figure(figsize=(14, 8)) + + colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) + + for i, dataset in enumerate(datasets): + dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', + color=colors[i], edgecolor='black', linewidth=0.3) + + plt.title('Overlaid Histograms: Token Reference Output Length (OSL) by Dataset', + fontsize=14, fontweight='bold') + plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) + plt.ylabel('Frequency', fontsize=12) + plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.grid(True, alpha=0.3) + plt.tight_layout() + plt.savefig(f'{output_dir}/overlaid_histograms.png', dpi=300, bbox_inches='tight') + plt.show() + + # Print summary statistics + print("\n" + "="*60) + print("SUMMARY STATISTICS") + print("="*60) + print(f"Overall statistics for tok_ref_output_len:") + print(f" Mean: {mean_val:.2f}") + print(f" Median: {median_val:.2f}") + print(f" Std: {std_val:.2f}") + print(f" Min: {min_val}") + print(f" Max: {max_val}") + print(f" Total samples: {len(df)}") + + print(f"\nPer-dataset statistics:") + for dataset in datasets: + dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + print(f" {dataset}:") + print(f" Count: {len(dataset_data)}") + print(f" Mean: {dataset_data.mean():.2f}") + print(f" Median: {dataset_data.median():.2f}") + print(f" Std: {dataset_data.std():.2f}") + print(f" Min: {dataset_data.min()}") + print(f" Max: {dataset_data.max()}") + print() + +def main(): + parser = argparse.ArgumentParser(description='Create histograms of token reference output length') + parser.add_argument('pkl_path', help='Path to the pickle file') + parser.add_argument('--output-dir', default='histograms', help='Output directory for plots') + + args = parser.parse_args() + + # Load data + df = load_data(args.pkl_path) + + # Create histograms + create_histograms(df, args.output_dir) + + # Create master histograms + create_master_histograms(df, args.output_dir) + + print(f"Histograms saved to {args.output_dir}/") + +if __name__ == "__main__": + main() From 889db8c22af576c34fde73eed02415621b284ec0 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 14:30:50 +0000 Subject: [PATCH 033/165] fix sampling --- language/gpt-oss/run_infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index 3d2bc49a59..ce2b834416 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -59,7 +59,8 @@ def 
send_request( "input_ids": input_ids, "sampling_params": { "max_new_tokens": max_tokens, - "temperature": 0.0 + "temperature": 0.9, + "top_k": 1, } } From 381cf602da55dd809d37752b3f9c7305d18e54db Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 14:31:19 +0000 Subject: [PATCH 034/165] add reasoning effort option --- language/gpt-oss/harmonize_inputs.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 93e064662b..929875266c 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -24,7 +24,7 @@ ) -def create_math500_prompt(user_query): +def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a multi-shot prompt for mathematical problem solving using Harmony format. @@ -38,7 +38,7 @@ def create_math500_prompt(user_query): # Create the system message with required channels system_message = ( SystemContent.new() - .with_reasoning_effort(ReasoningEffort.HIGH) + .with_reasoning_effort(reasoning_effort) .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) @@ -125,7 +125,7 @@ def create_math500_prompt(user_query): return convo, tokens -def create_aime1983_prompt(user_query): +def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a zero-shot prompt for mathematical problem solving using Harmony format. @@ -139,7 +139,7 @@ def create_aime1983_prompt(user_query): # Create the system message with required channels system_message = ( SystemContent.new() - .with_reasoning_effort(ReasoningEffort.HIGH) + .with_reasoning_effort(reasoning_effort) .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) @@ -169,7 +169,7 @@ def create_aime1983_prompt(user_query): return convo, tokens -def create_livecodebench_prompt(user_query): +def create_livecodebench_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a zero-shot prompt for livecodebench problem solving using Harmony format. @@ -183,7 +183,7 @@ def create_livecodebench_prompt(user_query): # Create the system message with required channels system_message = ( SystemContent.new() - .with_reasoning_effort(ReasoningEffort.HIGH) + .with_reasoning_effort(reasoning_effort) .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) @@ -213,7 +213,7 @@ def create_livecodebench_prompt(user_query): return convo, tokens -def create_mmlu_prompt(user_query): +def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a multi-shot prompt for multiple choice question answering using Harmony format. @@ -227,7 +227,7 @@ def create_mmlu_prompt(user_query): # Create the system message with required channels system_message = ( SystemContent.new() - .with_reasoning_effort(ReasoningEffort.HIGH) + .with_reasoning_effort(reasoning_effort) .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) @@ -314,7 +314,7 @@ def create_mmlu_prompt(user_query): return convo, tokens -def create_gpqa_prompt(user_query): +def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a multi-shot prompt for organic chemistry and biochemistry question answering using Harmony format. 
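Note on the two changes above: the "fix sampling" commit raises temperature to 0.9 but also sets top_k to 1, and top-k filtering with k=1 keeps only the single most probable token, so decoding remains effectively greedy despite the non-zero temperature. The reasoning_effort parameter added to the prompt builders feeds straight into the Harmony system message via .with_reasoning_effort(...). A minimal usage sketch of the new parameter follows; the import paths and the example query are illustrative assumptions rather than part of the patch (the builders live in language/gpt-oss/harmonize_inputs.py and return the conversation together with its token ids, as shown above).

from openai_harmony import ReasoningEffort          # assumed package for the Harmony types used above
from harmonize_inputs import create_math500_prompt  # assumed import path for the builder in this patch

# Mirror the string-to-enum mapping this patch adds behind --reasoning-effort.
effort = {"high": ReasoningEffort.HIGH,
          "medium": ReasoningEffort.MEDIUM,
          "low": ReasoningEffort.LOW}["low"]

# Build a Harmony conversation and its token ids; run_infer.py later sends these
# token ids as input_ids to the SGLang server.
convo, tokens = create_math500_prompt("Compute 2 + 2.", reasoning_effort=effort)
print(f"prompt length: {len(tokens)} tokens")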
@@ -328,7 +328,7 @@ def create_gpqa_prompt(user_query): # Create the system message with required channels system_message = ( SystemContent.new() - .with_reasoning_effort(ReasoningEffort.HIGH) + .with_reasoning_effort(reasoning_effort) .with_conversation_start_date("2025-09-30") .with_required_channels(["analysis", "commentary", "final"]) ) @@ -425,9 +425,17 @@ def process_row(args): help="Maximum number of rows to process (default: process all rows)") parser.add_argument("--output-file", type=str, required=True, help="Output pickle file path to save the processed data as pandas DataFrame") + parser.add_argument("--reasoning-effort", type=str, default="high", + help="Reasoning effort to use for the prompt") args = parser.parse_args() df = pd.read_pickle(args.data_file) + reasoning_effort = { + "high": ReasoningEffort.HIGH, + "medium": ReasoningEffort.MEDIUM, + "low": ReasoningEffort.LOW + }[args.reasoning_effort.lower()] + # Apply row limit if specified if args.max_rows is not None: df = df.head(args.max_rows) From 209e92360bd9858ea3ab9fe13246461c440ebe69 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 1 Oct 2025 14:31:49 +0000 Subject: [PATCH 035/165] [Automated Commit] Format Codebase --- language/gpt-oss/harmonize_inputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 929875266c..1e5102cdde 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -169,7 +169,8 @@ def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): return convo, tokens -def create_livecodebench_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): +def create_livecodebench_prompt( + user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a zero-shot prompt for livecodebench problem solving using Harmony format. From fdc518d107f8a7b8b3f4cc9f7a459763acb636e9 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 14:32:17 +0000 Subject: [PATCH 036/165] doc option --- language/gpt-oss/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index aec929261b..3f1fd85e19 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -73,7 +73,8 @@ SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1`. 
Steps below are t python3 harmonize_inputs.py \ --data-file mlperf_dsr1_fp8_ref_eval.pkl \ --num-processes 32 \ - --output-file out/mlperf_gptoss_inputs.pkl + --output-file out/mlperf_gptoss_inputs.pkl \ + --reasoning-effort low|medium|high ``` ### Run the server From f73da58238c81256775761b2b0ce01d4e16a70d9 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 1 Oct 2025 19:00:06 +0000 Subject: [PATCH 037/165] draw input histograms --- language/gpt-oss/plot.py | 153 +++++++++++++++++++++++++++++----- language/gpt-oss/run_infer.py | 12 +-- 2 files changed, 140 insertions(+), 25 deletions(-) diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index 0c3c8a9b57..559eadd35b 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -19,23 +19,37 @@ def load_data(pkl_path): return df def create_histograms(df, output_dir="histograms"): - """Create histograms of tok_ref_output_len for each dataset and a master histogram.""" + """Create histograms of tok_model_output_len for each dataset and a master histogram.""" # Create output directory Path(output_dir).mkdir(exist_ok=True) # Create histograms for all data - _create_histogram_plots(df, output_dir, "individual_datasets_histograms.png", "Token Reference Output Length (OSL) - All Data") + _create_histogram_plots(df, output_dir, "individual_datasets_histograms.png", "Token Reference Output Length (OSL) - All Data", "tok_model_output_len") # Create histograms for data with prompt_accuracy = 100 df_100_accuracy = df[df['prompt_accuracy'] == 100] if len(df_100_accuracy) > 0: print(f"\nFiltering for prompt_accuracy = 100: {len(df_100_accuracy)} rows out of {len(df)} total") - _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_histograms_100_accuracy.png", "Token Reference Output Length (OSL) - 100% Accuracy Only") + _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_histograms_100_accuracy.png", "Token Reference Output Length (OSL) - 100% Accuracy Only", "tok_model_output_len") else: print("\nNo rows found with prompt_accuracy = 100") -def _create_histogram_plots(df, output_dir, filename, title_prefix): +def create_input_histograms(df, output_dir="histograms"): + """Create histograms of tok_input_len for each dataset and a master histogram.""" + + # Create histograms for all data + _create_histogram_plots(df, output_dir, "individual_datasets_input_histograms.png", "Token Input Length (ISL) - All Data", "tok_input_len") + + # Create histograms for data with prompt_accuracy = 100 + df_100_accuracy = df[df['prompt_accuracy'] == 100] + if len(df_100_accuracy) > 0: + print(f"\nFiltering for prompt_accuracy = 100 (input lengths): {len(df_100_accuracy)} rows out of {len(df)} total") + _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_input_histograms_100_accuracy.png", "Token Input Length (ISL) - 100% Accuracy Only", "tok_input_len") + else: + print("\nNo rows found with prompt_accuracy = 100 for input lengths") + +def _create_histogram_plots(df, output_dir, filename, title_prefix, column_name): """Helper function to create histogram plots.""" # Get unique datasets @@ -55,12 +69,20 @@ def _create_histogram_plots(df, output_dir, filename, title_prefix): break ax = axes[i] - dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + dataset_data = df[df['dataset'] == dataset][column_name] # Create histogram ax.hist(dataset_data, bins=30, alpha=0.7, edgecolor='black', linewidth=0.5) ax.set_title(f'Dataset: {dataset}\n(n={len(dataset_data)})', fontsize=12, 
fontweight='bold') - ax.set_xlabel('Token Reference Output Length (OSL)', fontsize=10) + + # Set appropriate labels based on column type + if column_name == 'tok_model_output_len': + ax.set_xlabel('Token Reference Output Length (OSL)', fontsize=10) + elif column_name == 'tok_input_len': + ax.set_xlabel('Token Input Length (ISL)', fontsize=10) + else: + ax.set_xlabel(column_name, fontsize=10) + ax.set_ylabel('Frequency', fontsize=10) ax.grid(True, alpha=0.3) @@ -87,7 +109,7 @@ def create_master_histograms(df, output_dir="histograms"): plt.figure(figsize=(12, 8)) # Create histogram for all data - plt.hist(df['tok_ref_output_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='skyblue') + plt.hist(df['tok_model_output_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='skyblue') plt.title('Master Histogram: Token Reference Output Length (OSL) - All Datasets', fontsize=14, fontweight='bold') plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) @@ -95,11 +117,11 @@ def create_master_histograms(df, output_dir="histograms"): plt.grid(True, alpha=0.3) # Add overall statistics - mean_val = df['tok_ref_output_len'].mean() - median_val = df['tok_ref_output_len'].median() - std_val = df['tok_ref_output_len'].std() - min_val = df['tok_ref_output_len'].min() - max_val = df['tok_ref_output_len'].max() + mean_val = df['tok_model_output_len'].mean() + median_val = df['tok_model_output_len'].median() + std_val = df['tok_model_output_len'].std() + min_val = df['tok_model_output_len'].min() + max_val = df['tok_model_output_len'].max() stats_text = f'Total samples: {len(df)}\n' stats_text += f'Mean: {mean_val:.1f}\n' @@ -123,7 +145,7 @@ def create_master_histograms(df, output_dir="histograms"): colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) for i, dataset in enumerate(datasets): - dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', color=colors[i], edgecolor='black', linewidth=0.3) @@ -141,7 +163,7 @@ def create_master_histograms(df, output_dir="histograms"): print("\n" + "="*60) print("SUMMARY STATISTICS") print("="*60) - print(f"Overall statistics for tok_ref_output_len:") + print(f"Overall statistics for tok_model_output_len:") print(f" Mean: {mean_val:.2f}") print(f" Median: {median_val:.2f}") print(f" Std: {std_val:.2f}") @@ -151,7 +173,88 @@ def create_master_histograms(df, output_dir="histograms"): print(f"\nPer-dataset statistics:") for dataset in datasets: - dataset_data = df[df['dataset'] == dataset]['tok_ref_output_len'] + dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] + print(f" {dataset}:") + print(f" Count: {len(dataset_data)}") + print(f" Mean: {dataset_data.mean():.2f}") + print(f" Median: {dataset_data.median():.2f}") + print(f" Std: {dataset_data.std():.2f}") + print(f" Min: {dataset_data.min()}") + print(f" Max: {dataset_data.max()}") + print() + +def create_input_master_histograms(df, output_dir="histograms"): + """Create master histogram and overlaid histograms for input token lengths.""" + + # Create master histogram with all datasets + plt.figure(figsize=(12, 8)) + + # Create histogram for all data + plt.hist(df['tok_input_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='lightcoral') + plt.title('Master Histogram: Token Input Length (ISL) - All Datasets', + fontsize=14, fontweight='bold') + plt.xlabel('Token Input Length (ISL)', 
fontsize=12) + plt.ylabel('Frequency', fontsize=12) + plt.grid(True, alpha=0.3) + + # Add overall statistics + mean_val = df['tok_input_len'].mean() + median_val = df['tok_input_len'].median() + std_val = df['tok_input_len'].std() + min_val = df['tok_input_len'].min() + max_val = df['tok_input_len'].max() + + stats_text = f'Total samples: {len(df)}\n' + stats_text += f'Mean: {mean_val:.1f}\n' + stats_text += f'Median: {median_val:.1f}\n' + stats_text += f'Std: {std_val:.1f}\n' + stats_text += f'Min: {min_val}\n' + stats_text += f'Max: {max_val}' + + plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, + verticalalignment='top', fontsize=10, + bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8)) + + plt.tight_layout() + plt.savefig(f'{output_dir}/master_input_histogram.png', dpi=300, bbox_inches='tight') + plt.show() + + # Create a detailed comparison plot with all datasets overlaid + datasets = df['dataset'].unique() + plt.figure(figsize=(14, 8)) + + colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) + + for i, dataset in enumerate(datasets): + dataset_data = df[df['dataset'] == dataset]['tok_input_len'] + plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', + color=colors[i], edgecolor='black', linewidth=0.3) + + plt.title('Overlaid Histograms: Token Input Length (ISL) by Dataset', + fontsize=14, fontweight='bold') + plt.xlabel('Token Input Length (ISL)', fontsize=12) + plt.ylabel('Frequency', fontsize=12) + plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.grid(True, alpha=0.3) + plt.tight_layout() + plt.savefig(f'{output_dir}/overlaid_input_histograms.png', dpi=300, bbox_inches='tight') + plt.show() + + # Print summary statistics + print("\n" + "="*60) + print("INPUT LENGTH SUMMARY STATISTICS") + print("="*60) + print(f"Overall statistics for tok_input_len:") + print(f" Mean: {mean_val:.2f}") + print(f" Median: {median_val:.2f}") + print(f" Std: {std_val:.2f}") + print(f" Min: {min_val}") + print(f" Max: {max_val}") + print(f" Total samples: {len(df)}") + + print(f"\nPer-dataset statistics for input lengths:") + for dataset in datasets: + dataset_data = df[df['dataset'] == dataset]['tok_input_len'] print(f" {dataset}:") print(f" Count: {len(dataset_data)}") print(f" Mean: {dataset_data.mean():.2f}") @@ -162,7 +265,7 @@ def create_master_histograms(df, output_dir="histograms"): print() def main(): - parser = argparse.ArgumentParser(description='Create histograms of token reference output length') + parser = argparse.ArgumentParser(description='Create histograms of token lengths (output and input)') parser.add_argument('pkl_path', help='Path to the pickle file') parser.add_argument('--output-dir', default='histograms', help='Output directory for plots') @@ -171,13 +274,25 @@ def main(): # Load data df = load_data(args.pkl_path) - # Create histograms + # Create output length histograms + print("\n" + "="*60) + print("CREATING OUTPUT LENGTH HISTOGRAMS") + print("="*60) create_histograms(df, args.output_dir) - # Create master histograms + # Create master output histograms create_master_histograms(df, args.output_dir) - print(f"Histograms saved to {args.output_dir}/") + # Create input length histograms + print("\n" + "="*60) + print("CREATING INPUT LENGTH HISTOGRAMS") + print("="*60) + create_input_histograms(df, args.output_dir) + + # Create master input histograms + create_input_master_histograms(df, args.output_dir) + + print(f"\nAll histograms saved to {args.output_dir}/") if __name__ == "__main__": main() diff --git 
a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index ce2b834416..e78aa33607 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -226,10 +226,10 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] result_df = tokenized_df.copy() # Overwrite existing columns with server response data - result_df['ref_output'] = detokenized_texts # Detokenized text output + result_df['model_output'] = detokenized_texts # Detokenized text output # Original output_ids from SGLang - result_df['tok_ref_output'] = response_ids - result_df['tok_ref_output_len'] = [ + result_df['tok_model_output'] = response_ids + result_df['tok_model_output_len'] = [ len(token_ids) for token_ids in response_ids] # Length of output_ids # Calculate output token lengths for logging @@ -250,7 +250,7 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] logger.info(f"Updated DataFrame with shape: {result_df.shape}") logger.info( - f"Updated columns: ref_output, tok_ref_output, tok_ref_output_len") + f"Updated columns: model_output, tok_model_output, tok_model_output_len") logger.info( f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") @@ -283,7 +283,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, # Step 3: Extract response output_ids response_ids = extract_response_ids(responses, tokenized_df) - # Step 4: Detokenize output_ids to text for ref_output + # Step 4: Detokenize output_ids to text for model_output detokenized_texts = detokenize_output_ids(response_ids) # Step 5: Save all results and return DataFrame @@ -343,7 +343,7 @@ def main(): logger.info( f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") logger.info( - f" - Average output text length: {result_df['tok_ref_output_len'].mean():.1f}") + f" - Average output text length: {result_df['tok_model_output_len'].mean():.1f}") if args.output: logger.info(f" - Results saved to: {args.output}") else: From 4219a77693afb1e93666546def2e2d03da4700a4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 1 Oct 2025 19:00:38 +0000 Subject: [PATCH 038/165] [Automated Commit] Format Codebase --- language/gpt-oss/plot.py | 235 +++++++++++++++++++++++++-------------- 1 file changed, 151 insertions(+), 84 deletions(-) diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index 559eadd35b..6599e1a800 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -11,6 +11,7 @@ from pathlib import Path import argparse + def load_data(pkl_path): """Load the pickle file and return the DataFrame.""" print(f"Loading data from {pkl_path}...") @@ -18,63 +19,97 @@ def load_data(pkl_path): print(f"Loaded {len(df)} rows with columns: {list(df.columns)}") return df + def create_histograms(df, output_dir="histograms"): """Create histograms of tok_model_output_len for each dataset and a master histogram.""" - + # Create output directory Path(output_dir).mkdir(exist_ok=True) - + # Create histograms for all data - _create_histogram_plots(df, output_dir, "individual_datasets_histograms.png", "Token Reference Output Length (OSL) - All Data", "tok_model_output_len") - + _create_histogram_plots( + df, + output_dir, + "individual_datasets_histograms.png", + "Token Reference Output Length (OSL) - All Data", + "tok_model_output_len") + # Create histograms for data with prompt_accuracy = 100 df_100_accuracy = df[df['prompt_accuracy'] == 100] if len(df_100_accuracy) > 0: - print(f"\nFiltering for 
prompt_accuracy = 100: {len(df_100_accuracy)} rows out of {len(df)} total") - _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_histograms_100_accuracy.png", "Token Reference Output Length (OSL) - 100% Accuracy Only", "tok_model_output_len") + print( + f"\nFiltering for prompt_accuracy = 100: {len(df_100_accuracy)} rows out of {len(df)} total") + _create_histogram_plots( + df_100_accuracy, + output_dir, + "individual_datasets_histograms_100_accuracy.png", + "Token Reference Output Length (OSL) - 100% Accuracy Only", + "tok_model_output_len") else: print("\nNo rows found with prompt_accuracy = 100") + def create_input_histograms(df, output_dir="histograms"): """Create histograms of tok_input_len for each dataset and a master histogram.""" - + # Create histograms for all data - _create_histogram_plots(df, output_dir, "individual_datasets_input_histograms.png", "Token Input Length (ISL) - All Data", "tok_input_len") - + _create_histogram_plots( + df, + output_dir, + "individual_datasets_input_histograms.png", + "Token Input Length (ISL) - All Data", + "tok_input_len") + # Create histograms for data with prompt_accuracy = 100 df_100_accuracy = df[df['prompt_accuracy'] == 100] if len(df_100_accuracy) > 0: - print(f"\nFiltering for prompt_accuracy = 100 (input lengths): {len(df_100_accuracy)} rows out of {len(df)} total") - _create_histogram_plots(df_100_accuracy, output_dir, "individual_datasets_input_histograms_100_accuracy.png", "Token Input Length (ISL) - 100% Accuracy Only", "tok_input_len") + print( + f"\nFiltering for prompt_accuracy = 100 (input lengths): {len(df_100_accuracy)} rows out of {len(df)} total") + _create_histogram_plots( + df_100_accuracy, + output_dir, + "individual_datasets_input_histograms_100_accuracy.png", + "Token Input Length (ISL) - 100% Accuracy Only", + "tok_input_len") else: print("\nNo rows found with prompt_accuracy = 100 for input lengths") -def _create_histogram_plots(df, output_dir, filename, title_prefix, column_name): + +def _create_histogram_plots( + df, output_dir, filename, title_prefix, column_name): """Helper function to create histogram plots.""" - + # Get unique datasets datasets = df['dataset'].unique() print(f"Found {len(datasets)} unique datasets: {datasets}") - + # Set up the plotting style plt.style.use('default') sns.set_palette("husl") - + # Create individual histograms for each dataset fig, axes = plt.subplots(2, 3, figsize=(18, 12)) axes = axes.flatten() - + for i, dataset in enumerate(datasets): if i >= 6: # Limit to 6 subplots for readability break - + ax = axes[i] dataset_data = df[df['dataset'] == dataset][column_name] - + # Create histogram - ax.hist(dataset_data, bins=30, alpha=0.7, edgecolor='black', linewidth=0.5) - ax.set_title(f'Dataset: {dataset}\n(n={len(dataset_data)})', fontsize=12, fontweight='bold') - + ax.hist( + dataset_data, + bins=30, + alpha=0.7, + edgecolor='black', + linewidth=0.5) + ax.set_title( + f'Dataset: {dataset}\n(n={len(dataset_data)})', + fontsize=12, + fontweight='bold') + # Set appropriate labels based on column type if column_name == 'tok_model_output_len': ax.set_xlabel('Token Reference Output Length (OSL)', fontsize=10) @@ -82,87 +117,100 @@ def _create_histogram_plots(df, output_dir, filename, title_prefix, column_name) ax.set_xlabel('Token Input Length (ISL)', fontsize=10) else: ax.set_xlabel(column_name, fontsize=10) - + ax.set_ylabel('Frequency', fontsize=10) ax.grid(True, alpha=0.3) - + # Add statistics text mean_val = dataset_data.mean() median_val = dataset_data.median() 
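        # The mean/median/std computed in this block only feed the stats box drawn by
        # the ax.text call below; pandas Series.std() uses ddof=1, so a dataset with a
        # single sample renders as 'Std: nan' in that box.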
std_val = dataset_data.std() - ax.text(0.02, 0.98, f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}', - transform=ax.transAxes, verticalalignment='top', + ax.text(0.02, 0.98, f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}', + transform=ax.transAxes, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) - + # Hide unused subplots for i in range(len(datasets), 6): axes[i].set_visible(False) - + plt.tight_layout() plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') plt.show() + def create_master_histograms(df, output_dir="histograms"): """Create master histogram and overlaid histograms.""" - + # Create master histogram with all datasets plt.figure(figsize=(12, 8)) - + # Create histogram for all data - plt.hist(df['tok_model_output_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='skyblue') - plt.title('Master Histogram: Token Reference Output Length (OSL) - All Datasets', + plt.hist( + df['tok_model_output_len'], + bins=50, + alpha=0.7, + edgecolor='black', + linewidth=0.5, + color='skyblue') + plt.title('Master Histogram: Token Reference Output Length (OSL) - All Datasets', fontsize=14, fontweight='bold') plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.grid(True, alpha=0.3) - + # Add overall statistics mean_val = df['tok_model_output_len'].mean() median_val = df['tok_model_output_len'].median() std_val = df['tok_model_output_len'].std() min_val = df['tok_model_output_len'].min() max_val = df['tok_model_output_len'].max() - + stats_text = f'Total samples: {len(df)}\n' stats_text += f'Mean: {mean_val:.1f}\n' stats_text += f'Median: {median_val:.1f}\n' stats_text += f'Std: {std_val:.1f}\n' stats_text += f'Min: {min_val}\n' stats_text += f'Max: {max_val}' - - plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, + + plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, verticalalignment='top', fontsize=10, bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8)) - + plt.tight_layout() - plt.savefig(f'{output_dir}/master_histogram.png', dpi=300, bbox_inches='tight') + plt.savefig( + f'{output_dir}/master_histogram.png', + dpi=300, + bbox_inches='tight') plt.show() - + # Create a detailed comparison plot with all datasets overlaid datasets = df['dataset'].unique() plt.figure(figsize=(14, 8)) - + colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) - + for i, dataset in enumerate(datasets): dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] - plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', - color=colors[i], edgecolor='black', linewidth=0.3) - - plt.title('Overlaid Histograms: Token Reference Output Length (OSL) by Dataset', + plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', + color=colors[i], edgecolor='black', linewidth=0.3) + + plt.title('Overlaid Histograms: Token Reference Output Length (OSL) by Dataset', fontsize=14, fontweight='bold') plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.grid(True, alpha=0.3) plt.tight_layout() - plt.savefig(f'{output_dir}/overlaid_histograms.png', dpi=300, bbox_inches='tight') + plt.savefig( + f'{output_dir}/overlaid_histograms.png', + dpi=300, + bbox_inches='tight') plt.show() - + # Print summary statistics - print("\n" + "="*60) + print("\n" + "=" * 60) print("SUMMARY 
STATISTICS") - print("="*60) + print("=" * 60) print(f"Overall statistics for tok_model_output_len:") print(f" Mean: {mean_val:.2f}") print(f" Median: {median_val:.2f}") @@ -170,7 +218,7 @@ def create_master_histograms(df, output_dir="histograms"): print(f" Min: {min_val}") print(f" Max: {max_val}") print(f" Total samples: {len(df)}") - + print(f"\nPer-dataset statistics:") for dataset in datasets: dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] @@ -183,67 +231,80 @@ def create_master_histograms(df, output_dir="histograms"): print(f" Max: {dataset_data.max()}") print() + def create_input_master_histograms(df, output_dir="histograms"): """Create master histogram and overlaid histograms for input token lengths.""" - + # Create master histogram with all datasets plt.figure(figsize=(12, 8)) - + # Create histogram for all data - plt.hist(df['tok_input_len'], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, color='lightcoral') - plt.title('Master Histogram: Token Input Length (ISL) - All Datasets', + plt.hist( + df['tok_input_len'], + bins=50, + alpha=0.7, + edgecolor='black', + linewidth=0.5, + color='lightcoral') + plt.title('Master Histogram: Token Input Length (ISL) - All Datasets', fontsize=14, fontweight='bold') plt.xlabel('Token Input Length (ISL)', fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.grid(True, alpha=0.3) - + # Add overall statistics mean_val = df['tok_input_len'].mean() median_val = df['tok_input_len'].median() std_val = df['tok_input_len'].std() min_val = df['tok_input_len'].min() max_val = df['tok_input_len'].max() - + stats_text = f'Total samples: {len(df)}\n' stats_text += f'Mean: {mean_val:.1f}\n' stats_text += f'Median: {median_val:.1f}\n' stats_text += f'Std: {std_val:.1f}\n' stats_text += f'Min: {min_val}\n' stats_text += f'Max: {max_val}' - - plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, + + plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, verticalalignment='top', fontsize=10, bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8)) - + plt.tight_layout() - plt.savefig(f'{output_dir}/master_input_histogram.png', dpi=300, bbox_inches='tight') + plt.savefig( + f'{output_dir}/master_input_histogram.png', + dpi=300, + bbox_inches='tight') plt.show() - + # Create a detailed comparison plot with all datasets overlaid datasets = df['dataset'].unique() plt.figure(figsize=(14, 8)) - + colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) - + for i, dataset in enumerate(datasets): dataset_data = df[df['dataset'] == dataset]['tok_input_len'] - plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', - color=colors[i], edgecolor='black', linewidth=0.3) - - plt.title('Overlaid Histograms: Token Input Length (ISL) by Dataset', + plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', + color=colors[i], edgecolor='black', linewidth=0.3) + + plt.title('Overlaid Histograms: Token Input Length (ISL) by Dataset', fontsize=14, fontweight='bold') plt.xlabel('Token Input Length (ISL)', fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.grid(True, alpha=0.3) plt.tight_layout() - plt.savefig(f'{output_dir}/overlaid_input_histograms.png', dpi=300, bbox_inches='tight') + plt.savefig( + f'{output_dir}/overlaid_input_histograms.png', + dpi=300, + bbox_inches='tight') plt.show() - + # Print summary statistics - print("\n" + "="*60) + print("\n" + "=" * 60) print("INPUT LENGTH SUMMARY STATISTICS") - 
print("="*60) + print("=" * 60) print(f"Overall statistics for tok_input_len:") print(f" Mean: {mean_val:.2f}") print(f" Median: {median_val:.2f}") @@ -251,7 +312,7 @@ def create_input_master_histograms(df, output_dir="histograms"): print(f" Min: {min_val}") print(f" Max: {max_val}") print(f" Total samples: {len(df)}") - + print(f"\nPer-dataset statistics for input lengths:") for dataset in datasets: dataset_data = df[df['dataset'] == dataset]['tok_input_len'] @@ -264,35 +325,41 @@ def create_input_master_histograms(df, output_dir="histograms"): print(f" Max: {dataset_data.max()}") print() + def main(): - parser = argparse.ArgumentParser(description='Create histograms of token lengths (output and input)') + parser = argparse.ArgumentParser( + description='Create histograms of token lengths (output and input)') parser.add_argument('pkl_path', help='Path to the pickle file') - parser.add_argument('--output-dir', default='histograms', help='Output directory for plots') - + parser.add_argument( + '--output-dir', + default='histograms', + help='Output directory for plots') + args = parser.parse_args() - + # Load data df = load_data(args.pkl_path) - + # Create output length histograms - print("\n" + "="*60) + print("\n" + "=" * 60) print("CREATING OUTPUT LENGTH HISTOGRAMS") - print("="*60) + print("=" * 60) create_histograms(df, args.output_dir) - + # Create master output histograms create_master_histograms(df, args.output_dir) - + # Create input length histograms - print("\n" + "="*60) + print("\n" + "=" * 60) print("CREATING INPUT LENGTH HISTOGRAMS") - print("="*60) + print("=" * 60) create_input_histograms(df, args.output_dir) - + # Create master input histograms create_input_master_histograms(df, args.output_dir) - + print(f"\nAll histograms saved to {args.output_dir}/") + if __name__ == "__main__": main() From 782f066ed117172f0ee0212870bc89d5fb39a29c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 8 Oct 2025 11:57:27 -0700 Subject: [PATCH 039/165] updates --- language/gpt-oss/harmonize_inputs.py | 34 +- language/gpt-oss/infer-sglang/run_server.sh | 4 +- language/gpt-oss/plot.py | 441 +++++++------------- language/gpt-oss/requirements.txt | 5 + language/gpt-oss/run_infer.py | 30 +- 5 files changed, 205 insertions(+), 309 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 1e5102cdde..c92abfe438 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -238,8 +238,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): DeveloperContent.new() .with_instructions( "You are an expert test-taker that answers multiple choice questions accurately. " - "Follow the format shown in the examples below. " - "The last line of your response should be in the following format: 'Answer: A/B/C/...' (e.g. 'Answer: A')." + "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) ) @@ -256,7 +255,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "The characteristic of a ring R is the smallest positive integer n such that n·1_R = 0, or 0 if no such positive integer exists.\\n\\nFor the ring 2Z (even integers under addition and multiplication), we need to find the smallest positive integer n such that n·2 = 0 in 2Z. 
However, since 2Z contains all even integers, and n·2 is always even for any positive integer n, we have n·2 ≠ 0 for any positive n in 2Z.\\n\\nTherefore, the characteristic of 2Z is 0.\\n\\nAnswer: A" + "Answer: A" ).with_channel("final"), # Example 2: Linear Algebra - Transformations @@ -266,7 +265,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "Let's compute ST and TS for a general polynomial p(x).\\n\\nFor ST: (ST)(p(x)) = S(T(p(x))) = S(xp(x)) = d/dx(xp(x)) = p(x) + xp'(x)\\n\\nFor TS: (TS)(p(x)) = T(S(p(x))) = T(p'(x)) = xp'(x)\\n\\nNow let's find ST - TS:\\n(ST - TS)(p(x)) = (p(x) + xp'(x)) - xp'(x) = p(x)\\n\\nSince (ST - TS)(p(x)) = p(x) for any polynomial p(x), we have ST - TS = I, where I is the identity transformation.\\n\\nAnswer: H" + "Answer: H" ).with_channel("final"), # Example 3: Number Theory - Diophantine Equations @@ -276,7 +275,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "First, I need to find the general solution to 7m + 12n = 22.\\n\\nUsing the extended Euclidean algorithm: gcd(7,12) = 1, so solutions exist.\\nA particular solution is m₀ = 10, n₀ = -4 (since 7(10) + 12(-4) = 70 - 48 = 22).\\n\\nThe general solution is:\\nm = 10 + 12t\\nn = -4 - 7t\\nfor integer t.\\n\\nSo m + n = (10 + 12t) + (-4 - 7t) = 6 + 5t\\n\\nFor different values of t:\\nt = 0: m + n = 6\\nt = -1: m + n = 1\\nt = -2: m + n = -4\\nt = -3: m + n = -9\\nt = 1: m + n = 11\\n\\nThe greatest negative number is -4.\\n\\nAnswer: E" + "Answer: E" ).with_channel("final"), # Example 4: Differential Equations - Salt Tank Problem @@ -286,7 +285,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "Let S(t) be the amount of salt at time t. The tank volume remains constant at 100L since inflow = outflow = 4 L/min.\\n\\nSalt enters at rate: 4 L/min × 0.02 g/L = 0.08 g/min\\nSalt leaves at rate: 4 L/min × S(t)/100 g/L = 0.04S(t) g/min\\n\\nDifferential equation: dS/dt = 0.08 - 0.04S(t)\\n\\nRearranging: dS/dt + 0.04S = 0.08\\n\\nThis is first-order linear. The integrating factor is e^(0.04t).\\n\\nSolution: S(t) = 2 + Ce^(-0.04t)\\n\\nUsing initial condition S(0) = 3:\\n3 = 2 + C, so C = 1\\n\\nTherefore: S(t) = 2 + e^(-0.04t)\\n\\nAt t = 100: S(100) = 2 + e^(-4) = 2 + e^-4\\n\\nAnswer: I" + "Answer: I" ).with_channel("final"), # Example 5: Basic Arithmetic - Division @@ -296,7 +295,7 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "To find the number of teams when we know the total number of players and the number of players per team, we need to divide the total number of players by the number of players per team.\\n\\nTotal players = 30\\nPlayers per team = 5\\n\\nNumber of teams = Total players ÷ Players per team = 30 ÷ 5 = 6\\n\\nTherefore, we divide 30 by 5 to find 6 teams.\\n\\nAnswer: B" + "Answer: B" ).with_channel("final"), # The actual problem to solve @@ -339,8 +338,7 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): DeveloperContent.new() .with_instructions( "You are an expert in organic chemistry and biochemistry who answers scientific questions accurately. " - "Provide detailed explanations showing your reasoning, then conclude with the answer. " - "The last line of your response should be in the following format: 'Answer: A/B/C/D' (e.g. 
'Answer: A')." + "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) ) @@ -357,7 +355,7 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "Explanation: The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Let me analyze each option:\\n\\n- Morpholino is bound to the pre-mRNA in an antisense orientation (C is involved)\\n- Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded (B is involved)\\n- The spliced RNA is polyadenylated at the 3' end with a polyA tail (A is involved)\\n- R-loops are triple helix structures of DNA and pre-mRNA that are a consequence of RNA transcription, not splicing and RNA maturation\\n\\nR-loops are not involved in the proposed exon-skipping therapy mechanism.\\n\\nAnswer: D" + "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\\n\\nAnswer: D" ).with_channel("final"), # Example 2: Stereochemistry - Optical Activity @@ -367,7 +365,7 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ), Message.from_role_and_content( Role.ASSISTANT, - "Explanation: To exhibit optical activity, a compound must be chiral (lack a mirror plane of symmetry). 
Let me analyze each compound:\\n\\nOptically active (chiral) compounds:\\n- 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene: has chiral center\\n- di(cyclohex-2-en-1-ylidene)methane: no plane of symmetry\\n- 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene: chiral\\n- 8,8-dichlorobicyclo[4.2.0]octan-7-one: chiral bicyclic structure\\n\\nOptically inactive (achiral) compounds:\\n- 2,3,3,3-tetrafluoroprop-1-ene: has mirror plane\\n- 5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene: has mirror plane\\n- [1,1'-biphenyl]-3,3'-diol: has mirror plane\\n- cyclopent-2-en-1-one: has mirror plane\\n\\nTherefore, 4 compounds exhibit optical activity.\\n\\nAnswer: C" + "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\\n\\nAnswer: C" ).with_channel("final"), # The actual problem to solve @@ -428,6 +426,8 @@ def process_row(args): help="Output pickle file path to save the processed data as pandas DataFrame") parser.add_argument("--reasoning-effort", type=str, default="high", help="Reasoning effort to use for the prompt") + parser.add_argument("--dataset", type=str, default=None, + help="Filter to only process rows from this dataset (e.g., 'gpqa', 'mmlu_pro', 'math500', 'aime1983', 'livecodebench')") args = parser.parse_args() df = pd.read_pickle(args.data_file) @@ -437,6 +437,18 @@ def process_row(args): "low": ReasoningEffort.LOW }[args.reasoning_effort.lower()] + # Filter by dataset if specified + if args.dataset is not None: + original_len = len(df) + available_datasets = sorted(df['dataset'].unique().tolist()) + df = df[df['dataset'] == args.dataset].copy() + print(f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") + if len(df) == 0: + print(f"ERROR: No rows found for dataset '{args.dataset}'") + print(f"Available datasets: {available_datasets}") + import sys + sys.exit(1) + # Apply row limit if specified if args.max_rows is not None: df = df.head(args.max_rows) diff --git a/language/gpt-oss/infer-sglang/run_server.sh b/language/gpt-oss/infer-sglang/run_server.sh index 0ec2e3050c..09cc22d17f 100755 --- a/language/gpt-oss/infer-sglang/run_server.sh +++ b/language/gpt-oss/infer-sglang/run_server.sh @@ -1,6 +1,6 @@ #!/bin/bash -dp=1 +dp=2 while [[ $# -gt 0 ]]; do case $1 in @@ -18,7 +18,7 @@ python3 -m sglang.launch_server \ --port 30000 \ --tp-size=1 \ --data-parallel-size=$dp \ - --max-running-requests 256 \ + --max-running-requests $((dp * 512)) \ --mem-fraction-static 0.85 \ --chunked-prefill-size 16384 \ --ep-size=1 \ diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index 6599e1a800..14ce34e228 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -Histogram analysis of token reference output length (OSL) across datasets. -Creates individual histograms per dataset and a master histogram. +Histogram analysis of token input length (ISL) and output length (OSL) across datasets. +Creates 8 histograms as specified. 
""" import pandas as pd @@ -20,345 +20,214 @@ def load_data(pkl_path): return df -def create_histograms(df, output_dir="histograms"): - """Create histograms of tok_model_output_len for each dataset and a master histogram.""" - - # Create output directory - Path(output_dir).mkdir(exist_ok=True) - - # Create histograms for all data - _create_histogram_plots( - df, - output_dir, - "individual_datasets_histograms.png", - "Token Reference Output Length (OSL) - All Data", - "tok_model_output_len") - - # Create histograms for data with prompt_accuracy = 100 - df_100_accuracy = df[df['prompt_accuracy'] == 100] - if len(df_100_accuracy) > 0: - print( - f"\nFiltering for prompt_accuracy = 100: {len(df_100_accuracy)} rows out of {len(df)} total") - _create_histogram_plots( - df_100_accuracy, - output_dir, - "individual_datasets_histograms_100_accuracy.png", - "Token Reference Output Length (OSL) - 100% Accuracy Only", - "tok_model_output_len") - else: - print("\nNo rows found with prompt_accuracy = 100") - - -def create_input_histograms(df, output_dir="histograms"): - """Create histograms of tok_input_len for each dataset and a master histogram.""" - - # Create histograms for all data - _create_histogram_plots( - df, - output_dir, - "individual_datasets_input_histograms.png", - "Token Input Length (ISL) - All Data", - "tok_input_len") - - # Create histograms for data with prompt_accuracy = 100 - df_100_accuracy = df[df['prompt_accuracy'] == 100] - if len(df_100_accuracy) > 0: - print( - f"\nFiltering for prompt_accuracy = 100 (input lengths): {len(df_100_accuracy)} rows out of {len(df)} total") - _create_histogram_plots( - df_100_accuracy, - output_dir, - "individual_datasets_input_histograms_100_accuracy.png", - "Token Input Length (ISL) - 100% Accuracy Only", - "tok_input_len") - else: - print("\nNo rows found with prompt_accuracy = 100 for input lengths") - - -def _create_histogram_plots( - df, output_dir, filename, title_prefix, column_name): - """Helper function to create histogram plots.""" - - # Get unique datasets - datasets = df['dataset'].unique() - print(f"Found {len(datasets)} unique datasets: {datasets}") - - # Set up the plotting style - plt.style.use('default') - sns.set_palette("husl") - - # Create individual histograms for each dataset - fig, axes = plt.subplots(2, 3, figsize=(18, 12)) +def create_per_dataset_histogram(df, column_name, title, filename, output_dir): + """Create individual histograms for each dataset.""" + datasets = sorted(df['dataset'].unique()) + print(f"Creating {filename}...") + print(f" Datasets: {datasets}") + print(f" Total samples: {len(df)}") + + # Determine grid layout + n_datasets = len(datasets) + n_cols = 3 + n_rows = (n_datasets + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows)) + if n_datasets == 1: + axes = np.array([axes]) axes = axes.flatten() - + for i, dataset in enumerate(datasets): - if i >= 6: # Limit to 6 subplots for readability - break - ax = axes[i] dataset_data = df[df['dataset'] == dataset][column_name] - + # Create histogram ax.hist( dataset_data, bins=30, alpha=0.7, edgecolor='black', - linewidth=0.5) + linewidth=0.5, + color='skyblue' if 'OSL' in title else 'lightcoral') + ax.set_title( - f'Dataset: {dataset}\n(n={len(dataset_data)})', + f'{dataset}\n(n={len(dataset_data)})', fontsize=12, fontweight='bold') - - # Set appropriate labels based on column type - if column_name == 'tok_model_output_len': - ax.set_xlabel('Token Reference Output Length (OSL)', fontsize=10) - elif column_name == 
'tok_input_len': - ax.set_xlabel('Token Input Length (ISL)', fontsize=10) - else: - ax.set_xlabel(column_name, fontsize=10) - + ax.set_xlabel(title, fontsize=10) ax.set_ylabel('Frequency', fontsize=10) ax.grid(True, alpha=0.3) - - # Add statistics text + + # Add statistics mean_val = dataset_data.mean() median_val = dataset_data.median() std_val = dataset_data.std() - ax.text(0.02, 0.98, f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}', - transform=ax.transAxes, verticalalignment='top', - bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) - + stats_text = f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}' + ax.text(0.98, 0.98, stats_text, + transform=ax.transAxes, + verticalalignment='top', + horizontalalignment='right', + bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8), + fontsize=9) + # Hide unused subplots - for i in range(len(datasets), 6): + for i in range(n_datasets, len(axes)): axes[i].set_visible(False) - + plt.tight_layout() plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') - plt.show() - - -def create_master_histograms(df, output_dir="histograms"): - """Create master histogram and overlaid histograms.""" - - # Create master histogram with all datasets - plt.figure(figsize=(12, 8)) - - # Create histogram for all data - plt.hist( - df['tok_model_output_len'], - bins=50, - alpha=0.7, - edgecolor='black', - linewidth=0.5, - color='skyblue') - plt.title('Master Histogram: Token Reference Output Length (OSL) - All Datasets', - fontsize=14, fontweight='bold') - plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) - plt.ylabel('Frequency', fontsize=12) - plt.grid(True, alpha=0.3) - - # Add overall statistics - mean_val = df['tok_model_output_len'].mean() - median_val = df['tok_model_output_len'].median() - std_val = df['tok_model_output_len'].std() - min_val = df['tok_model_output_len'].min() - max_val = df['tok_model_output_len'].max() - - stats_text = f'Total samples: {len(df)}\n' - stats_text += f'Mean: {mean_val:.1f}\n' - stats_text += f'Median: {median_val:.1f}\n' - stats_text += f'Std: {std_val:.1f}\n' - stats_text += f'Min: {min_val}\n' - stats_text += f'Max: {max_val}' - - plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, - verticalalignment='top', fontsize=10, - bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8)) - - plt.tight_layout() - plt.savefig( - f'{output_dir}/master_histogram.png', - dpi=300, - bbox_inches='tight') - plt.show() + print(f" Saved to {output_dir}/{filename}") + plt.close() - # Create a detailed comparison plot with all datasets overlaid - datasets = df['dataset'].unique() - plt.figure(figsize=(14, 8)) - colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) - - for i, dataset in enumerate(datasets): - dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] - plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', - color=colors[i], edgecolor='black', linewidth=0.3) - - plt.title('Overlaid Histograms: Token Reference Output Length (OSL) by Dataset', - fontsize=14, fontweight='bold') - plt.xlabel('Token Reference Output Length (OSL)', fontsize=12) - plt.ylabel('Frequency', fontsize=12) - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - plt.grid(True, alpha=0.3) - plt.tight_layout() - plt.savefig( - f'{output_dir}/overlaid_histograms.png', - dpi=300, - bbox_inches='tight') - plt.show() - - # Print summary statistics - print("\n" + "=" * 60) - print("SUMMARY STATISTICS") - print("=" * 60) - 
print(f"Overall statistics for tok_model_output_len:") - print(f" Mean: {mean_val:.2f}") - print(f" Median: {median_val:.2f}") - print(f" Std: {std_val:.2f}") - print(f" Min: {min_val}") - print(f" Max: {max_val}") +def create_full_histogram(df, column_name, title, filename, output_dir): + """Create a single histogram combining all datasets.""" + print(f"Creating {filename}...") print(f" Total samples: {len(df)}") - - print(f"\nPer-dataset statistics:") - for dataset in datasets: - dataset_data = df[df['dataset'] == dataset]['tok_model_output_len'] - print(f" {dataset}:") - print(f" Count: {len(dataset_data)}") - print(f" Mean: {dataset_data.mean():.2f}") - print(f" Median: {dataset_data.median():.2f}") - print(f" Std: {dataset_data.std():.2f}") - print(f" Min: {dataset_data.min()}") - print(f" Max: {dataset_data.max()}") - print() - - -def create_input_master_histograms(df, output_dir="histograms"): - """Create master histogram and overlaid histograms for input token lengths.""" - - # Create master histogram with all datasets + plt.figure(figsize=(12, 8)) - - # Create histogram for all data + + color = 'skyblue' if 'OSL' in title else 'lightcoral' plt.hist( - df['tok_input_len'], + df[column_name], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5, - color='lightcoral') - plt.title('Master Histogram: Token Input Length (ISL) - All Datasets', - fontsize=14, fontweight='bold') - plt.xlabel('Token Input Length (ISL)', fontsize=12) + color=color) + + plt.title(title, fontsize=14, fontweight='bold') + plt.xlabel(column_name.replace('tok_', '').replace('_len', '').upper(), fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.grid(True, alpha=0.3) - - # Add overall statistics - mean_val = df['tok_input_len'].mean() - median_val = df['tok_input_len'].median() - std_val = df['tok_input_len'].std() - min_val = df['tok_input_len'].min() - max_val = df['tok_input_len'].max() - + + # Add statistics + mean_val = df[column_name].mean() + median_val = df[column_name].median() + std_val = df[column_name].std() + min_val = df[column_name].min() + max_val = df[column_name].max() + stats_text = f'Total samples: {len(df)}\n' stats_text += f'Mean: {mean_val:.1f}\n' stats_text += f'Median: {median_val:.1f}\n' stats_text += f'Std: {std_val:.1f}\n' stats_text += f'Min: {min_val}\n' stats_text += f'Max: {max_val}' - - plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, - verticalalignment='top', fontsize=10, - bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8)) - - plt.tight_layout() - plt.savefig( - f'{output_dir}/master_input_histogram.png', - dpi=300, - bbox_inches='tight') - plt.show() - - # Create a detailed comparison plot with all datasets overlaid - datasets = df['dataset'].unique() - plt.figure(figsize=(14, 8)) - - colors = plt.cm.tab10(np.linspace(0, 1, len(datasets))) - - for i, dataset in enumerate(datasets): - dataset_data = df[df['dataset'] == dataset]['tok_input_len'] - plt.hist(dataset_data, bins=30, alpha=0.6, label=f'{dataset} (n={len(dataset_data)})', - color=colors[i], edgecolor='black', linewidth=0.3) - - plt.title('Overlaid Histograms: Token Input Length (ISL) by Dataset', - fontsize=14, fontweight='bold') - plt.xlabel('Token Input Length (ISL)', fontsize=12) - plt.ylabel('Frequency', fontsize=12) - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - plt.grid(True, alpha=0.3) + + plt.text(0.98, 0.98, stats_text, + transform=plt.gca().transAxes, + verticalalignment='top', + horizontalalignment='right', + fontsize=10, + bbox=dict(boxstyle='round', 
facecolor='lightblue' if 'OSL' in title else 'lightcoral', alpha=0.8)) + plt.tight_layout() - plt.savefig( - f'{output_dir}/overlaid_input_histograms.png', - dpi=300, - bbox_inches='tight') - plt.show() - - # Print summary statistics - print("\n" + "=" * 60) - print("INPUT LENGTH SUMMARY STATISTICS") - print("=" * 60) - print(f"Overall statistics for tok_input_len:") - print(f" Mean: {mean_val:.2f}") - print(f" Median: {median_val:.2f}") - print(f" Std: {std_val:.2f}") - print(f" Min: {min_val}") - print(f" Max: {max_val}") - print(f" Total samples: {len(df)}") - - print(f"\nPer-dataset statistics for input lengths:") - for dataset in datasets: - dataset_data = df[df['dataset'] == dataset]['tok_input_len'] - print(f" {dataset}:") - print(f" Count: {len(dataset_data)}") - print(f" Mean: {dataset_data.mean():.2f}") - print(f" Median: {dataset_data.median():.2f}") - print(f" Std: {dataset_data.std():.2f}") - print(f" Min: {dataset_data.min()}") - print(f" Max: {dataset_data.max()}") - print() + plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') + print(f" Saved to {output_dir}/{filename}") + plt.close() def main(): parser = argparse.ArgumentParser( - description='Create histograms of token lengths (output and input)') + description='Create histograms of token lengths (ISL and OSL)') parser.add_argument('pkl_path', help='Path to the pickle file') parser.add_argument( '--output-dir', default='histograms', help='Output directory for plots') - + args = parser.parse_args() - + + # Create output directory + Path(args.output_dir).mkdir(exist_ok=True) + # Load data df = load_data(args.pkl_path) - - # Create output length histograms - print("\n" + "=" * 60) - print("CREATING OUTPUT LENGTH HISTOGRAMS") + + # Filter for 100% accuracy + df_100 = df[df['prompt_accuracy'] == 100.0] + print(f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") + print("=" * 60) - create_histograms(df, args.output_dir) - - # Create master output histograms - create_master_histograms(df, args.output_dir) - - # Create input length histograms + print("CREATING ISL HISTOGRAMS") + print("=" * 60) + + # 1. Per dataset ISL histogram + create_per_dataset_histogram( + df, 'tok_input_len', + 'Token Input Length (ISL)', + '1_per_dataset_ISL.png', + args.output_dir) + + # 2. Per dataset ISL histogram (accuracy == 100) + if len(df_100) > 0: + create_per_dataset_histogram( + df_100, 'tok_input_len', + 'Token Input Length (ISL) - 100% Accuracy', + '2_per_dataset_ISL_acc100.png', + args.output_dir) + else: + print("Skipping per-dataset ISL (acc==100): no data") + + # 3. Full ISL histogram + create_full_histogram( + df, 'tok_input_len', + 'Token Input Length (ISL) - All Data', + '3_full_ISL.png', + args.output_dir) + + # 4. Full ISL histogram (accuracy == 100) + if len(df_100) > 0: + create_full_histogram( + df_100, 'tok_input_len', + 'Token Input Length (ISL) - 100% Accuracy', + '4_full_ISL_acc100.png', + args.output_dir) + else: + print("Skipping full ISL (acc==100): no data") + print("\n" + "=" * 60) - print("CREATING INPUT LENGTH HISTOGRAMS") + print("CREATING OSL HISTOGRAMS") print("=" * 60) - create_input_histograms(df, args.output_dir) - - # Create master input histograms - create_input_master_histograms(df, args.output_dir) - - print(f"\nAll histograms saved to {args.output_dir}/") + + # 5. Per dataset OSL histogram + create_per_dataset_histogram( + df, 'tok_model_output_len', + 'Token Output Length (OSL)', + '5_per_dataset_OSL.png', + args.output_dir) + + # 6. 
Per dataset OSL histogram (accuracy == 100) + if len(df_100) > 0: + create_per_dataset_histogram( + df_100, 'tok_model_output_len', + 'Token Output Length (OSL) - 100% Accuracy', + '6_per_dataset_OSL_acc100.png', + args.output_dir) + else: + print("Skipping per-dataset OSL (acc==100): no data") + + # 7. Full OSL histogram + create_full_histogram( + df, 'tok_model_output_len', + 'Token Output Length (OSL) - All Data', + '7_full_OSL.png', + args.output_dir) + + # 8. Full OSL histogram (accuracy == 100) + if len(df_100) > 0: + create_full_histogram( + df_100, 'tok_model_output_len', + 'Token Output Length (OSL) - 100% Accuracy', + '8_full_OSL_acc100.png', + args.output_dir) + else: + print("Skipping full OSL (acc==100): no data") + + print(f"\n{'=' * 60}") + print(f"All histograms saved to {args.output_dir}/") + print(f"{'=' * 60}") if __name__ == "__main__": diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index 3096b113f6..a75b9ee0e2 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -3,3 +3,8 @@ pandas>=1.3.0 requests>=2.25.0 transformers>=4.20.0 torch>=1.12.0 +audioread>=2.1.9 +msgpack>=1.0 +numba>=0.51.0 +pooch>=1.0 +soxr>=0.3.2 \ No newline at end of file diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index e78aa33607..bf2da5918a 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -47,9 +47,11 @@ def get_tokenizer(): class SGLangClient: - def __init__(self, base_url: str = "http://localhost:30000"): + def __init__(self, base_url: str = "http://localhost:30000", temperature: float = 0.001, top_k: int = 1): self.base_url = base_url self.session = requests.Session() + self.temperature = temperature + self.top_k = top_k def send_request( self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: @@ -59,8 +61,8 @@ def send_request( "input_ids": input_ids, "sampling_params": { "max_new_tokens": max_tokens, - "temperature": 0.9, - "top_k": 1, + "temperature": self.temperature, + "top_k": self.top_k, } } @@ -119,10 +121,10 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" - input_ids, max_tokens, server_url, sample_id = args_tuple + input_ids, max_tokens, server_url, sample_id, temperature, top_k = args_tuple # Create a new client for this process - client = SGLangClient(server_url) + client = SGLangClient(server_url, temperature=temperature, top_k=top_k) try: response = client.send_request(input_ids, max_tokens=max_tokens) @@ -133,7 +135,7 @@ def send_single_request(args_tuple): def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, - max_tokens: int = 100, max_concurrency: int = 128) -> List[Dict[str, Any]]: + max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1) -> List[Dict[str, Any]]: """Send all requests to SGLang server in parallel using multiprocessing.""" num_samples = len(tokenized_df) logger.info( @@ -141,7 +143,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, # Prepare arguments for multiprocessing args_list = [ - (row['tok_input'], max_tokens, server_url, idx) + (row['tok_input'], max_tokens, server_url, idx, temperature, top_k) for idx, row in tokenized_df.iterrows() ] @@ -265,7 +267,7 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = 
None, max_tokens: int = 100, - max_concurrency: int = 128, output_file: str = None) -> pd.DataFrame: + max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1) -> pd.DataFrame: """Main processing function that handles requests and response extraction.""" # Step 1: Limit samples if specified @@ -278,7 +280,9 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, tokenized_df, server_url, max_tokens, - max_concurrency) + max_concurrency, + temperature, + top_k) # Step 3: Extract response output_ids response_ids = extract_response_ids(responses, tokenized_df) @@ -312,6 +316,10 @@ def main(): help="Maximum number of concurrent requests (default: 128)") parser.add_argument("--output", default=None, help="Output pickle file for responses (optional)") + parser.add_argument("--temperature", type=float, default=0.001, + help="Temperature for sampling (default: 0.001)") + parser.add_argument("--top-k", type=int, default=1, + help="Top-k for sampling (default: 1)") args = parser.parse_args() @@ -335,7 +343,9 @@ def main(): max_samples=args.max_samples, max_tokens=args.max_tokens, max_concurrency=args.max_concurrency, - output_file=args.output) + output_file=args.output, + temperature=args.temperature, + top_k=args.top_k) # Print summary logger.info(f"\nProcessing completed:") From 326a5fadf060aba0b0864322e631257575a2cfed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 8 Oct 2025 18:59:18 +0000 Subject: [PATCH 040/165] [Automated Commit] Format Codebase --- language/gpt-oss/harmonize_inputs.py | 3 +- language/gpt-oss/plot.py | 71 +++++++++++++++------------- language/gpt-oss/run_infer.py | 3 +- 3 files changed, 43 insertions(+), 34 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index c92abfe438..cc114e45a0 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -442,7 +442,8 @@ def process_row(args): original_len = len(df) available_datasets = sorted(df['dataset'].unique().tolist()) df = df[df['dataset'] == args.dataset].copy() - print(f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") + print( + f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") if len(df) == 0: print(f"ERROR: No rows found for dataset '{args.dataset}'") print(f"Available datasets: {available_datasets}") diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index 14ce34e228..d24dbff870 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -26,21 +26,21 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): print(f"Creating {filename}...") print(f" Datasets: {datasets}") print(f" Total samples: {len(df)}") - + # Determine grid layout n_datasets = len(datasets) n_cols = 3 n_rows = (n_datasets + n_cols - 1) // n_cols - + fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows)) if n_datasets == 1: axes = np.array([axes]) axes = axes.flatten() - + for i, dataset in enumerate(datasets): ax = axes[i] dataset_data = df[df['dataset'] == dataset][column_name] - + # Create histogram ax.hist( dataset_data, @@ -49,7 +49,7 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): edgecolor='black', linewidth=0.5, color='skyblue' if 'OSL' in title else 'lightcoral') - + ax.set_title( f'{dataset}\n(n={len(dataset_data)})', fontsize=12, @@ -57,7 +57,7 @@ def create_per_dataset_histogram(df, column_name, title, filename, 
output_dir): ax.set_xlabel(title, fontsize=10) ax.set_ylabel('Frequency', fontsize=10) ax.grid(True, alpha=0.3) - + # Add statistics mean_val = dataset_data.mean() median_val = dataset_data.median() @@ -69,11 +69,11 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8), fontsize=9) - + # Hide unused subplots for i in range(n_datasets, len(axes)): axes[i].set_visible(False) - + plt.tight_layout() plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') print(f" Saved to {output_dir}/{filename}") @@ -84,9 +84,9 @@ def create_full_histogram(df, column_name, title, filename, output_dir): """Create a single histogram combining all datasets.""" print(f"Creating {filename}...") print(f" Total samples: {len(df)}") - + plt.figure(figsize=(12, 8)) - + color = 'skyblue' if 'OSL' in title else 'lightcoral' plt.hist( df[column_name], @@ -95,33 +95,39 @@ def create_full_histogram(df, column_name, title, filename, output_dir): edgecolor='black', linewidth=0.5, color=color) - + plt.title(title, fontsize=14, fontweight='bold') - plt.xlabel(column_name.replace('tok_', '').replace('_len', '').upper(), fontsize=12) + plt.xlabel( + column_name.replace( + 'tok_', + '').replace( + '_len', + '').upper(), + fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.grid(True, alpha=0.3) - + # Add statistics mean_val = df[column_name].mean() median_val = df[column_name].median() std_val = df[column_name].std() min_val = df[column_name].min() max_val = df[column_name].max() - + stats_text = f'Total samples: {len(df)}\n' stats_text += f'Mean: {mean_val:.1f}\n' stats_text += f'Median: {median_val:.1f}\n' stats_text += f'Std: {std_val:.1f}\n' stats_text += f'Min: {min_val}\n' stats_text += f'Max: {max_val}' - + plt.text(0.98, 0.98, stats_text, transform=plt.gca().transAxes, verticalalignment='top', horizontalalignment='right', fontsize=10, bbox=dict(boxstyle='round', facecolor='lightblue' if 'OSL' in title else 'lightcoral', alpha=0.8)) - + plt.tight_layout() plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') print(f" Saved to {output_dir}/{filename}") @@ -136,30 +142,31 @@ def main(): '--output-dir', default='histograms', help='Output directory for plots') - + args = parser.parse_args() - + # Create output directory Path(args.output_dir).mkdir(exist_ok=True) - + # Load data df = load_data(args.pkl_path) - + # Filter for 100% accuracy df_100 = df[df['prompt_accuracy'] == 100.0] - print(f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") - + print( + f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") + print("=" * 60) print("CREATING ISL HISTOGRAMS") print("=" * 60) - + # 1. Per dataset ISL histogram create_per_dataset_histogram( df, 'tok_input_len', 'Token Input Length (ISL)', '1_per_dataset_ISL.png', args.output_dir) - + # 2. Per dataset ISL histogram (accuracy == 100) if len(df_100) > 0: create_per_dataset_histogram( @@ -169,14 +176,14 @@ def main(): args.output_dir) else: print("Skipping per-dataset ISL (acc==100): no data") - + # 3. Full ISL histogram create_full_histogram( df, 'tok_input_len', 'Token Input Length (ISL) - All Data', '3_full_ISL.png', args.output_dir) - + # 4. 
Full ISL histogram (accuracy == 100) if len(df_100) > 0: create_full_histogram( @@ -186,18 +193,18 @@ def main(): args.output_dir) else: print("Skipping full ISL (acc==100): no data") - + print("\n" + "=" * 60) print("CREATING OSL HISTOGRAMS") print("=" * 60) - + # 5. Per dataset OSL histogram create_per_dataset_histogram( df, 'tok_model_output_len', 'Token Output Length (OSL)', '5_per_dataset_OSL.png', args.output_dir) - + # 6. Per dataset OSL histogram (accuracy == 100) if len(df_100) > 0: create_per_dataset_histogram( @@ -207,14 +214,14 @@ def main(): args.output_dir) else: print("Skipping per-dataset OSL (acc==100): no data") - + # 7. Full OSL histogram create_full_histogram( df, 'tok_model_output_len', 'Token Output Length (OSL) - All Data', '7_full_OSL.png', args.output_dir) - + # 8. Full OSL histogram (accuracy == 100) if len(df_100) > 0: create_full_histogram( @@ -224,7 +231,7 @@ def main(): args.output_dir) else: print("Skipping full OSL (acc==100): no data") - + print(f"\n{'=' * 60}") print(f"All histograms saved to {args.output_dir}/") print(f"{'=' * 60}") diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index bf2da5918a..4a9a2e24f6 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -47,7 +47,8 @@ def get_tokenizer(): class SGLangClient: - def __init__(self, base_url: str = "http://localhost:30000", temperature: float = 0.001, top_k: int = 1): + def __init__(self, base_url: str = "http://localhost:30000", + temperature: float = 0.001, top_k: int = 1): self.base_url = base_url self.session = requests.Session() self.temperature = temperature From 1b99263bef9cbeb06f77a9eb62e4f80f8a238b48 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 8 Oct 2025 12:51:20 -0700 Subject: [PATCH 041/165] add more opts --- language/gpt-oss/run_infer.py | 54 +++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index 4a9a2e24f6..0f0991becc 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -47,12 +47,17 @@ def get_tokenizer(): class SGLangClient: - def __init__(self, base_url: str = "http://localhost:30000", - temperature: float = 0.001, top_k: int = 1): - self.base_url = base_url + def __init__(self, + server_url: str = "http://localhost:30000", + temperature: float = 0.001, + top_k: int = 1, + timeout: int = 1200 + ): + self.base_url = server_url self.session = requests.Session() self.temperature = temperature self.top_k = top_k + self.timeout = timeout def send_request( self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: @@ -71,7 +76,7 @@ def send_request( response = self.session.post( f"{self.base_url}/generate", json=payload, - timeout=1200 + timeout=self.timeout, ) if response.status_code == 200: return response.json() @@ -122,10 +127,10 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" - input_ids, max_tokens, server_url, sample_id, temperature, top_k = args_tuple + input_ids, max_tokens, server_url, sample_id, temperature, top_k, timeout = args_tuple # Create a new client for this process - client = SGLangClient(server_url, temperature=temperature, top_k=top_k) + client = SGLangClient(server_url=server_url, temperature=temperature, top_k=top_k, timeout=timeout) try: response = client.send_request(input_ids, max_tokens=max_tokens) @@ -136,7 +141,7 @@ def 
send_single_request(args_tuple): def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, - max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1) -> List[Dict[str, Any]]: + max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200) -> List[Dict[str, Any]]: """Send all requests to SGLang server in parallel using multiprocessing.""" num_samples = len(tokenized_df) logger.info( @@ -144,15 +149,13 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, # Prepare arguments for multiprocessing args_list = [ - (row['tok_input'], max_tokens, server_url, idx, temperature, top_k) + (row['tok_input'], max_tokens, server_url, idx, temperature, top_k, timeout) for idx, row in tokenized_df.iterrows() ] start_time = time.time() - # Use multiprocessing pool with progress bar with Pool(processes=min(max_concurrency, num_samples)) as pool: - # Map the function to all arguments with progress bar results = list(tqdm( pool.imap(send_single_request, args_list), total=len(args_list), @@ -239,16 +242,12 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] output_token_lengths = [] for i, (response, response_ids) in enumerate( zip(responses, response_ids)): - if "error" not in response and "meta_info" in response: - try: - # Use the completion_tokens from meta_info - output_token_lengths.append( - response["meta_info"]["completion_tokens"]) - except Exception as e: - logger.warning( - f"Failed to calculate output tokens for sample {i+1}: {e}") - output_token_lengths.append(len(response_ids)) - else: + try: + output_token_length = response["meta_info"]["completion_tokens"] if "meta_info" in response else len(response_ids) + output_token_lengths.append(output_token_length) + except Exception as e: + logger.warning( + f"Failed to calculate output tokens for sample {i+1}: {e}") output_token_lengths.append(len(response_ids)) logger.info(f"Updated DataFrame with shape: {result_df.shape}") @@ -268,7 +267,8 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1) -> pd.DataFrame: + max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, + timeout: int = 1200) -> pd.DataFrame: """Main processing function that handles requests and response extraction.""" # Step 1: Limit samples if specified @@ -283,7 +283,8 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_tokens, max_concurrency, temperature, - top_k) + top_k, + timeout) # Step 3: Extract response output_ids response_ids = extract_response_ids(responses, tokenized_df) @@ -321,13 +322,15 @@ def main(): help="Temperature for sampling (default: 0.001)") parser.add_argument("--top-k", type=int, default=1, help="Top-k for sampling (default: 1)") + parser.add_argument("--timeout", type=int, default=1200, + help="Timeout for requests (default: 1200)") args = parser.parse_args() # Test connection logger.info(f"Testing server connection to {args.server_url}...") - test_client = SGLangClient(args.server_url) - test_response = test_client.send_request([1, 2, 3], max_tokens=5) + test_client = SGLangClient(server_url=args.server_url, temperature=args.temperature, top_k=args.top_k, timeout=args.timeout) + test_response = 
test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") logger.error("Make sure your SGLang server is running. Try:") @@ -346,7 +349,8 @@ def main(): max_concurrency=args.max_concurrency, output_file=args.output, temperature=args.temperature, - top_k=args.top_k) + top_k=args.top_k, + timeout=args.timeout) # Print summary logger.info(f"\nProcessing completed:") From 579ef1c0127790b61091827e71e4c5cbdd1292e3 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 13 Oct 2025 11:19:45 -0700 Subject: [PATCH 042/165] move --- language/gpt-oss/{infer-sglang => }/run_server.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{infer-sglang => }/run_server.sh (100%) diff --git a/language/gpt-oss/infer-sglang/run_server.sh b/language/gpt-oss/run_server.sh similarity index 100% rename from language/gpt-oss/infer-sglang/run_server.sh rename to language/gpt-oss/run_server.sh From ec8d12eb2cbd21b1c1698aae16272fa8ab627174 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Oct 2025 18:20:45 +0000 Subject: [PATCH 043/165] [Automated Commit] Format Codebase --- language/gpt-oss/run_infer.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index 0f0991becc..d0bf06bd33 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -48,11 +48,11 @@ def get_tokenizer(): class SGLangClient: def __init__(self, - server_url: str = "http://localhost:30000", - temperature: float = 0.001, - top_k: int = 1, - timeout: int = 1200 - ): + server_url: str = "http://localhost:30000", + temperature: float = 0.001, + top_k: int = 1, + timeout: int = 1200 + ): self.base_url = server_url self.session = requests.Session() self.temperature = temperature @@ -130,7 +130,11 @@ def send_single_request(args_tuple): input_ids, max_tokens, server_url, sample_id, temperature, top_k, timeout = args_tuple # Create a new client for this process - client = SGLangClient(server_url=server_url, temperature=temperature, top_k=top_k, timeout=timeout) + client = SGLangClient( + server_url=server_url, + temperature=temperature, + top_k=top_k, + timeout=timeout) try: response = client.send_request(input_ids, max_tokens=max_tokens) @@ -149,7 +153,8 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, # Prepare arguments for multiprocessing args_list = [ - (row['tok_input'], max_tokens, server_url, idx, temperature, top_k, timeout) + (row['tok_input'], max_tokens, server_url, + idx, temperature, top_k, timeout) for idx, row in tokenized_df.iterrows() ] @@ -243,7 +248,8 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] for i, (response, response_ids) in enumerate( zip(responses, response_ids)): try: - output_token_length = response["meta_info"]["completion_tokens"] if "meta_info" in response else len(response_ids) + output_token_length = response["meta_info"]["completion_tokens"] if "meta_info" in response else len( + response_ids) output_token_lengths.append(output_token_length) except Exception as e: logger.warning( @@ -329,7 +335,11 @@ def main(): # Test connection logger.info(f"Testing server connection to {args.server_url}...") - test_client = SGLangClient(server_url=args.server_url, temperature=args.temperature, top_k=args.top_k, timeout=args.timeout) + test_client = SGLangClient( + server_url=args.server_url, + 
temperature=args.temperature, + top_k=args.top_k, + timeout=args.timeout) test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") From a9de6f4e4529b078e684cd830b85a04ba2add5e4 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 13 Oct 2025 20:45:11 +0000 Subject: [PATCH 044/165] refactor --- language/gpt-oss/harmonize_inputs.py | 434 ++++++++++----------------- 1 file changed, 160 insertions(+), 274 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index cc114e45a0..2695b68287 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -23,18 +23,20 @@ ReasoningEffort ) +MOD_PROMPT = "Do not repeat steps and output the final answer immediately once you have it. Once you have a candidate answer, do not spend more than ~100 tokens to verify it - instead, do a quick check and answer immediately. Avoid thinking for a long time. It is important to answer in as few tokens as possible." -def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a multi-shot prompt for mathematical problem solving using Harmony format. +def _create_base_messages(reasoning_effort, instructions): + """ + Creates system and developer messages for a conversation. + + Args: + reasoning_effort: ReasoningEffort enum value + instructions: String containing developer instructions + Returns: - tuple: (conversation_object, token_list) ready for model completion + list: List containing system and developer messages """ - - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - # Create the system message with required channels system_message = ( SystemContent.new() @@ -44,85 +46,107 @@ def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): ) # Create the developer message with instructions - developer_message = ( - DeveloperContent.new() - .with_instructions( - "You are a math expert that solves problems step-by-step. " - "Always show your work clearly and put your final answer in \\boxed{answer} format. " - "Follow the format shown in the examples below." - ) - ) + developer_message = DeveloperContent.new().with_instructions(instructions) - # Create the conversation with multi-shot examples - messages = [ - # System and developer setup + return [ Message.from_role_and_content(Role.SYSTEM, system_message), Message.from_role_and_content(Role.DEVELOPER, developer_message), + ] + +def _add_multishot_examples(messages, examples): + """ + Adds multi-shot examples to a message list. + + Args: + messages: List of messages to append examples to + examples: List of tuples (user_content, assistant_content) representing examples + + Returns: + list: Updated messages list with examples added + """ + for user_content, assistant_content in examples: + messages.append(Message.from_role_and_content(Role.USER, user_content)) + messages.append( + Message.from_role_and_content(Role.ASSISTANT, assistant_content).with_channel("final") + ) + return messages + + +def _finalize_conversation(messages, user_query): + """ + Adds the user query, creates the conversation, and renders tokens. 
+ + Args: + messages: List of messages (system, developer, and optionally examples) + user_query: The actual user query to solve + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + # Load the Harmony encoding for gpt-oss models + enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + # Add the actual problem to solve + messages.append(Message.from_role_and_content(Role.USER, user_query)) + + # Create the conversation + convo = Conversation.from_messages(messages) + + # Render the conversation for completion (ready to send to the model) + tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) + + return convo, tokens + + +def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): + """ + Creates a multi-shot prompt for mathematical problem solving using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + instructions = ( + "You are a math expert that solves problems step-by-step. " + "Always show your work clearly and put your final answer in \\boxed{answer} format. " + f"{MOD_PROMPT} " + "Follow the format shown in the examples below. " + ) + + messages = _create_base_messages(reasoning_effort, instructions) + + # Define multi-shot examples + examples = [ # Example 1: Square areas and side lengths - Message.from_role_and_content( - Role.USER, - "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?" - ), - Message.from_role_and_content( - Role.ASSISTANT, + ( + "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?", "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\\n\\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" - ).with_channel("final"), - - # Example 2: Floor function equation - Message.from_role_and_content( - Role.USER, - "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 2: Floor function equation + ( + "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$", "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\\n\\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\\n\\[2n = n + 3,\\]which means $n = 3.$\\n\\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\\n\\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" - ).with_channel("final"), - - # Example 3: Sequences and differences - Message.from_role_and_content( - Role.USER, - "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. 
Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\\n\\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\\n\\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 3: Sequences and differences + ( + "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\\n\\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\\n\\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$", "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$" - ).with_channel("final"), - - # Example 4: Probability and Deal or No Deal - Message.from_role_and_content( - Role.USER, - "In the game Deal or No Deal, participants choose a box at random from a set of $26,$ one containing each of the following values: \\begin{tabular}{|c|c|}\\hline\\$.01&\\$1,000\\\\\\hline\\$1&\\$5,000\\\\\\hline\\$5&\\$10,000\\\\\\hline\\$10&\\$25,000\\\\\\hline\\$25&\\$50,000\\\\\\hline\\$50&\\$75,000\\\\\\hline\\$75&\\$100,000\\\\\\hline\\$100&\\$200,000\\\\\\hline\\$200&\\$300,000\\\\\\hline\\$300&\\$400,000\\\\\\hline\\$400&\\$500,000\\\\\\hline\\$500&\\$750,000\\\\\\hline\\$750&\\$1,000,000\\\\\\hline\\end{tabular} After choosing a box, participants eliminate other boxes by opening them, showing the amount of money in the box to the crowd, and then removing that box (and its money!) from the game. What is the minimum number of boxes a participant needs to eliminate in order to have a half chance of holding at least $\\$100,\\!000$ as his or her chosen box?" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 4: Probability and Deal or No Deal + ( + "In the game Deal or No Deal, participants choose a box at random from a set of $26,$ one containing each of the following values: \\begin{tabular}{|c|c|}\\hline\\$.01&\\$1,000\\\\\\hline\\$1&\\$5,000\\\\\\hline\\$5&\\$10,000\\\\\\hline\\$10&\\$25,000\\\\\\hline\\$25&\\$50,000\\\\\\hline\\$50&\\$75,000\\\\\\hline\\$75&\\$100,000\\\\\\hline\\$100&\\$200,000\\\\\\hline\\$200&\\$300,000\\\\\\hline\\$300&\\$400,000\\\\\\hline\\$400&\\$500,000\\\\\\hline\\$500&\\$750,000\\\\\\hline\\$750&\\$1,000,000\\\\\\hline\\end{tabular} After choosing a box, participants eliminate other boxes by opening them, showing the amount of money in the box to the crowd, and then removing that box (and its money!) from the game. What is the minimum number of boxes a participant needs to eliminate in order to have a half chance of holding at least $\\$100,\\!000$ as his or her chosen box?", "Seven of the boxes contain at least this amount. If a participant is going to be holding one of these boxes with a probability of $1/2,$ there can be at most $7$ other boxes left. 
This means that at least $26-7-7=\\boxed{12}$ boxes must be eliminated." - ).with_channel("final"), - - # Example 5: Domain of composite function - Message.from_role_and_content( - Role.USER, - "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 5: Domain of composite function + ( + "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$", "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\\n\\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" - ).with_channel("final"), - - # The actual problem to solve - Message.from_role_and_content( - Role.USER, - user_query, - ) + ), ] - - # Create the conversation - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens + + _add_multishot_examples(messages, examples) + + return _finalize_conversation(messages, user_query) def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): @@ -132,41 +156,15 @@ def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): Returns: tuple: (conversation_object, token_list) ready for model completion """ - - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - - # Create the system message with required channels - system_message = ( - SystemContent.new() - .with_reasoning_effort(reasoning_effort) - .with_conversation_start_date("2025-09-30") - .with_required_channels(["analysis", "commentary", "final"]) - ) - - # Create the developer message with instructions - developer_message = ( - DeveloperContent.new() - .with_instructions( - "You are a math expert that solves problems step-by-step. " - "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}." - ) + instructions = ( + "You are a math expert that solves problems step-by-step. " + f"{MOD_PROMPT} " + "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}." 
) - - messages = [ - # System and developer setup - Message.from_role_and_content(Role.SYSTEM, system_message), - Message.from_role_and_content(Role.DEVELOPER, developer_message), - Message.from_role_and_content(Role.USER, user_query), - ] - - # Create the conversation - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens + + messages = _create_base_messages(reasoning_effort, instructions) + + return _finalize_conversation(messages, user_query) def create_livecodebench_prompt( @@ -177,41 +175,16 @@ def create_livecodebench_prompt( Returns: tuple: (conversation_object, token_list) ready for model completion """ - - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - - # Create the system message with required channels - system_message = ( - SystemContent.new() - .with_reasoning_effort(reasoning_effort) - .with_conversation_start_date("2025-09-30") - .with_required_channels(["analysis", "commentary", "final"]) + instructions = ( + "You are a python coding expert that solves problems step-by-step. " + "You must provide the reasoning to arriving at your solution and the code to solve the problem." + f"{MOD_PROMPT} " + "The code should be enclosed within ```python delimiters." ) - - # Create the developer message with instructions - developer_message = ( - DeveloperContent.new() - .with_instructions( - "You are a python coding expert that solves problems step-by-step. " - "You must provide the reasoning to arriving at your solution and the code to solve the problem." - "The code should be enclosed within ```python delimiters." - ) - ) - - messages = [ - # System and developer setup - Message.from_role_and_content(Role.SYSTEM, system_message), - Message.from_role_and_content(Role.DEVELOPER, developer_message), - Message.from_role_and_content(Role.USER, user_query), - ] - - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens + + messages = _create_base_messages(reasoning_effort, instructions) + + return _finalize_conversation(messages, user_query) def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): @@ -221,97 +194,46 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): Returns: tuple: (conversation_object, token_list) ready for model completion """ - - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - - # Create the system message with required channels - system_message = ( - SystemContent.new() - .with_reasoning_effort(reasoning_effort) - .with_conversation_start_date("2025-09-30") - .with_required_channels(["analysis", "commentary", "final"]) - ) - - # Create the developer message with instructions - developer_message = ( - DeveloperContent.new() - .with_instructions( - "You are an expert test-taker that answers multiple choice questions accurately. " - "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." - ) + instructions = ( + "You are an expert test-taker that answers multiple choice questions accurately. 
" + f"{MOD_PROMPT} " + "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) - - # Create the conversation with multi-shot examples - messages = [ - # System and developer setup - Message.from_role_and_content(Role.SYSTEM, system_message), - Message.from_role_and_content(Role.DEVELOPER, developer_message), - + + messages = _create_base_messages(reasoning_effort, instructions) + + # Define multi-shot examples + examples = [ # Example 1: Abstract Algebra - Ring Theory - Message.from_role_and_content( - Role.USER, - "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\\nFind the characteristic of the ring 2Z.\\nA) 0\\nB) 30\\nC) 3\\nD) 10\\nE) 12\\nF) 50\\nG) 2\\nH) 100\\nI) 20\\nJ) 5" - ), - Message.from_role_and_content( - Role.ASSISTANT, + ( + "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\\nFind the characteristic of the ring 2Z.\\nA) 0\\nB) 30\\nC) 3\\nD) 10\\nE) 12\\nF) 50\\nG) 2\\nH) 100\\nI) 20\\nJ) 5", "Answer: A" - ).with_channel("final"), - - # Example 2: Linear Algebra - Transformations - Message.from_role_and_content( - Role.USER, - "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\\nA) ST + TS is the identity map of V onto itself.\\nB) TS = 0\\nC) ST = 1\\nD) ST - TS = 0\\nE) ST = T\\nF) ST = 0\\nG) ST = TS\\nH) ST - TS is the identity map of V onto itself.\\nI) TS = T\\nJ) ST = S" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 2: Linear Algebra - Transformations + ( + "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\\nA) ST + TS is the identity map of V onto itself.\\nB) TS = 0\\nC) ST = 1\\nD) ST - TS = 0\\nE) ST = T\\nF) ST = 0\\nG) ST = TS\\nH) ST - TS is the identity map of V onto itself.\\nI) TS = T\\nJ) ST = S", "Answer: H" - ).with_channel("final"), - - # Example 3: Number Theory - Diophantine Equations - Message.from_role_and_content( - Role.USER, - "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\\nA) -5\\nB) 0\\nC) -3\\nD) -7\\nE) -4\\nF) -6\\nG) -1\\nH) -2\\nI) -9\\nJ) N/A" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 3: Number Theory - Diophantine Equations + ( + "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\\nA) -5\\nB) 0\\nC) -3\\nD) -7\\nE) -4\\nF) -6\\nG) -1\\nH) -2\\nI) -9\\nJ) N/A", "Answer: E" - ).with_channel("final"), - - # Example 4: Differential Equations - Salt Tank Problem - Message.from_role_and_content( - Role.USER, - "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. 
If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\\nA) 3 + e^-2\\nB) 2 - e^-4\\nC) 2 - e^-2\\nD) 3 + e^-4\\nE) 2 + e^-3\\nF) 2 - e^-3\\nG) 3 - e^-2\\nH) 2 + e^-2\\nI) 2 + e^-4\\nJ) 2" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 4: Differential Equations - Salt Tank Problem + ( + "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\\nA) 3 + e^-2\\nB) 2 - e^-4\\nC) 2 - e^-2\\nD) 3 + e^-4\\nE) 2 + e^-3\\nF) 2 - e^-3\\nG) 3 - e^-2\\nH) 2 + e^-2\\nI) 2 + e^-4\\nJ) 2", "Answer: I" - ).with_channel("final"), - - # Example 5: Basic Arithmetic - Division - Message.from_role_and_content( - Role.USER, - "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\\nA) Multiply 5 by 5 to find 25 teams.\\nB) Divide 30 by 5 to find 6 teams.\\nC) Add 5 to 30 to find 35 teams.\\nD) Subtract 30 from 5 to find -25 teams.\\nE) Divide 5 by 30 to find 0.1667 teams.\\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\\nG) N/A\\nH) N/A\\nI) N/A\\nJ) N/A" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 5: Basic Arithmetic - Division + ( + "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\\nA) Multiply 5 by 5 to find 25 teams.\\nB) Divide 30 by 5 to find 6 teams.\\nC) Add 5 to 30 to find 35 teams.\\nD) Subtract 30 from 5 to find -25 teams.\\nE) Divide 5 by 30 to find 0.1667 teams.\\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\\nG) N/A\\nH) N/A\\nI) N/A\\nJ) N/A", "Answer: B" - ).with_channel("final"), - - # The actual problem to solve - Message.from_role_and_content( - Role.USER, - user_query, - ) + ), ] - - # Create the conversation - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens + + _add_multishot_examples(messages, examples) + + return _finalize_conversation(messages, user_query) def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): @@ -321,67 +243,31 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): Returns: tuple: (conversation_object, token_list) ready for model completion """ - - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - - # Create the system message with required channels - system_message = ( - SystemContent.new() - .with_reasoning_effort(reasoning_effort) - .with_conversation_start_date("2025-09-30") - .with_required_channels(["analysis", "commentary", "final"]) - ) - - # Create the developer message with instructions - developer_message = ( - DeveloperContent.new() - .with_instructions( - "You are an expert in organic chemistry and biochemistry who answers scientific questions accurately. 
" - "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." - ) + instructions = ( + "You are an expert in organic chemistry and biochemistry who answers scientific questions accurately. " + f"{MOD_PROMPT} " + "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) - - # Create the conversation with multi-shot examples - messages = [ - # System and developer setup - Message.from_role_and_content(Role.SYSTEM, system_message), - Message.from_role_and_content(Role.DEVELOPER, developer_message), - + + messages = _create_base_messages(reasoning_effort, instructions) + + # Define multi-shot examples + examples = [ # Example 1: Molecular Biology - Gene Therapy - Message.from_role_and_content( - Role.USER, - "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\\nA) polyA tail\\nB) lariat\\nC) antisense\\nD) R-loops" - ), - Message.from_role_and_content( - Role.ASSISTANT, + ( + "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\\nA) polyA tail\\nB) lariat\\nC) antisense\\nD) R-loops", "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. 
R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\\n\\nAnswer: D" - ).with_channel("final"), - - # Example 2: Stereochemistry - Optical Activity - Message.from_role_and_content( - Role.USER, - "How many of the following compounds exhibit optical activity?\\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\\n2,3,3,3-tetrafluoroprop-1-ene\\ndi(cyclohex-2-en-1-ylidene)methane\\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\\n[1,1'-biphenyl]-3,3'-diol\\n8,8-dichlorobicyclo[4.2.0]octan-7-one\\ncyclopent-2-en-1-one\\nA) 6\\nB) 5\\nC) 4\\nD) 3" ), - Message.from_role_and_content( - Role.ASSISTANT, + # Example 2: Stereochemistry - Optical Activity + ( + "How many of the following compounds exhibit optical activity?\\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\\n2,3,3,3-tetrafluoroprop-1-ene\\ndi(cyclohex-2-en-1-ylidene)methane\\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\\n[1,1'-biphenyl]-3,3'-diol\\n8,8-dichlorobicyclo[4.2.0]octan-7-one\\ncyclopent-2-en-1-one\\nA) 6\\nB) 5\\nC) 4\\nD) 3", "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\\n\\nAnswer: C" - ).with_channel("final"), - - # The actual problem to solve - Message.from_role_and_content( - Role.USER, - user_query, - ) + ), ] - - # Create the conversation - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens + + _add_multishot_examples(messages, examples) + + return _finalize_conversation(messages, user_query) def process_row(args): From 387779c73a896a852492a540385f136f1a9a6309 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 13 Oct 2025 20:56:32 +0000 Subject: [PATCH 045/165] updates --- language/gpt-oss/requirements.txt | 4 +++- language/gpt-oss/run_infer.py | 40 +++++++++++++++++++++++-------- language/gpt-oss/run_server.sh | 9 +++++-- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index a75b9ee0e2..b668b76b14 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -7,4 +7,6 @@ audioread>=2.1.9 msgpack>=1.0 numba>=0.51.0 pooch>=1.0 -soxr>=0.3.2 \ No newline at end of file +soxr>=0.3.2 +lazy-loader>=0.1 +seaborn diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index d0bf06bd33..a9c61d8c18 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -137,16 +137,25 @@ def send_single_request(args_tuple): timeout=timeout) try: + # Track latency: time from request sent to response received + start_time = time.time() response = client.send_request(input_ids, max_tokens=max_tokens) - return sample_id, response + end_time = time.time() + latency = end_time - start_time + return sample_id, response, latency except Exception as e: logger.error(f"Request {sample_id} failed: {e}") - return sample_id, {"error": str(e)} + # Return None for latency on error + return sample_id, {"error": str(e)}, None def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, - max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, 
timeout: int = 1200) -> List[Dict[str, Any]]: - """Send all requests to SGLang server in parallel using multiprocessing.""" + max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200): + """Send all requests to SGLang server in parallel using multiprocessing. + + Returns: + tuple: (responses, latencies) - List of responses and list of latencies in seconds + """ num_samples = len(tokenized_df) logger.info( f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") @@ -162,7 +171,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, with Pool(processes=min(max_concurrency, num_samples)) as pool: results = list(tqdm( - pool.imap(send_single_request, args_list), + pool.imap_unordered(send_single_request, args_list), total=len(args_list), desc="Sending requests", unit="request" @@ -171,13 +180,22 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, # Sort results by sample_id to maintain order results.sort(key=lambda x: x[0]) responses = [result[1] for result in results] + latencies = [result[2] for result in results] total_time = time.time() - start_time logger.info( f"Completed {num_samples} requests in {total_time:.2f} seconds") logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") + + # Log latency statistics + valid_latencies = [lat for lat in latencies if lat is not None] + if valid_latencies: + avg_latency = sum(valid_latencies) / len(valid_latencies) + min_latency = min(valid_latencies) + max_latency = max(valid_latencies) + logger.info(f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") - return responses + return responses, latencies def extract_response_ids( @@ -228,8 +246,8 @@ def detokenize_output_ids(response_ids: List[List[int]]) -> List[str]: def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int]], - detokenized_texts: List[str], tokenized_df: pd.DataFrame, - output_file: str = None) -> pd.DataFrame: + detokenized_texts: List[str], latencies: List[float], + tokenized_df: pd.DataFrame, output_file: str = None) -> pd.DataFrame: """Save all responses to DataFrame and optionally to pickle file.""" logger.info("Processing responses and updating DataFrame...") @@ -242,6 +260,7 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] result_df['tok_model_output'] = response_ids result_df['tok_model_output_len'] = [ len(token_ids) for token_ids in response_ids] # Length of output_ids + result_df['infer_time'] = latencies # E2E latency in seconds # Calculate output token lengths for logging output_token_lengths = [] @@ -258,7 +277,7 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] logger.info(f"Updated DataFrame with shape: {result_df.shape}") logger.info( - f"Updated columns: model_output, tok_model_output, tok_model_output_len") + f"Updated columns: model_output, tok_model_output, tok_model_output_len, infer_time") logger.info( f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") @@ -283,7 +302,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, logger.info(f"Limited to first {max_samples} samples") # Step 2: Send all requests in parallel - responses = send_requests_parallel( + responses, latencies = send_requests_parallel( tokenized_df, server_url, max_tokens, @@ -303,6 +322,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, 
responses, response_ids, detokenized_texts, + latencies, tokenized_df, output_file) diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/run_server.sh index 09cc22d17f..dc829864cb 100755 --- a/language/gpt-oss/run_server.sh +++ b/language/gpt-oss/run_server.sh @@ -1,6 +1,7 @@ #!/bin/bash -dp=2 +dp=1 +model_path=openai/gpt-oss-120b while [[ $# -gt 0 ]]; do case $1 in @@ -8,12 +9,16 @@ while [[ $# -gt 0 ]]; do dp=$2 shift 2 ;; + --model-path) + model_path=$2 + shift 2 + ;; esac done set -x; python3 -m sglang.launch_server \ - --model-path openai/gpt-oss-120b \ + --model-path $model_path \ --host 0.0.0.0 \ --port 30000 \ --tp-size=1 \ From bde219f0d2118cfccd9d6b07acefb3c42816231d Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 13 Oct 2025 13:59:02 -0700 Subject: [PATCH 046/165] rename opt --- language/gpt-oss/run_server.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/run_server.sh index dc829864cb..563c9e3624 100755 --- a/language/gpt-oss/run_server.sh +++ b/language/gpt-oss/run_server.sh @@ -9,7 +9,7 @@ while [[ $# -gt 0 ]]; do dp=$2 shift 2 ;; - --model-path) + --model_path) model_path=$2 shift 2 ;; From 24ef1e0aba5319b9385929b96a088e42c641a3f5 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 14 Oct 2025 19:59:44 +0000 Subject: [PATCH 047/165] updates --- language/gpt-oss/harmonize_inputs.py | 33 ++++++++++++++-------------- language/gpt-oss/run_server.sh | 6 ++++- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 2695b68287..79c97f2370 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -120,16 +120,16 @@ def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): # Example 1: Square areas and side lengths ( "The areas of three squares are 16, 49 and 169. What is the average (mean) of their side lengths?", - "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\\n\\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" + "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\n\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" ), # Example 2: Floor function equation ( "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$", - "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\\n\\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\\n\\[2n = n + 3,\\]which means $n = 3.$\\n\\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\\n\\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" + "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. 
Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\n\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\n\\[2n = n + 3,\\]which means $n = 3.$\n\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\n\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" ), # Example 3: Sequences and differences ( - "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\\n\\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\\n\\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$", + "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\n\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\n\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$", "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$" ), # Example 4: Probability and Deal or No Deal @@ -140,7 +140,7 @@ def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): # Example 5: Domain of composite function ( "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$", - "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\\n\\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" + "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\n\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" ), ] @@ -206,27 +206,27 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): examples = [ # Example 1: Abstract Algebra - Ring Theory ( - "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\\nFind the characteristic of the ring 2Z.\\nA) 0\\nB) 30\\nC) 3\\nD) 10\\nE) 12\\nF) 50\\nG) 2\\nH) 100\\nI) 20\\nJ) 5", + "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\nFind the characteristic of the ring 2Z.\nA) 0\nB) 30\nC) 3\nD) 10\nE) 12\nF) 50\nG) 2\nH) 100\nI) 20\nJ) 5", "Answer: A" ), # Example 2: Linear Algebra - 
Transformations ( - "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\\nA) ST + TS is the identity map of V onto itself.\\nB) TS = 0\\nC) ST = 1\\nD) ST - TS = 0\\nE) ST = T\\nF) ST = 0\\nG) ST = TS\\nH) ST - TS is the identity map of V onto itself.\\nI) TS = T\\nJ) ST = S", + "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\nA) ST + TS is the identity map of V onto itself.\nB) TS = 0\nC) ST = 1\nD) ST - TS = 0\nE) ST = T\nF) ST = 0\nG) ST = TS\nH) ST - TS is the identity map of V onto itself.\nI) TS = T\nJ) ST = S", "Answer: H" ), # Example 3: Number Theory - Diophantine Equations ( - "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\\nA) -5\\nB) 0\\nC) -3\\nD) -7\\nE) -4\\nF) -6\\nG) -1\\nH) -2\\nI) -9\\nJ) N/A", + "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\nA) -5\nB) 0\nC) -3\nD) -7\nE) -4\nF) -6\nG) -1\nH) -2\nI) -9\nJ) N/A", "Answer: E" ), # Example 4: Differential Equations - Salt Tank Problem ( - "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\\nA) 3 + e^-2\\nB) 2 - e^-4\\nC) 2 - e^-2\\nD) 3 + e^-4\\nE) 2 + e^-3\\nF) 2 - e^-3\\nG) 3 - e^-2\\nH) 2 + e^-2\\nI) 2 + e^-4\\nJ) 2", + "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\nA) 3 + e^-2\nB) 2 - e^-4\nC) 2 - e^-2\nD) 3 + e^-4\nE) 2 + e^-3\nF) 2 - e^-3\nG) 3 - e^-2\nH) 2 + e^-2\nI) 2 + e^-4\nJ) 2", "Answer: I" ), # Example 5: Basic Arithmetic - Division ( - "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\\nA) Multiply 5 by 5 to find 25 teams.\\nB) Divide 30 by 5 to find 6 teams.\\nC) Add 5 to 30 to find 35 teams.\\nD) Subtract 30 from 5 to find -25 teams.\\nE) Divide 5 by 30 to find 0.1667 teams.\\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\\nG) N/A\\nH) N/A\\nI) N/A\\nJ) N/A", + "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. 
Which statement correctly explains how to find the number of teams needed?\nA) Multiply 5 by 5 to find 25 teams.\nB) Divide 30 by 5 to find 6 teams.\nC) Add 5 to 30 to find 35 teams.\nD) Subtract 30 from 5 to find -25 teams.\nE) Divide 5 by 30 to find 0.1667 teams.\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\nG) N/A\nH) N/A\nI) N/A\nJ) N/A", "Answer: B" ), ] @@ -238,13 +238,13 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): """ - Creates a multi-shot prompt for organic chemistry and biochemistry question answering using Harmony format. + Creates a multi-shot prompt for Biology, Physics and Chemistry question answering using Harmony format. Returns: tuple: (conversation_object, token_list) ready for model completion """ instructions = ( - "You are an expert in organic chemistry and biochemistry who answers scientific questions accurately. " + "You are an expert in Biology, Physics and Chemistry who answers scientific questions accurately. " f"{MOD_PROMPT} " "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) @@ -255,13 +255,13 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): examples = [ # Example 1: Molecular Biology - Gene Therapy ( - "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\\nA) polyA tail\\nB) lariat\\nC) antisense\\nD) R-loops", - "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\\n\\nAnswer: D" + "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. 
Which structure below is not involved in the proposed therapy?\nA) polyA tail\nB) lariat\nC) antisense\nD) R-loops", + "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\n\nAnswer: D" ), # Example 2: Stereochemistry - Optical Activity ( - "How many of the following compounds exhibit optical activity?\\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\\n2,3,3,3-tetrafluoroprop-1-ene\\ndi(cyclohex-2-en-1-ylidene)methane\\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\\n[1,1'-biphenyl]-3,3'-diol\\n8,8-dichlorobicyclo[4.2.0]octan-7-one\\ncyclopent-2-en-1-one\\nA) 6\\nB) 5\\nC) 4\\nD) 3", - "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\\n\\nAnswer: C" + "How many of the following compounds exhibit optical activity?\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\n2,3,3,3-tetrafluoroprop-1-ene\ndi(cyclohex-2-en-1-ylidene)methane\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\n[1,1'-biphenyl]-3,3'-diol\n8,8-dichlorobicyclo[4.2.0]octan-7-one\ncyclopent-2-en-1-one\nA) 6\nB) 5\nC) 4\nD) 3", + "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\n\nAnswer: C" ), ] @@ -347,7 +347,8 @@ def process_row(args): 'gpqa': create_gpqa_prompt, 'livecodebench': create_livecodebench_prompt, 'math500': create_math500_prompt, - 'mmlu_pro': create_mmlu_prompt + 'mmlu_pro': create_mmlu_prompt, + 'mmlu': create_mmlu_prompt, } # Prepare data for parallel processing diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/run_server.sh index 563c9e3624..71746af6ff 100755 --- a/language/gpt-oss/run_server.sh +++ b/language/gpt-oss/run_server.sh @@ -13,6 +13,10 @@ while [[ $# -gt 0 ]]; do model_path=$2 shift 2 ;; + *) + echo "Unknown argument: $1" + exit 1 + ;; esac done @@ -28,4 +32,4 @@ python3 -m sglang.launch_server \ --chunked-prefill-size 16384 \ --ep-size=1 \ --quantization mxfp4 \ - --stream-interval 50 + --stream-interval 500 \ No newline at end of file From 141dd8d363fcf3de1a741cfaa9cd57e067a342f1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Oct 2025 20:00:50 +0000 Subject: [PATCH 048/165] [Automated Commit] Format Codebase --- language/gpt-oss/harmonize_inputs.py | 55 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 79c97f2370..540c2a5c6c 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -29,11 +29,11 @@ def _create_base_messages(reasoning_effort, instructions): """ Creates system and developer messages for a conversation. 
- + Args: reasoning_effort: ReasoningEffort enum value instructions: String containing developer instructions - + Returns: list: List containing system and developer messages """ @@ -57,18 +57,19 @@ def _create_base_messages(reasoning_effort, instructions): def _add_multishot_examples(messages, examples): """ Adds multi-shot examples to a message list. - + Args: messages: List of messages to append examples to examples: List of tuples (user_content, assistant_content) representing examples - + Returns: list: Updated messages list with examples added """ for user_content, assistant_content in examples: messages.append(Message.from_role_and_content(Role.USER, user_content)) messages.append( - Message.from_role_and_content(Role.ASSISTANT, assistant_content).with_channel("final") + Message.from_role_and_content( + Role.ASSISTANT, assistant_content).with_channel("final") ) return messages @@ -76,26 +77,26 @@ def _add_multishot_examples(messages, examples): def _finalize_conversation(messages, user_query): """ Adds the user query, creates the conversation, and renders tokens. - + Args: messages: List of messages (system, developer, and optionally examples) user_query: The actual user query to solve - + Returns: tuple: (conversation_object, token_list) ready for model completion """ # Load the Harmony encoding for gpt-oss models enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - + # Add the actual problem to solve messages.append(Message.from_role_and_content(Role.USER, user_query)) - + # Create the conversation convo = Conversation.from_messages(messages) - + # Render the conversation for completion (ready to send to the model) tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - + return convo, tokens @@ -112,9 +113,9 @@ def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): f"{MOD_PROMPT} " "Follow the format shown in the examples below. " ) - + messages = _create_base_messages(reasoning_effort, instructions) - + # Define multi-shot examples examples = [ # Example 1: Square areas and side lengths @@ -143,9 +144,9 @@ def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\n\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" ), ] - + _add_multishot_examples(messages, examples) - + return _finalize_conversation(messages, user_query) @@ -161,9 +162,9 @@ def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): f"{MOD_PROMPT} " "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}." ) - + messages = _create_base_messages(reasoning_effort, instructions) - + return _finalize_conversation(messages, user_query) @@ -181,9 +182,9 @@ def create_livecodebench_prompt( f"{MOD_PROMPT} " "The code should be enclosed within ```python delimiters." ) - + messages = _create_base_messages(reasoning_effort, instructions) - + return _finalize_conversation(messages, user_query) @@ -199,9 +200,9 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): f"{MOD_PROMPT} " "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." 
) - + messages = _create_base_messages(reasoning_effort, instructions) - + # Define multi-shot examples examples = [ # Example 1: Abstract Algebra - Ring Theory @@ -230,9 +231,9 @@ def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): "Answer: B" ), ] - + _add_multishot_examples(messages, examples) - + return _finalize_conversation(messages, user_query) @@ -248,9 +249,9 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): f"{MOD_PROMPT} " "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." ) - + messages = _create_base_messages(reasoning_effort, instructions) - + # Define multi-shot examples examples = [ # Example 1: Molecular Biology - Gene Therapy @@ -264,9 +265,9 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\n\nAnswer: C" ), ] - + _add_multishot_examples(messages, examples) - + return _finalize_conversation(messages, user_query) From 684e27a37e4cf5e65ac26b5427c4a134d24f04ea Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 15 Oct 2025 00:47:25 +0000 Subject: [PATCH 049/165] add healthbench prompt creation --- language/gpt-oss/eval_accuracy.py | 692 +++++++++++++++++++++++++++ language/gpt-oss/harmonize_inputs.py | 45 +- 2 files changed, 733 insertions(+), 4 deletions(-) create mode 100644 language/gpt-oss/eval_accuracy.py diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py new file mode 100644 index 0000000000..685af6f7d2 --- /dev/null +++ b/language/gpt-oss/eval_accuracy.py @@ -0,0 +1,692 @@ +#!/usr/bin/env python3 +""" +Standalone evaluation script for mlperf-inference deepseek-r1 dataset. 
+ +Expected input format (pickle file with DataFrame): +- model_output: The model's response text +- tok_model_output_len: The length of the model's response tokens +- ground_truth: The expected answer +- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 'math500', 'livecodebench', 'aime') +- question: The question text + +Output adds two columns: +- extracted_answer: Parsed answer from model output +- prompt_accuracy: 100.0 if correct, 0.0 if incorrect +""" + +import sys +import os +import argparse +import logging +import pickle +import json +import re +import shutil +import time +from functools import lru_cache +from typing import Dict, Any, Optional, Tuple, Union +import pandas as pd +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed +import multiprocessing +from pathlib import Path + +# MLPerf log processing imports +import numpy as np +from transformers import AutoTokenizer + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# ============================================================================= +# Input Validation +# ============================================================================= + +def validate_dataframe(df: pd.DataFrame) -> None: + """Validate input DataFrame has required columns.""" + if not isinstance(df, pd.DataFrame): + raise ValueError("Input must be a pandas DataFrame") + + required_cols = [ + 'model_output', + 'dataset', + 'ground_truth', + 'tok_model_output_len'] + missing_cols = [col for col in required_cols if col not in df.columns] + if missing_cols: + raise ValueError(f"Missing required columns: {missing_cols}") + + +def validate_text_input(text: Any) -> str: + """Validate and convert text input to string.""" + if pd.isna(text) or text is None: + return "" + return str(text).strip() + + +def validate_dataset_name(dataset: Any) -> str: + """Validate dataset name.""" + if pd.isna(dataset) or not dataset: + raise ValueError("Dataset name cannot be empty") + return str(dataset).lower() + + +# ============================================================================= +# Answer Parsing Functions +# ============================================================================= + +def parse_multiple_choice(text: str, max_option: str = 'D') -> Optional[str]: + """Parse multiple choice answer (A-D or A-J).""" + text = validate_text_input(text) + if not text: + return None + + # Clean artifacts + if text.startswith(("['", '["')) and text.endswith(("']", '"]')): + text = text[2:-2].strip() + + text = text.replace(r'\n', '\n').replace(r'\'', "'") + + # Find ANSWER/FINAL ANSWER pattern + pattern = rf"\b(?:ANSWER|FINAL\s*ANSWER)\b\s*[:=]?\s*(?:\(?\s*([A-{max_option}])\s*\)?)(?:\s*$|[^A-Za-z])" + matches = list(re.finditer(pattern, text, re.IGNORECASE)) + + if matches: + return matches[-1].group(1).upper() + + # MMLU-Pro fallback: standalone letter + if max_option == 'J': + fallback_matches = list(re.finditer( + r"\b([A-J])\b", text, re.IGNORECASE)) + if fallback_matches: + return fallback_matches[-1].group(1).upper() + + return None + + +def parse_boxed_math(text: str) -> Optional[str]: + """Parse \\boxed{answer} format.""" + text = validate_text_input(text) + if not text: + return None + + idx = text.rfind(r"\boxed{") + if idx == -1: + return None + + # Find matching brace + depth, i = 0, idx + 7 + content_start = i + while i < len(text): + if text[i] == '{': + depth += 1 + elif text[i] == '}': + if depth == 0: + return 
text[content_start:i].strip() + depth -= 1 + i += 1 + return None + + +def parse_aime_answer(text: str) -> Optional[int]: + """Parse AIME integer answer (0-999).""" + text = validate_text_input(text) + if not text: + return None + + # Priority 1: \boxed{digits} + boxed_matches = list(re.finditer(r"\\boxed{\s*(\d+)\s*}", text)) + if boxed_matches: + extracted_str = boxed_matches[-1].group(1) + else: + # Priority 2: Answer: + answer_matches = list(re.finditer( + r"Answer:\s*(\d+)(?!\.)\b", text, re.IGNORECASE | re.MULTILINE)) + if not answer_matches: + return None + extracted_str = answer_matches[-1].group(1) + + try: + val = int(extracted_str) + if 0 <= val <= 999: + return val + except ValueError: + pass + + return None + + +def parse_code(text: str) -> Optional[str]: + """Parse code from ```python code block.""" + text = validate_text_input(text) + if not text: + return None + + match = re.search(r"```python(.*?)```", text, re.DOTALL) + return match.group(1).strip() if match else None + + +# ============================================================================= +# Answer Evaluation Functions +# ============================================================================= + +def evaluate_multiple_choice( + parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: + """Evaluate multiple choice answer.""" + if not parsed or not ground_truth: + return False + + parsed = parsed.upper() + ground_truth = ground_truth.upper() + + return parsed in valid_options and parsed == ground_truth + + +def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: + """Evaluate MATH-500 using PRM800K grader.""" + if not parsed or not ground_truth: + return False + + parsed = str(parsed).strip() + ground_truth = str(ground_truth) + + if not parsed: + return False + + # Use sys.path approach for proper module importing + workspace_path = os.path.dirname(os.path.abspath(__file__)) + prm800k_module_path = os.path.join( + workspace_path, "submodules", "prm800k", "prm800k") + + if not os.path.exists(prm800k_module_path): + raise FileNotFoundError( + f"PRM800K module not found at: {prm800k_module_path}") + + # Save current directory and sys.path + original_cwd = os.getcwd() + original_syspath = sys.path.copy() + + try: + # Add prm800k module path to sys.path + if prm800k_module_path not in sys.path: + sys.path.insert(0, prm800k_module_path) + + # Change directory as some imports might use relative paths + os.chdir(prm800k_module_path) + + # Now import should work + from grading.grader import grade_answer + result = grade_answer(given_answer=parsed, ground_truth=ground_truth) + except ImportError as e: + raise ImportError(f"Failed to import PRM800K grader: {e}") + finally: + # Always restore original directory and sys.path + os.chdir(original_cwd) + sys.path[:] = original_syspath + + return result + + +def evaluate_aime(parsed: Optional[int], ground_truth: Any) -> bool: + """Evaluate AIME integer answer.""" + if parsed is None: + return False + + try: + gt_int = int(ground_truth) + return int(parsed) == gt_int + except (ValueError, TypeError): + return False + + +@lru_cache(maxsize=1) +def load_lcb_benchmark() -> Dict[str, Any]: + """Load LiveCodeBench benchmark with caching.""" + lcb_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), "submodules", "LiveCodeBench")) + + if not os.path.isdir(lcb_dir): + raise FileNotFoundError( + f"LiveCodeBench submodule required at: {lcb_dir}") + + original_cwd = os.getcwd() + os.chdir(lcb_dir) + + if lcb_dir not in sys.path: + 
sys.path.insert(0, lcb_dir) + + try: + os.environ['TQDM_DISABLE'] = '1' + + from lcb_runner.utils.scenarios import Scenario + from lcb_runner.runner.scenario_router import build_prompt_benchmark + + mock_args = argparse.Namespace( + scenario=Scenario.codegeneration, release_version="release_v1", + subset="code_generation", language="python", not_fast=False, + start_date=None, end_date=None, k=[1], num_samples=1, + timeout=60, num_workers=1, num_process_evaluate=1, + model_name="standalone_eval", output_dir="/tmp", + prompt_type="custom", continue_existing=False, evaluate=True + ) + + full_benchmark, _ = build_prompt_benchmark(mock_args) + return {inst.question_id: inst for inst in full_benchmark} + + finally: + os.chdir(original_cwd) + os.environ.pop('TQDM_DISABLE', None) + + +def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: + """Evaluate LiveCodeBench code generation.""" + if not code or not question_id: + return False + + lcb_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), "submodules", "LiveCodeBench")) + benchmark_map = load_lcb_benchmark() + + instance = benchmark_map.get(question_id) + if not instance: + return False + + original_cwd = os.getcwd() + temp_dir = f"/tmp/temp_lcb_eval_{question_id}_{int(time.time())}" + os.makedirs(temp_dir, exist_ok=True) + + try: + os.chdir(lcb_dir) + os.environ['TQDM_DISABLE'] = '1' + + from lcb_runner.utils.scenarios import Scenario + from lcb_runner.evaluation import extract_instance_results + from lcb_runner.runner.scenario_router import sort_and_extract_save_results, get_metrics + + mock_args = argparse.Namespace( + scenario=Scenario.codegeneration, release_version="release_v1", + subset="code_generation", language="python", not_fast=False, + start_date=None, end_date=None, k=[1], num_samples=1, + timeout=60, num_workers=1, num_process_evaluate=1, + model_name="inline_handler_eval", output_dir=temp_dir, + prompt_type="custom", continue_existing=False, evaluate=True, + ) + + batch_benchmark = [instance] + batch_custom_outputs = [[code]] + + save_results = [inst.insert_output(output, output) + for inst, output in zip(batch_benchmark, batch_custom_outputs)] + + _, combined_results = sort_and_extract_save_results( + mock_args.scenario, save_results) + _, instance_results, _ = get_metrics( + mock_args.scenario, mock_args, batch_benchmark, combined_results + ) + + graded = extract_instance_results(instance_results) + return graded and graded[0] and graded[0][0] + + finally: + os.chdir(original_cwd) + shutil.rmtree(temp_dir, ignore_errors=True) + os.environ.pop('TQDM_DISABLE', None) + + +def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool]: + """Worker function for parallel LiveCodeBench evaluation.""" + code, question_id = args + + try: + return question_id, evaluate_livecodebench(code, question_id) + except Exception: + return question_id, False + + +# ============================================================================= +# Dataset Configuration +# ============================================================================= + +DATASET_EVALUATORS = { + 'gpqa': { + 'parse': lambda text: parse_multiple_choice(text, 'D'), + 'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCD') + }, + 'mmlu_pro': { + 'parse': lambda text: parse_multiple_choice(text, 'J'), + 'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ') + }, + 'math500': { + 'parse': parse_boxed_math, + 'evaluate': evaluate_math500 + }, + 'aime': { + 'parse': 
parse_aime_answer, + 'evaluate': evaluate_aime + }, + 'livecodebench': { + 'parse': parse_code, + 'evaluate': evaluate_livecodebench + }, + 'mmlu': { + 'parse': lambda text: parse_multiple_choice(text, 'J'), + 'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ') + }, + +} + + +def get_evaluator(dataset_name: str) -> Dict[str, Any]: + """Get evaluator functions for dataset.""" + dataset_lower = validate_dataset_name(dataset_name) + + for key, evaluator in DATASET_EVALUATORS.items(): + if key in dataset_lower: + return evaluator + + raise ValueError(f"No evaluator found for dataset: {dataset_name}") + + +# ============================================================================= +# Main Processing Functions +# ============================================================================= + +def process_row(row: pd.Series) -> Dict[str, Any]: + """Process a single row and return extracted answer and accuracy.""" + dataset_name = validate_dataset_name(row['dataset']) + raw_output = validate_text_input(row['model_output']) + ground_truth = row['ground_truth'] + + evaluator = get_evaluator(dataset_name) + extracted = evaluator['parse'](raw_output) + + is_correct = False + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate'](extracted, ground_truth) + + return { + 'extracted_answer': extracted, + 'prompt_accuracy': 100.0 if is_correct else 0.0 + } + + +def process_livecodebench_parallel( + df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: + """Process LiveCodeBench items in parallel.""" + # Prepare work items + work_items = [] + for idx in group_indices: + row = df.loc[idx] + extracted = row.get('extracted_answer') + ground_truth = row.get('ground_truth') + + if extracted is not None and not pd.isna(ground_truth): + work_items.append((idx, extracted, ground_truth)) + + if not work_items: + return 0, 0 + + # Process in parallel + max_workers = min(multiprocessing.cpu_count(), len(work_items)) + logger.info( + f"Evaluating {len(work_items)} LiveCodeBench items with {max_workers} workers") + + correct_count = 0 + total_evaluated = 0 + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + future_to_idx = { + executor.submit(evaluate_livecodebench_worker, (code, question_id)): idx + for idx, code, question_id in work_items + } + + for future in tqdm(as_completed(future_to_idx, timeout=1200), + total=len(future_to_idx), desc="Evaluating LiveCodeBench"): + idx = future_to_idx[future] + + try: + question_id, is_correct = future.result(timeout=30) + df.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + total_evaluated += 1 + if is_correct: + correct_count += 1 + except Exception as e: + logger.error(f"Error evaluating row {idx}: {e}") + df.at[idx, 'prompt_accuracy'] = 0.0 + total_evaluated += 1 + + return correct_count, total_evaluated + + +def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """Process entire dataframe with optimized batch processing.""" + validate_dataframe(df) + + df_output = df.copy() + df_output['extracted_answer'] = None + df_output['prompt_accuracy'] = 0.0 + + # Process by dataset + for dataset_name, group_indices in tqdm(df_output.groupby('dataset').groups.items(), + desc="Processing datasets"): + evaluator = get_evaluator(dataset_name) + + # Parse answers for all rows in this dataset + logger.info( + f"Processing {len(group_indices)} rows for dataset '{dataset_name}'") + for idx in group_indices: + row = df_output.loc[idx] + raw_output = 
validate_text_input(row['model_output']) + df_output.at[idx, 'extracted_answer'] = evaluator['parse']( + raw_output) + + # Evaluate answers + if 'livecodebench' in dataset_name.lower(): + correct_count, total_evaluated = process_livecodebench_parallel( + df_output, group_indices) + else: + # Sequential evaluation for other datasets + correct_count = 0 + total_evaluated = 0 + + for idx in group_indices: + row = df_output.loc[idx] + extracted = row['extracted_answer'] + ground_truth = row['ground_truth'] + + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate'](extracted, ground_truth) + df_output.at[idx, + 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + total_evaluated += 1 + if is_correct: + correct_count += 1 + + # Log results + if total_evaluated > 0: + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + + return df_output + + +# ============================================================================= +# Unified Evaluation Utilities +# ============================================================================= + +def print_evaluation_results(df_evaluated: pd.DataFrame, + logger: Optional[logging.Logger] = None) -> Dict[str, Any]: + """Print evaluation results in a unified format. + + Args: + df_evaluated: DataFrame with evaluated results + logger: Optional logger instance (uses module logger if not provided) + + Returns: + Dictionary with evaluation statistics + """ + if logger is None: + logger = logging.getLogger(__name__) + + # Calculate statistics + evaluated = df_evaluated['extracted_answer'].notna().sum() + correct = (df_evaluated['prompt_accuracy'] > 0).sum() + accuracy = df_evaluated['prompt_accuracy'].mean() + + # tok_model_output_len is now a required column + mean_output_len = float(df_evaluated['tok_model_output_len'].mean()) + + results = { + # 'evaluated': int(evaluated), + # 'correct': int(correct), + 'exact_match': float(accuracy), + 'tokens_per_sample': mean_output_len, + 'num-samples': len(df_evaluated), + } + + print("\nResults\n") + print(results) + + +def process_and_save_dataframe(df: pd.DataFrame, + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + """Process dataframe for evaluation and save the results. 
+ + Args: + df: Input DataFrame to evaluate + output_dir: Directory to save the evaluated pickle file (defaults to same dir as source) + base_filename: Base filename for output (defaults to auto-generated) + + Returns: + Tuple of (evaluated_dataframe, saved_file_path) + """ + # Process the dataframe + df_evaluated = process_dataframe(df) + + # Determine output path + if output_dir is None: + # Try to infer from existing path info in the dataframe or use current + # directory + output_dir = Path.cwd() + else: + output_dir = Path(output_dir) + + output_dir.mkdir(parents=True, exist_ok=True) + + # Generate filename if not provided + if base_filename is None: + timestamp = time.strftime("%Y%m%d_%H%M%S") + base_filename = f"results_evaluated_{timestamp}.pkl" + elif not base_filename.endswith('_evaluated.pkl'): + # Ensure it ends with _evaluated.pkl + if base_filename.endswith('.pkl'): + base_filename = base_filename[:-4] + '_evaluated.pkl' + else: + base_filename = base_filename + '_evaluated.pkl' + + output_path = output_dir / base_filename + + # Save the evaluated dataframe + with open(output_path, 'wb') as f: + pickle.dump(df_evaluated, f) + + logger.info(f"Evaluated results saved to: {output_path}") + + return df_evaluated, str(output_path) + + +# ============================================================================= +# Main Function +# ============================================================================= + +def detect_file_type(file_path: Union[str, Path]) -> str: + """Detect whether file is MLPerf JSON or pickle format. + + Returns: + "mlperf_json" or "pickle" + """ + file_path = Path(file_path) + + # Check by extension first + if file_path.suffix.lower() == '.json': + return "mlperf_json" + elif file_path.suffix.lower() in ['.pkl', '.pickle']: + return "pickle" + + # Try to detect by content + try: + # Try reading as JSON first + with open(file_path, 'r') as f: + first_char = f.read(1) + if first_char in ['[', '{']: + # Likely JSON + return "mlperf_json" + except BaseException: + pass + + # Default to pickle + return "pickle" + + +def main(): + parser = argparse.ArgumentParser( + description="Evaluate model outputs - supports both pickle DataFrames and MLPerf JSON logs") + parser.add_argument("--input-file", required=True, + help="Input file (pickle DataFrame or MLPerf JSON log)") + parser.add_argument( + "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") + parser.add_argument("--verbose", action="store_true", + help="Verbose logging") + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + if not os.path.exists(args.input_file): + raise FileNotFoundError(f"Input file not found: {args.input_file}") + + input_path = Path(args.input_file) + + # Detect file type + file_type = detect_file_type(input_path) + logger.info(f"Detected input file type: {file_type}") + + # Determine output file path + if args.output_file: + output_path = Path(args.output_file) + output_dir = output_path.parent + output_filename = output_path.name + else: + output_dir = input_path.parent + output_filename = input_path.stem + "_evaluated.pkl" + + logger.info(f"Processing: {args.input_file}") + + # Handle pickle DataFrame format + logger.info("Processing pickle DataFrame file") + + # Load and process data + with open(args.input_file, 'rb') as f: + df = pickle.load(f) + + logger.info(f"Loaded {len(df)} rows") + + # Process and save with unified function + df_evaluated, saved_file_path = process_and_save_dataframe( + df, + 
output_dir=output_dir, + base_filename=output_filename + ) + + # Print evaluation results with unified function + print_evaluation_results(df_evaluated, logger) + + logger.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 540c2a5c6c..8175184c8d 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -74,7 +74,7 @@ def _add_multishot_examples(messages, examples): return messages -def _finalize_conversation(messages, user_query): +def _finalize_conversation(messages, user_query=None): """ Adds the user query, creates the conversation, and renders tokens. @@ -89,8 +89,8 @@ def _finalize_conversation(messages, user_query): enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) # Add the actual problem to solve - messages.append(Message.from_role_and_content(Role.USER, user_query)) - + if user_query is not None: + messages.append(Message.from_role_and_content(Role.USER, user_query)) # Create the conversation convo = Conversation.from_messages(messages) @@ -270,6 +270,38 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): return _finalize_conversation(messages, user_query) +def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): + """ + Creates a zero-shot prompt for healthbench problem solving using Harmony format. + prompt: list of messages (message: Dict[str, str] (content, role)) giving conversation history + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + instructions = ( + "Your task is to answer the following health-related query as a responsible assistant." + "Provide responses that are factually accurate, up-to-date, and consistent with current clinical guidelines. Adjust the detail, terminology, and tone based on the user’s apparent background (layperson or healthcare professional)." + "If the evidence is limited or uncertain, clearly acknowledge this in your answer. When essential information is missing for a safe, thorough response, ask clarifying questions before proceeding." + "Do not provide specific medical diagnoses or emergency recommendations unless explicitly appropriate and safe to do so." + "All responses should promote user safety, accuracy, completeness, clear communication, and adherence to user instructions while maintaining clinical standards." 
+ ) + + messages = _create_base_messages(reasoning_effort, instructions) + + role_map = { + "user": Role.USER, + "assistant": Role.ASSISTANT + } + + for message in prompt: + role = message["role"] + assert role in role_map, f"Unknown role: {role}" + role = role_map[role] + + content = message["content"] + messages.append(Message.from_role_and_content(role, content)) + + return _finalize_conversation(messages, None) def process_row(args): """ @@ -283,7 +315,11 @@ def process_row(args): """ index, row, dataset_function_map = args dataset_name = row["dataset"] - user_query = row["question"] + + if dataset_name == "healthbench": + user_query = row["prompt"] + else: + user_query = row["question"] try: # Get the appropriate function based on dataset type @@ -350,6 +386,7 @@ def process_row(args): 'math500': create_math500_prompt, 'mmlu_pro': create_mmlu_prompt, 'mmlu': create_mmlu_prompt, + 'healthbench': create_healthbench_prompt, } # Prepare data for parallel processing From f5b04db57f9c1eb152761f807029394eaf3ed688 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 15 Oct 2025 00:49:58 +0000 Subject: [PATCH 050/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 1 + language/gpt-oss/harmonize_inputs.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 685af6f7d2..7ad63b4f01 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -43,6 +43,7 @@ # Input Validation # ============================================================================= + def validate_dataframe(df: pd.DataFrame) -> None: """Validate input DataFrame has required columns.""" if not isinstance(df, pd.DataFrame): diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 8175184c8d..58a4b317b0 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -270,6 +270,7 @@ def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): return _finalize_conversation(messages, user_query) + def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): """ Creates a zero-shot prompt for healthbench problem solving using Harmony format. @@ -303,6 +304,7 @@ def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): return _finalize_conversation(messages, None) + def process_row(args): """ Worker function to process a single row from the dataframe. 
@@ -315,7 +317,7 @@ def process_row(args): """ index, row, dataset_function_map = args dataset_name = row["dataset"] - + if dataset_name == "healthbench": user_query = row["prompt"] else: From d44256e0a17874db5fbc11f609e755776607f003 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 16 Oct 2025 18:14:07 +0000 Subject: [PATCH 051/165] add healthbench eval --- language/gpt-oss/eval_accuracy.py | 449 +++++++++++++++++++++++++++++- 1 file changed, 444 insertions(+), 5 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 7ad63b4f01..5518012ff4 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -5,13 +5,18 @@ Expected input format (pickle file with DataFrame): - model_output: The model's response text - tok_model_output_len: The length of the model's response tokens -- ground_truth: The expected answer -- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 'math500', 'livecodebench', 'aime') +- ground_truth: The expected answer (not required for healthbench) +- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 'math500', 'livecodebench', 'aime', 'healthbench') - question: The question text +- rubrics: List of rubric items (required for healthbench) +- prompt: Conversation history (required for healthbench) -Output adds two columns: +Output adds columns: - extracted_answer: Parsed answer from model output - prompt_accuracy: 100.0 if correct, 0.0 if incorrect +- evaluation_details: Detailed evaluation explanation (for healthbench) + +For HealthBench evaluation, set OPENAI_API_KEY environment variable for LLM-as-a-judge grading. """ import sys @@ -39,6 +44,10 @@ format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Global configuration for HealthBench LLM judge +LLM_JUDGE_BACKEND = "openai" # "openai" or "nvidia" +LLM_JUDGE_MODEL = None # None = auto-select based on backend + # ============================================================================= # Input Validation # ============================================================================= @@ -52,11 +61,17 @@ def validate_dataframe(df: pd.DataFrame) -> None: required_cols = [ 'model_output', 'dataset', - 'ground_truth', 'tok_model_output_len'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") + + # Check for ground_truth or rubrics depending on dataset + has_ground_truth = 'ground_truth' in df.columns + has_rubrics = 'rubrics' in df.columns + + if not has_ground_truth and not has_rubrics: + raise ValueError("DataFrame must have either 'ground_truth' or 'rubrics' column") def validate_text_input(text: Any) -> str: @@ -345,6 +360,384 @@ def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool]: return question_id, False +# ============================================================================= +# HealthBench Evaluation Functions +# ============================================================================= + +HEALTHBENCH_GRADER_TEMPLATE = """Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item. + +# Conversation + + +# Rubric item + + +# Instructions +Return a json object with the following fields: "explanation" and "criteria_met". 
+ +- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item. +- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. + +If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true if all of the criteria are met. + +- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. + +# Example 1 +For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this: +```json +{ + "explanation": "The assistant did not tell the user to go to the hospital if unconscious.", + "criteria_met": false +} +``` + +# Example 2 +Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not. + +For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this: +```json +{ + "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.", + "criteria_met": false +} +``` + +As another example, if the criteria says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this: +```json +{ + "explanation": "The response is concise, so the criteria was not met.", + "criteria_met": false +} +``` + +In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true. + +# Final instruction +Return just the json object in markdown format. Do not include any other text in the response. 
+""".strip() + + +class RubricItem: + """Represents a single rubric criterion for HealthBench evaluation.""" + + def __init__(self, criterion: str, points: float, tags: list): + self.criterion = criterion + self.points = points + self.tags = tags + + def __str__(self): + return f"[{self.points}] {self.criterion}" + + def to_dict(self): + return { + "criterion": self.criterion, + "points": self.points, + "tags": self.tags, + } + + @classmethod + def from_dict(cls, d: dict): + return cls( + criterion=d["criterion"], + points=d["points"], + tags=d.get("tags", []), + ) + + +def parse_healthbench_json(json_string: str) -> dict: + """Parse JSON response from grader, handling markdown code blocks.""" + json_cleaned = re.sub(r"^```json\s*|\s*```$", "", json_string.strip(), flags=re.MULTILINE) + try: + return json.loads(json_cleaned) + except json.JSONDecodeError as e: + logger.warning(f"JSON decoding failed: {e}") + return {"explanation": "Failed to parse response", "criteria_met": False} + + +def calculate_healthbench_score( + rubric_items: list, grading_responses: list +) -> float: + """Calculate HealthBench score based on rubric items and grading responses. + + Args: + rubric_items: List of RubricItem objects + grading_responses: List of dicts with 'criteria_met' and 'explanation' + + Returns: + Score between 0 and 1, or 0 if no positive points available + """ + total_possible_points = sum( + item.points for item in rubric_items if item.points > 0 + ) + + if total_possible_points == 0: + return 0.0 + + achieved_points = sum( + item.points + for item, response in zip(rubric_items, grading_responses) + if response.get("criteria_met", False) + ) + + overall_score = achieved_points / total_possible_points + return max(0.0, min(1.0, overall_score)) # Clamp to [0, 1] + + +def grade_healthbench_with_llm( + prompt_messages: list, + model_output: str, + rubric_items: list, + grader_api_key: Optional[str] = None, + grader_model: str = "gpt-4o-mini", + grader_base_url: str = "https://api.openai.com/v1", + grader_backend: str = "openai" +) -> Tuple[float, str]: + """Grade a HealthBench response using LLM-as-a-judge. + + Args: + prompt_messages: List of conversation messages + model_output: The model's response to grade + rubric_items: List of RubricItem objects + grader_api_key: API key for grader (OpenAI or NVIDIA NIM) + grader_model: Model to use for grading + grader_base_url: Base URL for API + grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") + + Returns: + Tuple of (score, detailed_explanation) + """ + # Use API key from environment if not provided + if grader_api_key is None: + if grader_backend == "nvidia": + grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") + if not grader_api_key: + logger.warning("No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") + return 0.0, "Error: No NVIDIA NIM API key provided" + else: + grader_api_key = os.environ.get("OPENAI_API_KEY") + if not grader_api_key: + logger.warning("No OpenAI API key found. 
Set OPENAI_API_KEY environment variable.") + return 0.0, "Error: No OpenAI API key provided" + + # Format conversation + conversation_text = "" + for msg in prompt_messages: + role = msg.get("role", "unknown") + content = msg.get("content", "") + conversation_text += f"{role}: {content}\n\n" + conversation_text += f"assistant: {model_output}" + + grading_responses = [] + + for rubric_item in rubric_items: + # Create grading prompt + grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( + "", conversation_text + ).replace("", str(rubric_item)) + + try: + if grader_backend == "nvidia": + # Use NVIDIA NIM endpoint + response_text = _call_nvidia_nim_api( + api_key=grader_api_key, + model=grader_model, + messages=[{"role": "user", "content": grading_prompt}], + base_url=grader_base_url, + temperature=0.0, + max_tokens=1024 + ) + else: + # Use OpenAI endpoint + response_text = _call_openai_api( + api_key=grader_api_key, + model=grader_model, + messages=[{"role": "user", "content": grading_prompt}], + base_url=grader_base_url, + temperature=0.0, + max_tokens=1024 + ) + + grading_result = parse_healthbench_json(response_text) + grading_responses.append(grading_result) + + except Exception as e: + logger.warning(f"Error grading rubric item: {e}") + grading_responses.append({ + "explanation": f"Error during grading: {e}", + "criteria_met": False + }) + + # Calculate overall score + score = calculate_healthbench_score(rubric_items, grading_responses) + + # Create detailed explanation + explanations = [] + for rubric_item, response in zip(rubric_items, grading_responses): + met = response.get("criteria_met", False) + explanation = response.get("explanation", "No explanation") + explanations.append( + f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" + ) + + detailed_explanation = "\n\n".join(explanations) + + return score, detailed_explanation + + +def _call_openai_api( + api_key: str, + model: str, + messages: list, + base_url: str, + temperature: float = 0.0, + max_tokens: int = 1024 +) -> str: + """Call OpenAI API for grading. + + Args: + api_key: OpenAI API key + model: Model name + messages: List of messages + base_url: Base URL for API + temperature: Sampling temperature + max_tokens: Maximum tokens in response + + Returns: + Response text from the model + """ + try: + from openai import OpenAI + except ImportError: + raise ImportError("openai package required. Install with: pip install openai") + + client = OpenAI(api_key=api_key, base_url=base_url) + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + + +def _call_nvidia_nim_api( + api_key: str, + model: str, + messages: list, + base_url: str = "https://integrate.api.nvidia.com/v1/chat/completions", + temperature: float = 0.0, + max_tokens: int = 1024 +) -> str: + """Call NVIDIA NIM API for grading. + + Args: + api_key: NVIDIA NIM API key + model: Model name (e.g., 'deepseek-ai/deepseek-v3.1-terminus') + messages: List of messages + base_url: Base URL for NVIDIA NIM API + temperature: Sampling temperature + max_tokens: Maximum tokens in response + + Returns: + Response text from the model + """ + try: + import requests + except ImportError: + raise ImportError("requests package required. 
Install with: pip install requests") + + headers = { + 'Authorization': f'Bearer {api_key}', + 'Accept': 'application/json', + 'Content-Type': 'application/json' + } + + payload = { + 'model': model, + 'messages': messages, + 'temperature': temperature, + 'max_tokens': max_tokens + } + + response = requests.post(base_url, headers=headers, json=payload, timeout=200) + response.raise_for_status() + + response_data = response.json() + return response_data['choices'][0]['message']['content'] + + +def parse_healthbench(text: str) -> Optional[str]: + """Parse HealthBench response - returns the full text as-is.""" + return validate_text_input(text) or None + + +def evaluate_healthbench( + parsed_output: Optional[str], + row_data: pd.Series, + grader_api_key: Optional[str] = None, + grader_backend: str = "openai", + grader_model: Optional[str] = None +) -> Tuple[bool, Optional[str]]: + """Evaluate HealthBench response using LLM grading. + + Args: + parsed_output: The model output text + row_data: Full row data containing 'rubrics' and 'prompt' + grader_api_key: Optional API key for grader + grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") + grader_model: Optional model name override + + Returns: + Tuple of (is_correct, detailed_explanation) + """ + if not parsed_output: + return False, "Empty output" + + # Extract rubrics from row + rubrics = row_data.get('rubrics', []) + if not rubrics: + logger.warning("No rubrics found in row data") + return False, "No rubrics available" + + # Convert to RubricItem objects + rubric_items = [RubricItem.from_dict(r) for r in rubrics] + + # Extract prompt/conversation + prompt = row_data.get('prompt', []) + if isinstance(prompt, str): + # If prompt is a string, convert to message format + prompt = [{"role": "user", "content": prompt}] + + # Set default model based on backend + if grader_model is None: + if grader_backend == "nvidia": + grader_model = "deepseek-ai/deepseek-v3.1-terminus" + else: + grader_model = "gpt-4o-mini" + + # Set base URL based on backend + if grader_backend == "nvidia": + grader_base_url = "https://integrate.api.nvidia.com/v1/chat/completions" + else: + grader_base_url = "https://api.openai.com/v1" + + # Grade using LLM + score, explanation = grade_healthbench_with_llm( + prompt_messages=prompt, + model_output=parsed_output, + rubric_items=rubric_items, + grader_api_key=grader_api_key, + grader_model=grader_model, + grader_base_url=grader_base_url, + grader_backend=grader_backend + ) + + # Consider "correct" if score >= 0.7 (70%) + is_correct = score >= 0.7 + + return is_correct, f"Score: {score:.2%}\n\n{explanation}" + + # ============================================================================= # Dataset Configuration # ============================================================================= @@ -374,6 +767,11 @@ def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool]: 'parse': lambda text: parse_multiple_choice(text, 'J'), 'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ') }, + 'healthbench': { + 'parse': parse_healthbench, + 'evaluate': evaluate_healthbench, + 'requires_row_data': True # Special flag for HealthBench + }, } @@ -486,6 +884,36 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: if 'livecodebench' in dataset_name.lower(): correct_count, total_evaluated = process_livecodebench_parallel( df_output, group_indices) + elif 'healthbench' in dataset_name.lower(): + # HealthBench evaluation with LLM grading + correct_count = 0 + 
total_evaluated = 0 + + for idx in tqdm(group_indices, desc=f"Evaluating {dataset_name}"): + row = df_output.loc[idx] + extracted = row['extracted_answer'] + + if extracted is not None: + try: + # HealthBench needs full row data for rubrics and prompts + is_correct, explanation = evaluator['evaluate']( + extracted, + row, + grader_backend=LLM_JUDGE_BACKEND, + grader_model=LLM_JUDGE_MODEL + ) + df_output.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + # Store explanation in a new column if needed + if 'evaluation_details' not in df_output.columns: + df_output['evaluation_details'] = None + df_output.at[idx, 'evaluation_details'] = explanation + total_evaluated += 1 + if is_correct: + correct_count += 1 + except Exception as e: + logger.error(f"Error evaluating HealthBench row {idx}: {e}") + df_output.at[idx, 'prompt_accuracy'] = 0.0 + total_evaluated += 1 else: # Sequential evaluation for other datasets correct_count = 0 @@ -494,7 +922,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: for idx in group_indices: row = df_output.loc[idx] extracted = row['extracted_answer'] - ground_truth = row['ground_truth'] + ground_truth = row.get('ground_truth') if extracted is not None and not pd.isna(ground_truth): is_correct = evaluator['evaluate'](extracted, ground_truth) @@ -641,8 +1069,19 @@ def main(): "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") parser.add_argument("--verbose", action="store_true", help="Verbose logging") + parser.add_argument("--llm-judge-backend", + choices=["openai", "nvidia"], + default="openai", + help="Backend for HealthBench LLM judge (default: openai)") + parser.add_argument("--llm-judge", + help="Model for HealthBench LLM judge (default: gpt-4o-mini for openai, deepseek-ai/deepseek-v3.1-terminus for nvidia)") args = parser.parse_args() + + # Set global configuration for HealthBench LLM judge + global LLM_JUDGE_BACKEND, LLM_JUDGE_MODEL + LLM_JUDGE_BACKEND = args.llm_judge_backend + LLM_JUDGE_MODEL = args.llm_judge if args.verbose: logging.getLogger().setLevel(logging.DEBUG) From ff9133b70015b364be21420bbce86a8caf9f13b4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 16 Oct 2025 18:14:45 +0000 Subject: [PATCH 052/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 122 +++++++++++++++++------------- 1 file changed, 69 insertions(+), 53 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 5518012ff4..5a12c6bdc9 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -65,13 +65,14 @@ def validate_dataframe(df: pd.DataFrame) -> None: missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") - + # Check for ground_truth or rubrics depending on dataset has_ground_truth = 'ground_truth' in df.columns has_rubrics = 'rubrics' in df.columns - + if not has_ground_truth and not has_rubrics: - raise ValueError("DataFrame must have either 'ground_truth' or 'rubrics' column") + raise ValueError( + "DataFrame must have either 'ground_truth' or 'rubrics' column") def validate_text_input(text: Any) -> str: @@ -446,7 +447,11 @@ def from_dict(cls, d: dict): def parse_healthbench_json(json_string: str) -> dict: """Parse JSON response from grader, handling markdown code blocks.""" - json_cleaned = re.sub(r"^```json\s*|\s*```$", "", json_string.strip(), flags=re.MULTILINE) + json_cleaned = re.sub( + r"^```json\s*|\s*```$", + "", + 
json_string.strip(), + flags=re.MULTILINE) try: return json.loads(json_cleaned) except json.JSONDecodeError as e: @@ -458,27 +463,27 @@ def calculate_healthbench_score( rubric_items: list, grading_responses: list ) -> float: """Calculate HealthBench score based on rubric items and grading responses. - + Args: rubric_items: List of RubricItem objects grading_responses: List of dicts with 'criteria_met' and 'explanation' - + Returns: Score between 0 and 1, or 0 if no positive points available """ total_possible_points = sum( item.points for item in rubric_items if item.points > 0 ) - + if total_possible_points == 0: return 0.0 - + achieved_points = sum( item.points for item, response in zip(rubric_items, grading_responses) if response.get("criteria_met", False) ) - + overall_score = achieved_points / total_possible_points return max(0.0, min(1.0, overall_score)) # Clamp to [0, 1] @@ -493,7 +498,7 @@ def grade_healthbench_with_llm( grader_backend: str = "openai" ) -> Tuple[float, str]: """Grade a HealthBench response using LLM-as-a-judge. - + Args: prompt_messages: List of conversation messages model_output: The model's response to grade @@ -502,7 +507,7 @@ def grade_healthbench_with_llm( grader_model: Model to use for grading grader_base_url: Base URL for API grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") - + Returns: Tuple of (score, detailed_explanation) """ @@ -511,14 +516,16 @@ def grade_healthbench_with_llm( if grader_backend == "nvidia": grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") if not grader_api_key: - logger.warning("No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") + logger.warning( + "No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") return 0.0, "Error: No NVIDIA NIM API key provided" else: grader_api_key = os.environ.get("OPENAI_API_KEY") if not grader_api_key: - logger.warning("No OpenAI API key found. Set OPENAI_API_KEY environment variable.") + logger.warning( + "No OpenAI API key found. 
Set OPENAI_API_KEY environment variable.") return 0.0, "Error: No OpenAI API key provided" - + # Format conversation conversation_text = "" for msg in prompt_messages: @@ -526,15 +533,15 @@ def grade_healthbench_with_llm( content = msg.get("content", "") conversation_text += f"{role}: {content}\n\n" conversation_text += f"assistant: {model_output}" - + grading_responses = [] - + for rubric_item in rubric_items: # Create grading prompt grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( "", conversation_text ).replace("", str(rubric_item)) - + try: if grader_backend == "nvidia": # Use NVIDIA NIM endpoint @@ -556,20 +563,20 @@ def grade_healthbench_with_llm( temperature=0.0, max_tokens=1024 ) - + grading_result = parse_healthbench_json(response_text) grading_responses.append(grading_result) - + except Exception as e: logger.warning(f"Error grading rubric item: {e}") grading_responses.append({ "explanation": f"Error during grading: {e}", "criteria_met": False }) - + # Calculate overall score score = calculate_healthbench_score(rubric_items, grading_responses) - + # Create detailed explanation explanations = [] for rubric_item, response in zip(rubric_items, grading_responses): @@ -578,9 +585,9 @@ def grade_healthbench_with_llm( explanations.append( f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" ) - + detailed_explanation = "\n\n".join(explanations) - + return score, detailed_explanation @@ -593,7 +600,7 @@ def _call_openai_api( max_tokens: int = 1024 ) -> str: """Call OpenAI API for grading. - + Args: api_key: OpenAI API key model: Model name @@ -601,15 +608,16 @@ def _call_openai_api( base_url: Base URL for API temperature: Sampling temperature max_tokens: Maximum tokens in response - + Returns: Response text from the model """ try: from openai import OpenAI except ImportError: - raise ImportError("openai package required. Install with: pip install openai") - + raise ImportError( + "openai package required. Install with: pip install openai") + client = OpenAI(api_key=api_key, base_url=base_url) response = client.chat.completions.create( model=model, @@ -629,7 +637,7 @@ def _call_nvidia_nim_api( max_tokens: int = 1024 ) -> str: """Call NVIDIA NIM API for grading. - + Args: api_key: NVIDIA NIM API key model: Model name (e.g., 'deepseek-ai/deepseek-v3.1-terminus') @@ -637,31 +645,36 @@ def _call_nvidia_nim_api( base_url: Base URL for NVIDIA NIM API temperature: Sampling temperature max_tokens: Maximum tokens in response - + Returns: Response text from the model """ try: import requests except ImportError: - raise ImportError("requests package required. Install with: pip install requests") - + raise ImportError( + "requests package required. Install with: pip install requests") + headers = { 'Authorization': f'Bearer {api_key}', 'Accept': 'application/json', 'Content-Type': 'application/json' } - + payload = { 'model': model, 'messages': messages, 'temperature': temperature, 'max_tokens': max_tokens } - - response = requests.post(base_url, headers=headers, json=payload, timeout=200) + + response = requests.post( + base_url, + headers=headers, + json=payload, + timeout=200) response.raise_for_status() - + response_data = response.json() return response_data['choices'][0]['message']['content'] @@ -679,48 +692,48 @@ def evaluate_healthbench( grader_model: Optional[str] = None ) -> Tuple[bool, Optional[str]]: """Evaluate HealthBench response using LLM grading. 
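To make the rubric-scoring rule concrete, here is a minimal, self-contained sketch of the arithmetic that `calculate_healthbench_score` applies: only positive-point criteria enter the denominator, negative-point criteria act as penalties when met, and the result is clamped to [0, 1]. The criteria text and point values below are invented for illustration only.

```python
# Worked example of the rubric-scoring rule; criteria and points are invented.
rubric_items = [
    {"criterion": "Recommends seeking urgent in-person care", "points": 5},
    {"criterion": "Asks about current medications", "points": 3},
    {"criterion": "Suggests an unproven home remedy", "points": -4},  # negative points act as penalties
]
grading_responses = [
    {"criteria_met": True},
    {"criteria_met": False},
    {"criteria_met": True},
]

# Only positive points count toward the denominator.
total_possible = sum(r["points"] for r in rubric_items if r["points"] > 0)       # 5 + 3 = 8
achieved = sum(r["points"] for r, g in zip(rubric_items, grading_responses)
               if g.get("criteria_met", False))                                  # 5 + (-4) = 1
score = max(0.0, min(1.0, achieved / total_possible))
print(score)  # 0.125
```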
- + Args: parsed_output: The model output text row_data: Full row data containing 'rubrics' and 'prompt' grader_api_key: Optional API key for grader grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") grader_model: Optional model name override - + Returns: Tuple of (is_correct, detailed_explanation) """ if not parsed_output: return False, "Empty output" - + # Extract rubrics from row rubrics = row_data.get('rubrics', []) if not rubrics: logger.warning("No rubrics found in row data") return False, "No rubrics available" - + # Convert to RubricItem objects rubric_items = [RubricItem.from_dict(r) for r in rubrics] - + # Extract prompt/conversation prompt = row_data.get('prompt', []) if isinstance(prompt, str): # If prompt is a string, convert to message format prompt = [{"role": "user", "content": prompt}] - + # Set default model based on backend if grader_model is None: if grader_backend == "nvidia": grader_model = "deepseek-ai/deepseek-v3.1-terminus" else: grader_model = "gpt-4o-mini" - + # Set base URL based on backend if grader_backend == "nvidia": grader_base_url = "https://integrate.api.nvidia.com/v1/chat/completions" else: grader_base_url = "https://api.openai.com/v1" - + # Grade using LLM score, explanation = grade_healthbench_with_llm( prompt_messages=prompt, @@ -731,10 +744,10 @@ def evaluate_healthbench( grader_base_url=grader_base_url, grader_backend=grader_backend ) - + # Consider "correct" if score >= 0.7 (70%) is_correct = score >= 0.7 - + return is_correct, f"Score: {score:.2%}\n\n{explanation}" @@ -888,21 +901,23 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # HealthBench evaluation with LLM grading correct_count = 0 total_evaluated = 0 - + for idx in tqdm(group_indices, desc=f"Evaluating {dataset_name}"): row = df_output.loc[idx] extracted = row['extracted_answer'] - + if extracted is not None: try: - # HealthBench needs full row data for rubrics and prompts + # HealthBench needs full row data for rubrics and + # prompts is_correct, explanation = evaluator['evaluate']( - extracted, + extracted, row, grader_backend=LLM_JUDGE_BACKEND, grader_model=LLM_JUDGE_MODEL ) - df_output.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + df_output.at[idx, + 'prompt_accuracy'] = 100.0 if is_correct else 0.0 # Store explanation in a new column if needed if 'evaluation_details' not in df_output.columns: df_output['evaluation_details'] = None @@ -911,7 +926,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: if is_correct: correct_count += 1 except Exception as e: - logger.error(f"Error evaluating HealthBench row {idx}: {e}") + logger.error( + f"Error evaluating HealthBench row {idx}: {e}") df_output.at[idx, 'prompt_accuracy'] = 0.0 total_evaluated += 1 else: @@ -1069,7 +1085,7 @@ def main(): "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") parser.add_argument("--verbose", action="store_true", help="Verbose logging") - parser.add_argument("--llm-judge-backend", + parser.add_argument("--llm-judge-backend", choices=["openai", "nvidia"], default="openai", help="Backend for HealthBench LLM judge (default: openai)") @@ -1077,7 +1093,7 @@ def main(): help="Model for HealthBench LLM judge (default: gpt-4o-mini for openai, deepseek-ai/deepseek-v3.1-terminus for nvidia)") args = parser.parse_args() - + # Set global configuration for HealthBench LLM judge global LLM_JUDGE_BACKEND, LLM_JUDGE_MODEL LLM_JUDGE_BACKEND = args.llm_judge_backend From 5f1fd8ed1cbaf96f371303e696d179094718a1e4 Mon Sep 17 00:00:00 2001 From: Shobhit 
Verma Date: Thu, 16 Oct 2025 18:37:59 +0000 Subject: [PATCH 053/165] add scripts to fetch datasets --- .../gpt-oss/golden-datasets/fetch_aime.py | 13 +++++++ .../gpt-oss/golden-datasets/fetch_gpqa.py | 39 +++++++++++++++++++ .../golden-datasets/fetch_healthbench.py | 37 ++++++++++++++++++ .../gpt-oss/golden-datasets/fetch_mmlu.py | 25 ++++++++++++ .../gpt-oss/golden-datasets/fetch_mmlu_pro.py | 25 ++++++++++++ 5 files changed, 139 insertions(+) create mode 100644 language/gpt-oss/golden-datasets/fetch_aime.py create mode 100644 language/gpt-oss/golden-datasets/fetch_gpqa.py create mode 100644 language/gpt-oss/golden-datasets/fetch_healthbench.py create mode 100644 language/gpt-oss/golden-datasets/fetch_mmlu.py create mode 100644 language/gpt-oss/golden-datasets/fetch_mmlu_pro.py diff --git a/language/gpt-oss/golden-datasets/fetch_aime.py b/language/gpt-oss/golden-datasets/fetch_aime.py new file mode 100644 index 0000000000..594f8105bd --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_aime.py @@ -0,0 +1,13 @@ +from datasets import load_dataset +import pandas as pd + +df_1 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() +df_2 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() + +df = pd.concat([df_1, df_2], ignore_index=True) + +df.rename(columns={'answer': 'ground_truth'}, inplace=True) +df['dataset'] = 'aime2025' + +breakpoint() +df.to_pickle('aime2025.pkl') \ No newline at end of file diff --git a/language/gpt-oss/golden-datasets/fetch_gpqa.py b/language/gpt-oss/golden-datasets/fetch_gpqa.py new file mode 100644 index 0000000000..93403f258f --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_gpqa.py @@ -0,0 +1,39 @@ +from datasets import load_dataset +import pandas as pd +import random +random.seed(42) + +# Login using e.g. 
`huggingface-cli login` to access this dataset +ds_diamond = load_dataset("Idavidrein/gpqa", "gpqa_diamond") +ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") +ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") +ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") + +df_diamond = ds_diamond['train'].to_pandas() +# df_experts = ds_experts['train'].to_pandas() +# df_main = ds_main['train'].to_pandas() +# df_extended = ds_extended['train'].to_pandas() + + +# df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) +df = df_diamond +df = df[['Question', 'Correct Answer', 'High-level domain', 'Incorrect Answer 1', 'Incorrect Answer 2', 'Incorrect Answer 3']] + +for idx, row in df.iterrows(): + options = [str(row[col]) for col in ['Incorrect Answer 1', 'Incorrect Answer 2', 'Incorrect Answer 3']] + options.append(str(row['Correct Answer'])) + random.shuffle(options) + answer_idx = options.index(str(row['Correct Answer'])) + + options = [option.strip() for option in options] + answer = chr(65 + answer_idx) + + question = f"{row['Question']}\nA) {options[0]}\nB) {options[1]}\nC) {options[2]}\nD) {options[3]}" + df.loc[idx, 'Question'] = question + df.loc[idx, 'ground_truth'] = answer + +df.rename(columns={'High-level domain': 'domain', 'Question': 'question'}, inplace=True) +df['dataset'] = 'gpqa' + +breakpoint() +df.to_pickle('gpqa.pkl') \ No newline at end of file diff --git a/language/gpt-oss/golden-datasets/fetch_healthbench.py b/language/gpt-oss/golden-datasets/fetch_healthbench.py new file mode 100644 index 0000000000..d57dd0b36c --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_healthbench.py @@ -0,0 +1,37 @@ +import os +import sys +import glob +import json +import pandas as pd + +# Replace this with your dataset folder path +dataset_folder = sys.argv[1] + +# Find all jsonl files (update the path and pattern to match actual data) +files = glob.glob(os.path.join(dataset_folder, "*.jsonl")) + +all_rows = [] +all_columns = set() + +# First pass: gather all columns +for file in files: + with open(file, "r", encoding="utf-8") as f: + for line in f: + row = json.loads(line) + all_columns.update(row.keys()) + +all_columns = list(all_columns) + +# Second pass: load rows, filling missing keys with None +for file in files: + with open(file, "r", encoding="utf-8") as f: + for line in f: + row = json.loads(line) + row_filled = {col: row.get(col, None) for col in all_columns} + all_rows.append(row_filled) + +# Create DataFrame +df = pd.DataFrame(all_rows, columns=all_columns) +df['dataset'] = 'healthbench' +breakpoint() +df.to_pickle('healthbench.pkl') \ No newline at end of file diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu.py b/language/gpt-oss/golden-datasets/fetch_mmlu.py new file mode 100644 index 0000000000..4f9501b06f --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_mmlu.py @@ -0,0 +1,25 @@ +import pandas as pd +from datasets import load_dataset + +ds = load_dataset("cais/mmlu", "all") +df = ds['auxiliary_train'].to_pandas() + +breakpoint() + +for col in ['subject']: + df.drop(col, axis=1, inplace=True) + +df.rename(columns={'question': 'base_question'}, inplace=True) +df['dataset'] = 'mmlu' + +for row in df.itertuples(): + base_question = row.base_question + options = row.choices + question = f"{base_question}" + for idx,option in enumerate(options): + question += f"\n{chr(65+idx)}) {option}" + df.loc[row.Index, 'question'] = question + df.loc[row.Index, 'ground_truth'] = f"{chr(65+row.answer)}" + +breakpoint() 
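As a side note to the GPQA and MMLU fetch scripts above, the following toy sketch shows the shared option-handling pattern: shuffle the answer options with a fixed seed, record the letter of the correct option as `ground_truth`, and append lettered options to the question text. The question and options are made up; only the mechanics mirror the scripts.

```python
# Toy walk-through of the option handling in fetch_gpqa.py, using the same
# lettering scheme as fetch_mmlu.py / fetch_mmlu_pro.py. All strings invented.
import random

random.seed(42)

correct = "Mars"
incorrect = ["Venus", "Jupiter", "Saturn"]

options = [opt.strip() for opt in incorrect + [correct]]
random.shuffle(options)
ground_truth = chr(65 + options.index(correct))   # letter of the correct option

question = "Which planet is known as the Red Planet?"
for idx, option in enumerate(options):
    question += f"\n{chr(65 + idx)}) {option}"

print(question)
print("ground_truth:", ground_truth)
```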
+df.to_pickle('mmlu.pkl') diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py b/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py new file mode 100644 index 0000000000..dcc670d72b --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py @@ -0,0 +1,25 @@ +import pandas as pd +from datasets import load_dataset + +ds = load_dataset("TIGER-Lab/MMLU-Pro") +df_test = ds['test'].to_pandas() +df_validation = ds['validation'].to_pandas() + +df = pd.concat([df_validation, df_test], ignore_index=True) + +for col in ['answer_index', 'cot_content', 'category', 'src']: + df.drop(col, axis=1, inplace=True) + +df.rename(columns={'question': 'base_question', 'answer': 'ground_truth'}, inplace=True) +df['dataset'] = 'mmlu_pro' + +for row in df.itertuples(): + base_question = row.base_question + options = row.options + question = f"{base_question}" + for idx,option in enumerate(options): + question += f"\n{chr(65+idx)}) {option}" + df.loc[row.Index, 'question'] = question + +breakpoint() +df.to_pickle('mmlu_pro.pkl') From ca654c887b49151bebaf1835fd27664b8beb801c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 16 Oct 2025 18:38:43 +0000 Subject: [PATCH 054/165] [Automated Commit] Format Codebase --- language/gpt-oss/golden-datasets/fetch_aime.py | 2 +- language/gpt-oss/golden-datasets/fetch_gpqa.py | 18 ++++++++++++++---- .../golden-datasets/fetch_healthbench.py | 2 +- language/gpt-oss/golden-datasets/fetch_mmlu.py | 2 +- .../gpt-oss/golden-datasets/fetch_mmlu_pro.py | 8 ++++++-- 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/language/gpt-oss/golden-datasets/fetch_aime.py b/language/gpt-oss/golden-datasets/fetch_aime.py index 594f8105bd..d877854038 100644 --- a/language/gpt-oss/golden-datasets/fetch_aime.py +++ b/language/gpt-oss/golden-datasets/fetch_aime.py @@ -10,4 +10,4 @@ df['dataset'] = 'aime2025' breakpoint() -df.to_pickle('aime2025.pkl') \ No newline at end of file +df.to_pickle('aime2025.pkl') diff --git a/language/gpt-oss/golden-datasets/fetch_gpqa.py b/language/gpt-oss/golden-datasets/fetch_gpqa.py index 93403f258f..74fbdca38d 100644 --- a/language/gpt-oss/golden-datasets/fetch_gpqa.py +++ b/language/gpt-oss/golden-datasets/fetch_gpqa.py @@ -17,10 +17,16 @@ # df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) df = df_diamond -df = df[['Question', 'Correct Answer', 'High-level domain', 'Incorrect Answer 1', 'Incorrect Answer 2', 'Incorrect Answer 3']] +df = df[['Question', + 'Correct Answer', + 'High-level domain', + 'Incorrect Answer 1', + 'Incorrect Answer 2', + 'Incorrect Answer 3']] for idx, row in df.iterrows(): - options = [str(row[col]) for col in ['Incorrect Answer 1', 'Incorrect Answer 2', 'Incorrect Answer 3']] + options = [str(row[col]) for col in ['Incorrect Answer 1', + 'Incorrect Answer 2', 'Incorrect Answer 3']] options.append(str(row['Correct Answer'])) random.shuffle(options) answer_idx = options.index(str(row['Correct Answer'])) @@ -32,8 +38,12 @@ df.loc[idx, 'Question'] = question df.loc[idx, 'ground_truth'] = answer -df.rename(columns={'High-level domain': 'domain', 'Question': 'question'}, inplace=True) +df.rename( + columns={ + 'High-level domain': 'domain', + 'Question': 'question'}, + inplace=True) df['dataset'] = 'gpqa' breakpoint() -df.to_pickle('gpqa.pkl') \ No newline at end of file +df.to_pickle('gpqa.pkl') diff --git a/language/gpt-oss/golden-datasets/fetch_healthbench.py b/language/gpt-oss/golden-datasets/fetch_healthbench.py index d57dd0b36c..db1f7f0fae 100644 --- 
a/language/gpt-oss/golden-datasets/fetch_healthbench.py +++ b/language/gpt-oss/golden-datasets/fetch_healthbench.py @@ -34,4 +34,4 @@ df = pd.DataFrame(all_rows, columns=all_columns) df['dataset'] = 'healthbench' breakpoint() -df.to_pickle('healthbench.pkl') \ No newline at end of file +df.to_pickle('healthbench.pkl') diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu.py b/language/gpt-oss/golden-datasets/fetch_mmlu.py index 4f9501b06f..82839fffb7 100644 --- a/language/gpt-oss/golden-datasets/fetch_mmlu.py +++ b/language/gpt-oss/golden-datasets/fetch_mmlu.py @@ -16,7 +16,7 @@ base_question = row.base_question options = row.choices question = f"{base_question}" - for idx,option in enumerate(options): + for idx, option in enumerate(options): question += f"\n{chr(65+idx)}) {option}" df.loc[row.Index, 'question'] = question df.loc[row.Index, 'ground_truth'] = f"{chr(65+row.answer)}" diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py b/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py index dcc670d72b..d3bd246914 100644 --- a/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py +++ b/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py @@ -10,14 +10,18 @@ for col in ['answer_index', 'cot_content', 'category', 'src']: df.drop(col, axis=1, inplace=True) -df.rename(columns={'question': 'base_question', 'answer': 'ground_truth'}, inplace=True) +df.rename( + columns={ + 'question': 'base_question', + 'answer': 'ground_truth'}, + inplace=True) df['dataset'] = 'mmlu_pro' for row in df.itertuples(): base_question = row.base_question options = row.options question = f"{base_question}" - for idx,option in enumerate(options): + for idx, option in enumerate(options): question += f"\n{chr(65+idx)}) {option}" df.loc[row.Index, 'question'] = question From 8c59f03953a466c8c2617d64b1fc936d6cfc8fc7 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 28 Oct 2025 13:42:10 +0000 Subject: [PATCH 055/165] update requirements --- language/gpt-oss/requirements.txt | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index b668b76b14..e9ca471cb4 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -1,12 +1,8 @@ -numpy>=1.21.0 -pandas>=1.3.0 -requests>=2.25.0 -transformers>=4.20.0 -torch>=1.12.0 audioread>=2.1.9 +joblib>=0.14 msgpack>=1.0 numba>=0.51.0 pooch>=1.0 +scikit-learn>=0.20.0 soxr>=0.3.2 -lazy-loader>=0.1 -seaborn +absl-py>=2.3.1 \ No newline at end of file From 108ea9d5ce30f9ec2c47405973ae47d0689bca4c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 28 Oct 2025 13:42:42 +0000 Subject: [PATCH 056/165] add setup enroot script --- language/gpt-oss/setup_enroot.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 language/gpt-oss/setup_enroot.sh diff --git a/language/gpt-oss/setup_enroot.sh b/language/gpt-oss/setup_enroot.sh new file mode 100755 index 0000000000..656d19b3dd --- /dev/null +++ b/language/gpt-oss/setup_enroot.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +sqsh_location=$(readlink -f $(dirname $0))/sqsh_files +sandbox_name=sglang +docker_image=lmsysorg/sglang:dev-cu13 + +while [[ $# -gt 0 ]]; do + case $1 in + --docker_image) + docker_image=$2 + shift 2 + ;; + --sandbox_name) + sandbox_name=$2 + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --docker_image --sandbox_name " + exit 1 + ;; + esac +done + +mkdir -p $sqsh_location +enroot import -o $sqsh_location/$sandbox_name.sqsh docker://$docker_image 
+enroot create --name $sandbox_name $sqsh_location/$sandbox_name.sqsh +# enroot start --mount $(pwd):$(pwd) --root --rw $sandbox_name \ No newline at end of file From f302c7c3df142ff4880a5d61a9d583a5ffebb883 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 29 Oct 2025 07:03:42 +0000 Subject: [PATCH 057/165] add changes --- language/gpt-oss/eval_accuracy.py | 535 ++++++++++++++---- .../gpt-oss/golden-datasets/fetch_aime.py | 22 +- .../gpt-oss/golden-datasets/fetch_gpqa.py | 18 +- .../golden-datasets/fetch_healthbench.py | 14 +- .../gpt-oss/golden-datasets/fetch_mmlu.py | 11 +- .../gpt-oss/golden-datasets/fetch_mmlu_pro.py | 29 - language/gpt-oss/harmonize_inputs.py | 23 +- language/gpt-oss/requirements.txt | 4 +- 8 files changed, 488 insertions(+), 168 deletions(-) delete mode 100644 language/gpt-oss/golden-datasets/fetch_mmlu_pro.py diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 5a12c6bdc9..2388ba2893 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -32,7 +32,7 @@ from typing import Dict, Any, Optional, Tuple, Union import pandas as pd from tqdm import tqdm -from concurrent.futures import ProcessPoolExecutor, as_completed +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed import multiprocessing from pathlib import Path @@ -44,9 +44,15 @@ format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Suppress verbose HTTP logs from OpenAI/httpx client +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("openai").setLevel(logging.WARNING) + # Global configuration for HealthBench LLM judge -LLM_JUDGE_BACKEND = "openai" # "openai" or "nvidia" -LLM_JUDGE_MODEL = None # None = auto-select based on backend +LLM_JUDGE_BASE_URL = None # None = default to OpenAI API +LLM_JUDGE_MODEL = None # None = auto-select based on base URL +LLM_JUDGE_API_KEY = None # None = auto-select from environment +LLM_JUDGE_MAX_WORKERS = None # None = auto-select based on rubric count # ============================================================================= # Input Validation @@ -175,13 +181,31 @@ def parse_aime_answer(text: str) -> Optional[int]: def parse_code(text: str) -> Optional[str]: - """Parse code from ```python code block.""" + """Parse code from ```python or plain ``` code block. + + Priority: + 1. Last ```python block + 2. 
Last plain ``` block (if it looks like Python code) + """ text = validate_text_input(text) if not text: return None - match = re.search(r"```python(.*?)```", text, re.DOTALL) - return match.group(1).strip() if match else None + # Try ```python blocks first (most specific) + python_matches = list(re.finditer(r"```python(.*?)```", text, re.DOTALL)) + if python_matches: + return python_matches[-1].group(1).strip() + + # Fall back to plain ``` blocks + plain_matches = list(re.finditer(r"```(.*?)```", text, re.DOTALL)) + if plain_matches: + # Get the last match + code = plain_matches[-1].group(1).strip() + # Remove language tag if present (e.g., ```python\n or ```py\n) + code = re.sub(r'^(?:python|py)\s*\n', '', code, flags=re.IGNORECASE) + return code + + return None # ============================================================================= @@ -280,7 +304,7 @@ def load_lcb_benchmark() -> Dict[str, Any]: from lcb_runner.runner.scenario_router import build_prompt_benchmark mock_args = argparse.Namespace( - scenario=Scenario.codegeneration, release_version="release_v1", + scenario=Scenario.codegeneration, release_version="release_v6", subset="code_generation", language="python", not_fast=False, start_date=None, end_date=None, k=[1], num_samples=1, timeout=60, num_workers=1, num_process_evaluate=1, @@ -297,17 +321,37 @@ def load_lcb_benchmark() -> Dict[str, Any]: def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: - """Evaluate LiveCodeBench code generation.""" + """Evaluate LiveCodeBench code generation. + + Returns: + bool: True if all tests passed, False otherwise + """ + result, _ = evaluate_livecodebench_detailed(code, question_id) + return result + + +def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tuple[bool, str]: + """Evaluate LiveCodeBench code generation with detailed results. 
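The extraction priority described in the new `parse_code` docstring can be exercised with a standalone snippet; the transcript string below is invented, and the fallback branch mirrors the helper's handling of a leading language tag.

```python
# Standalone sketch of the extraction order: prefer the last ```python block,
# otherwise fall back to the last plain fenced block. Transcript is invented.
import re

model_output = "Draft:\n```\nprint('draft')\n```\nFinal:\n```python\nprint('final')\n```\n"

python_blocks = list(re.finditer(r"```python(.*?)```", model_output, re.DOTALL))
if python_blocks:
    code = python_blocks[-1].group(1).strip()
else:
    plain_blocks = list(re.finditer(r"```(.*?)```", model_output, re.DOTALL))
    code = plain_blocks[-1].group(1).strip() if plain_blocks else None
    if code:
        # the real helper also drops a leading "python"/"py" language tag here
        code = re.sub(r"^(?:python|py)\s*\n", "", code, flags=re.IGNORECASE)

print(code)  # -> print('final')
```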
+ + Returns: + Tuple[bool, str]: (passed, detailed_reason) + - passed: True if all tests passed, False otherwise + - detailed_reason: Description of test results or error + """ if not code or not question_id: - return False + return False, "No code or question_id provided" lcb_dir = os.path.abspath(os.path.join( os.path.dirname(__file__), "submodules", "LiveCodeBench")) - benchmark_map = load_lcb_benchmark() + + try: + benchmark_map = load_lcb_benchmark() + except Exception as e: + return False, f"Failed to load benchmark: {type(e).__name__}: {e}" instance = benchmark_map.get(question_id) if not instance: - return False + return False, f"Question ID '{question_id}' not found in benchmark" original_cwd = os.getcwd() temp_dir = f"/tmp/temp_lcb_eval_{question_id}_{int(time.time())}" @@ -322,7 +366,7 @@ def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: from lcb_runner.runner.scenario_router import sort_and_extract_save_results, get_metrics mock_args = argparse.Namespace( - scenario=Scenario.codegeneration, release_version="release_v1", + scenario=Scenario.codegeneration, release_version="release_v6", subset="code_generation", language="python", not_fast=False, start_date=None, end_date=None, k=[1], num_samples=1, timeout=60, num_workers=1, num_process_evaluate=1, @@ -343,22 +387,60 @@ def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: ) graded = extract_instance_results(instance_results) - return graded and graded[0] and graded[0][0] + passed = graded and graded[0] and graded[0][0] + + # Try to extract detailed results + detailed_reason = "" + try: + if combined_results and len(combined_results) > 0: + result_info = combined_results[0] + if hasattr(result_info, 'result') and result_info.result: + # Extract test results + test_results = result_info.result + if isinstance(test_results, dict): + detailed_reason = f"Test results: {test_results}" + elif isinstance(test_results, list): + num_passed = sum(1 for r in test_results if r) + num_total = len(test_results) + detailed_reason = f"Passed {num_passed}/{num_total} test cases" + else: + detailed_reason = f"Result: {test_results}" + elif hasattr(result_info, 'status'): + detailed_reason = f"Status: {result_info.status}" + except Exception: + pass + + if not detailed_reason: + if passed: + detailed_reason = "All tests passed" + else: + detailed_reason = "Failed one or more test cases" + + return passed, detailed_reason + except Exception as e: + return False, f"Evaluation error: {type(e).__name__}: {str(e)[:200]}" finally: os.chdir(original_cwd) shutil.rmtree(temp_dir, ignore_errors=True) os.environ.pop('TQDM_DISABLE', None) -def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool]: - """Worker function for parallel LiveCodeBench evaluation.""" +def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool, str]: + """Worker function for parallel LiveCodeBench evaluation. 
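For context, a minimal sketch of how the worker's three-field result might be consumed from a process pool follows; it is not part of the patch. The module name `eval_accuracy`, the `pending` list, and the question IDs are assumptions for illustration, and real question IDs must exist in the LiveCodeBench release being loaded.

```python
# Sketch only: consuming (question_id, passed, reason) tuples from a pool.
# Assumes this module is importable as eval_accuracy and that `pending`
# holds (code, question_id) pairs; both are assumptions, not patch content.
from concurrent.futures import ProcessPoolExecutor, as_completed

from eval_accuracy import evaluate_livecodebench_worker  # assumed module name

pending = [("print(1)", "question_id_1"), ("print(2)", "question_id_2")]

results = {}
with ProcessPoolExecutor(max_workers=2) as pool:
    futures = {pool.submit(evaluate_livecodebench_worker, item): item[1] for item in pending}
    for fut in as_completed(futures):
        question_id, passed, reason = fut.result(timeout=30)
        results[question_id] = (passed, reason)

print(results)
```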
+ + Returns: + Tuple[str, bool, str]: (question_id, passed, detailed_reason) + """ code, question_id = args try: - return question_id, evaluate_livecodebench(code, question_id) - except Exception: - return question_id, False + passed, reason = evaluate_livecodebench_detailed(code, question_id) + return question_id, passed, reason + except Exception as e: + error_msg = f"Error evaluating {question_id}: {type(e).__name__}: {e}" + logger.warning(error_msg) + return question_id, False, error_msg # ============================================================================= @@ -456,6 +538,8 @@ def parse_healthbench_json(json_string: str) -> dict: return json.loads(json_cleaned) except json.JSONDecodeError as e: logger.warning(f"JSON decoding failed: {e}") + logger.warning(f"Raw LLM response (first 500 chars): {json_string[:500]}") + logger.warning(f"Cleaned response (first 500 chars): {json_cleaned[:500]}") return {"explanation": "Failed to parse response", "criteria_met": False} @@ -495,7 +579,8 @@ def grade_healthbench_with_llm( grader_api_key: Optional[str] = None, grader_model: str = "gpt-4o-mini", grader_base_url: str = "https://api.openai.com/v1", - grader_backend: str = "openai" + grader_backend: str = "openai", + max_workers: Optional[int] = None ) -> Tuple[float, str]: """Grade a HealthBench response using LLM-as-a-judge. @@ -507,6 +592,7 @@ def grade_healthbench_with_llm( grader_model: Model to use for grading grader_base_url: Base URL for API grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") + max_workers: Max concurrent requests for rubric grading (default: all rubrics in parallel) Returns: Tuple of (score, detailed_explanation) @@ -515,17 +601,24 @@ def grade_healthbench_with_llm( if grader_api_key is None: if grader_backend == "nvidia": grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") - if not grader_api_key: + # Check if it's an official NVIDIA URL that requires a key + if not grader_api_key and "nvidia.com" in grader_base_url.lower(): logger.warning( "No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") return 0.0, "Error: No NVIDIA NIM API key provided" else: grader_api_key = os.environ.get("OPENAI_API_KEY") - if not grader_api_key: + # Check if it's an official OpenAI URL that requires a key + if not grader_api_key and "api.openai.com" in grader_base_url.lower(): logger.warning( "No OpenAI API key found. 
Set OPENAI_API_KEY environment variable.") return 0.0, "Error: No OpenAI API key provided" + # For local servers, use a dummy key if none provided + if grader_api_key is None: + grader_api_key = "dummy-key-for-local-server" + logger.info(f"Using local server at {grader_base_url}, no API key required") + # Format conversation conversation_text = "" for msg in prompt_messages: @@ -534,17 +627,22 @@ def grade_healthbench_with_llm( conversation_text += f"{role}: {content}\n\n" conversation_text += f"assistant: {model_output}" - grading_responses = [] - + # Prepare all grading prompts + grading_tasks = [] for rubric_item in rubric_items: - # Create grading prompt grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( "", conversation_text ).replace("", str(rubric_item)) + grading_tasks.append((rubric_item, grading_prompt)) + # Submit all requests concurrently for server-side batching + grading_responses = [] + + def _grade_single_rubric(task_data): + """Helper to grade a single rubric item.""" + rubric_item, grading_prompt = task_data try: if grader_backend == "nvidia": - # Use NVIDIA NIM endpoint response_text = _call_nvidia_nim_api( api_key=grader_api_key, model=grader_model, @@ -554,7 +652,6 @@ def grade_healthbench_with_llm( max_tokens=1024 ) else: - # Use OpenAI endpoint response_text = _call_openai_api( api_key=grader_api_key, model=grader_model, @@ -563,16 +660,20 @@ def grade_healthbench_with_llm( temperature=0.0, max_tokens=1024 ) - - grading_result = parse_healthbench_json(response_text) - grading_responses.append(grading_result) - + return parse_healthbench_json(response_text) except Exception as e: logger.warning(f"Error grading rubric item: {e}") - grading_responses.append({ + return { "explanation": f"Error during grading: {e}", "criteria_met": False - }) + } + + # Use ThreadPoolExecutor to send all requests concurrently + # The server can batch these together for efficient processing + # Default to sending all rubric items in parallel if max_workers not specified + num_workers = max_workers if max_workers is not None else len(grading_tasks) + with ThreadPoolExecutor(max_workers=num_workers) as executor: + grading_responses = list(executor.map(_grade_single_rubric, grading_tasks)) # Calculate overall score score = calculate_healthbench_score(rubric_items, grading_responses) @@ -688,29 +789,31 @@ def evaluate_healthbench( parsed_output: Optional[str], row_data: pd.Series, grader_api_key: Optional[str] = None, - grader_backend: str = "openai", - grader_model: Optional[str] = None -) -> Tuple[bool, Optional[str]]: + grader_base_url: Optional[str] = None, + grader_model: Optional[str] = None, + max_workers: Optional[int] = None +) -> Tuple[float, Optional[str]]: """Evaluate HealthBench response using LLM grading. 
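The concurrency added above leans on `ThreadPoolExecutor.map` returning results in input order, which is what keeps each grading response aligned with its rubric item; the small sketch below demonstrates that property with a fake grader standing in for the real API call.

```python
# Self-contained sketch: calls run concurrently, but results come back in
# input order, so graded responses stay aligned with rubric_items.
import random
import time
from concurrent.futures import ThreadPoolExecutor

def fake_grade(task):
    rubric_id, _prompt = task
    time.sleep(random.uniform(0.01, 0.05))        # simulate variable API latency
    return {"rubric": rubric_id, "criteria_met": rubric_id % 2 == 0}

tasks = [(i, f"grade rubric {i}") for i in range(5)]
with ThreadPoolExecutor(max_workers=len(tasks)) as pool:
    responses = list(pool.map(fake_grade, tasks))

print([r["rubric"] for r in responses])           # [0, 1, 2, 3, 4], input order preserved
```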
Args: parsed_output: The model output text row_data: Full row data containing 'rubrics' and 'prompt' grader_api_key: Optional API key for grader - grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") + grader_base_url: Base URL for API (default: OpenAI API) grader_model: Optional model name override + max_workers: Max concurrent requests for rubric grading Returns: - Tuple of (is_correct, detailed_explanation) + Tuple of (score, detailed_explanation) where score is 0.0-1.0 """ if not parsed_output: - return False, "Empty output" + return 0.0, "Empty output" # Extract rubrics from row rubrics = row_data.get('rubrics', []) if not rubrics: logger.warning("No rubrics found in row data") - return False, "No rubrics available" + return 0.0, "No rubrics available" # Convert to RubricItem objects rubric_items = [RubricItem.from_dict(r) for r in rubrics] @@ -721,18 +824,21 @@ def evaluate_healthbench( # If prompt is a string, convert to message format prompt = [{"role": "user", "content": prompt}] - # Set default model based on backend - if grader_model is None: - if grader_backend == "nvidia": - grader_model = "deepseek-ai/deepseek-v3.1-terminus" - else: - grader_model = "gpt-4o-mini" + # Set default base URL if not provided + if grader_base_url is None: + grader_base_url = "https://api.openai.com/v1" - # Set base URL based on backend - if grader_backend == "nvidia": - grader_base_url = "https://integrate.api.nvidia.com/v1/chat/completions" + # Auto-detect backend based on URL + if "nvidia.com" in grader_base_url.lower(): + grader_backend = "nvidia" + # Set default model for NVIDIA if not specified + if grader_model is None: + grader_model = "deepseek-ai/deepseek-v3.1-terminus" else: - grader_base_url = "https://api.openai.com/v1" + grader_backend = "openai" + # Set default model for OpenAI if not specified + if grader_model is None: + grader_model = "gpt-4o-mini" # Grade using LLM score, explanation = grade_healthbench_with_llm( @@ -742,13 +848,13 @@ def evaluate_healthbench( grader_api_key=grader_api_key, grader_model=grader_model, grader_base_url=grader_base_url, - grader_backend=grader_backend + grader_backend=grader_backend, + max_workers=max_workers ) - # Consider "correct" if score >= 0.7 (70%) - is_correct = score >= 0.7 - - return is_correct, f"Score: {score:.2%}\n\n{explanation}" + # Return the score (0.0 to 1.0) and detailed explanation + # Note: score is returned as-is, not converted to binary pass/fail + return score, f"Score: {score:.2%}\n\n{explanation}" # ============================================================================= @@ -839,6 +945,10 @@ def process_livecodebench_parallel( if not work_items: return 0, 0 + # Ensure evaluation_details column exists + if 'evaluation_details' not in df.columns: + df['evaluation_details'] = None + # Process in parallel max_workers = min(multiprocessing.cpu_count(), len(work_items)) logger.info( @@ -858,19 +968,222 @@ def process_livecodebench_parallel( idx = future_to_idx[future] try: - question_id, is_correct = future.result(timeout=30) + question_id, is_correct, detailed_reason = future.result(timeout=30) df.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + df.at[idx, 'evaluation_details'] = detailed_reason total_evaluated += 1 if is_correct: correct_count += 1 except Exception as e: logger.error(f"Error evaluating row {idx}: {e}") df.at[idx, 'prompt_accuracy'] = 0.0 + df.at[idx, 'evaluation_details'] = f"Error: {e}" total_evaluated += 1 return correct_count, total_evaluated +def 
evaluate_healthbench_batch( + df: pd.DataFrame, + group_indices: pd.Index, + grader_api_key: Optional[str] = None, + grader_base_url: Optional[str] = None, + grader_model: Optional[str] = None, + max_workers: Optional[int] = None +) -> Dict[int, Tuple[float, str]]: + """Evaluate all HealthBench rows with batched rubric grading across all rows. + + Args: + df: DataFrame containing the data + group_indices: Indices of rows to evaluate + grader_api_key: Optional API key for grader + grader_base_url: Base URL for API + grader_model: Model name + max_workers: Max concurrent requests + + Returns: + Dictionary mapping row index to (score, explanation) tuple + """ + # Set default base URL if not provided + if grader_base_url is None: + grader_base_url = "https://api.openai.com/v1" + + # Auto-detect backend based on URL + if "nvidia.com" in grader_base_url.lower(): + grader_backend = "nvidia" + if grader_model is None: + grader_model = "deepseek-ai/deepseek-v3.1-terminus" + else: + grader_backend = "openai" + if grader_model is None: + grader_model = "gpt-4o-mini" + + # Handle API key + if grader_api_key is None: + if grader_backend == "nvidia": + grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") + if not grader_api_key and "nvidia.com" in grader_base_url.lower(): + logger.warning("No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") + return {idx: (0.0, "Error: No NVIDIA NIM API key provided") for idx in group_indices} + else: + grader_api_key = os.environ.get("OPENAI_API_KEY") + if not grader_api_key and "api.openai.com" in grader_base_url.lower(): + logger.warning("No OpenAI API key found. Set OPENAI_API_KEY environment variable.") + return {idx: (0.0, "Error: No OpenAI API key provided") for idx in group_indices} + + if grader_api_key is None: + grader_api_key = "dummy-key-for-local-server" + logger.info(f"Using local server at {grader_base_url}, no API key required") + + # Prepare all grading tasks for all rows + all_tasks = [] + row_rubric_map = {} # Maps task_id to (row_idx, rubric_idx) + task_id = 0 + + for idx in group_indices: + row = df.loc[idx] + extracted = row.get('extracted_answer') + + if extracted is None or pd.isna(extracted): + row_rubric_map[f"row_{idx}_skip"] = (idx, None) + continue + + # Extract rubrics and prompt + rubrics = row.get('rubrics', []) + if not rubrics: + logger.warning(f"No rubrics found for row {idx}") + row_rubric_map[f"row_{idx}_skip"] = (idx, None) + continue + + rubric_items = [RubricItem.from_dict(r) for r in rubrics] + prompt = row.get('prompt', []) + if isinstance(prompt, str): + prompt = [{"role": "user", "content": prompt}] + + # Format conversation + conversation_text = "" + for msg in prompt: + role = msg.get("role", "unknown") + content = msg.get("content", "") + conversation_text += f"{role}: {content}\n\n" + conversation_text += f"assistant: {extracted}" + + # Create grading tasks for all rubrics in this row + for rubric_idx, rubric_item in enumerate(rubric_items): + grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( + "", conversation_text + ).replace("", str(rubric_item)) + + all_tasks.append({ + 'task_id': task_id, + 'prompt': grading_prompt, + 'backend': grader_backend + }) + row_rubric_map[task_id] = (idx, rubric_idx, rubric_item) + task_id += 1 + + if not all_tasks: + logger.warning("No grading tasks to process") + return {} + + logger.info(f"Batching {len(all_tasks)} rubric grading requests across {len(group_indices)} rows") + + # Define grading function + def _grade_single_task(task): + """Grade a single 
rubric item.""" + try: + if task['backend'] == "nvidia": + response_text = _call_nvidia_nim_api( + api_key=grader_api_key, + model=grader_model, + messages=[{"role": "user", "content": task['prompt']}], + base_url=grader_base_url, + temperature=0.0, + max_tokens=1024 + ) + else: + response_text = _call_openai_api( + api_key=grader_api_key, + model=grader_model, + messages=[{"role": "user", "content": task['prompt']}], + base_url=grader_base_url, + temperature=0.0, + max_tokens=1024 + ) + return task['task_id'], parse_healthbench_json(response_text) + except Exception as e: + logger.warning(f"Error grading task {task['task_id']}: {e}") + return task['task_id'], { + "explanation": f"Error during grading: {e}", + "criteria_met": False + } + + # Send all requests concurrently for server-side batching + num_workers = max_workers if max_workers is not None else len(all_tasks) + grading_results = {} + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = {executor.submit(_grade_single_task, task): task['task_id'] for task in all_tasks} + + for future in tqdm(as_completed(futures), total=len(futures), desc="Grading HealthBench (batched)"): + try: + task_id, result = future.result(timeout=60) + grading_results[task_id] = result + except Exception as e: + task_id = futures[future] + logger.error(f"Error processing task {task_id}: {e}") + grading_results[task_id] = { + "explanation": f"Error during grading: {e}", + "criteria_met": False + } + + # Reconstruct results per row + row_results = {} + rows_rubrics = {} # Group results by row: {row_idx: {rubric_idx: (rubric_item, grading_result)}} + + for task_id, grading_result in grading_results.items(): + if task_id not in row_rubric_map: + continue + + row_idx, rubric_idx, rubric_item = row_rubric_map[task_id] + + if row_idx not in rows_rubrics: + rows_rubrics[row_idx] = {} + + rows_rubrics[row_idx][rubric_idx] = (rubric_item, grading_result) + + # Calculate scores for each row + for row_idx, rubric_data in rows_rubrics.items(): + # Sort by rubric_idx to maintain correct order + sorted_rubrics = sorted(rubric_data.items(), key=lambda x: x[0]) + rubric_items = [item for _, (item, _) in sorted_rubrics] + grading_responses = [response for _, (_, response) in sorted_rubrics] + + # Calculate overall score + score = calculate_healthbench_score(rubric_items, grading_responses) + + # Create detailed explanation + explanations = [] + for rubric_item, response in zip(rubric_items, grading_responses): + met = response.get("criteria_met", False) + explanation = response.get("explanation", "No explanation") + explanations.append( + f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" + ) + + detailed_explanation = f"Score: {score:.2%}\n\n" + "\n\n".join(explanations) + row_results[row_idx] = (score, detailed_explanation) + + # Handle skipped rows + for key, value in row_rubric_map.items(): + if isinstance(key, str) and key.startswith("row_") and key.endswith("_skip"): + row_idx = value[0] + if row_idx not in row_results: + row_results[row_idx] = (0.0, "Empty output or no rubrics") + + return row_results + + def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: """Process entire dataframe with optimized batch processing.""" validate_dataframe(df) @@ -878,6 +1191,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: df_output = df.copy() df_output['extracted_answer'] = None df_output['prompt_accuracy'] = 0.0 + df_output['evaluation_details'] = None # Add evaluation details column # Process by dataset for dataset_name, 
group_indices in tqdm(df_output.groupby('dataset').groups.items(), @@ -890,46 +1204,42 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: for idx in group_indices: row = df_output.loc[idx] raw_output = validate_text_input(row['model_output']) - df_output.at[idx, 'extracted_answer'] = evaluator['parse']( - raw_output) + extracted = evaluator['parse'](raw_output) + df_output.at[idx, 'extracted_answer'] = extracted + + # Set initial evaluation details for rows without extracted answers + if extracted is None or pd.isna(extracted): + df_output.at[idx, 'evaluation_details'] = "No code extracted from model output" # Evaluate answers if 'livecodebench' in dataset_name.lower(): correct_count, total_evaluated = process_livecodebench_parallel( df_output, group_indices) elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - correct_count = 0 + # HealthBench evaluation with LLM grading - batched across all rows + total_score = 0.0 total_evaluated = 0 - - for idx in tqdm(group_indices, desc=f"Evaluating {dataset_name}"): - row = df_output.loc[idx] - extracted = row['extracted_answer'] - - if extracted is not None: - try: - # HealthBench needs full row data for rubrics and - # prompts - is_correct, explanation = evaluator['evaluate']( - extracted, - row, - grader_backend=LLM_JUDGE_BACKEND, - grader_model=LLM_JUDGE_MODEL - ) - df_output.at[idx, - 'prompt_accuracy'] = 100.0 if is_correct else 0.0 - # Store explanation in a new column if needed - if 'evaluation_details' not in df_output.columns: - df_output['evaluation_details'] = None - df_output.at[idx, 'evaluation_details'] = explanation - total_evaluated += 1 - if is_correct: - correct_count += 1 - except Exception as e: - logger.error( - f"Error evaluating HealthBench row {idx}: {e}") - df_output.at[idx, 'prompt_accuracy'] = 0.0 - total_evaluated += 1 + + # Process all rows with batched grading + results = evaluate_healthbench_batch( + df_output, + group_indices, + grader_api_key=LLM_JUDGE_API_KEY, + grader_base_url=LLM_JUDGE_BASE_URL, + grader_model=LLM_JUDGE_MODEL, + max_workers=LLM_JUDGE_MAX_WORKERS + ) + + # Store results + for idx, (score, explanation) in results.items(): + # Store score as percentage (0-100) + df_output.at[idx, 'prompt_accuracy'] = score * 100.0 + # Store explanation in a new column if needed + if 'evaluation_details' not in df_output.columns: + df_output['evaluation_details'] = None + df_output.at[idx, 'evaluation_details'] = explanation + total_evaluated += 1 + total_score += score else: # Sequential evaluation for other datasets correct_count = 0 @@ -950,9 +1260,16 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # Log results if total_evaluated > 0: - accuracy = correct_count / total_evaluated * 100 - logger.info( - f"{dataset_name} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + if 'healthbench' in dataset_name.lower(): + # For HealthBench, report average score + avg_score = total_score / total_evaluated * 100 + logger.info( + f"{dataset_name} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + else: + # For other datasets, report accuracy + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") return df_output @@ -983,10 +1300,22 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, # tok_model_output_len is now a required column mean_output_len = float(df_evaluated['tok_model_output_len'].mean()) + # 
Check if this is HealthBench dataset + is_healthbench = False + if 'dataset' in df_evaluated.columns: + datasets = df_evaluated['dataset'].unique() + is_healthbench = any('healthbench' in str(ds).lower() for ds in datasets) + + # Use appropriate metric name + if is_healthbench: + metric_key = 'healthbench_score' + else: + metric_key = 'exact_match' + results = { # 'evaluated': int(evaluated), # 'correct': int(correct), - 'exact_match': float(accuracy), + metric_key: float(accuracy), 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } @@ -1085,19 +1414,27 @@ def main(): "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") parser.add_argument("--verbose", action="store_true", help="Verbose logging") - parser.add_argument("--llm-judge-backend", - choices=["openai", "nvidia"], - default="openai", - help="Backend for HealthBench LLM judge (default: openai)") + parser.add_argument("--llm-judge-base-url", + help="Base URL for HealthBench LLM judge API (default: https://api.openai.com/v1). " + "For local servers like SGLang, use http://localhost:8000/v1") parser.add_argument("--llm-judge", - help="Model for HealthBench LLM judge (default: gpt-4o-mini for openai, deepseek-ai/deepseek-v3.1-terminus for nvidia)") + help="Model for HealthBench LLM judge (default: gpt-4o-mini for OpenAI-compatible APIs, " + "deepseek-ai/deepseek-v3.1-terminus for NVIDIA)") + parser.add_argument("--llm-judge-api-key", + help="API key for HealthBench LLM judge (default: read from OPENAI_API_KEY or NVIDIA_NIM_API_KEY env var). " + "Not required for local servers.") + parser.add_argument("--llm-judge-max-workers", type=int, + help="Max concurrent requests per row for HealthBench rubric grading (default: all rubrics in parallel). " + "Useful for rate limiting or controlling server load.") args = parser.parse_args() # Set global configuration for HealthBench LLM judge - global LLM_JUDGE_BACKEND, LLM_JUDGE_MODEL - LLM_JUDGE_BACKEND = args.llm_judge_backend + global LLM_JUDGE_BASE_URL, LLM_JUDGE_MODEL, LLM_JUDGE_API_KEY, LLM_JUDGE_MAX_WORKERS + LLM_JUDGE_BASE_URL = args.llm_judge_base_url LLM_JUDGE_MODEL = args.llm_judge + LLM_JUDGE_API_KEY = args.llm_judge_api_key + LLM_JUDGE_MAX_WORKERS = args.llm_judge_max_workers if args.verbose: logging.getLogger().setLevel(logging.DEBUG) diff --git a/language/gpt-oss/golden-datasets/fetch_aime.py b/language/gpt-oss/golden-datasets/fetch_aime.py index d877854038..bbe06372ab 100644 --- a/language/gpt-oss/golden-datasets/fetch_aime.py +++ b/language/gpt-oss/golden-datasets/fetch_aime.py @@ -1,13 +1,21 @@ from datasets import load_dataset import pandas as pd +import argparse +import os -df_1 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() -df_2 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_folder", type=str, default="dataset") +args = parser.parse_args() -df = pd.concat([df_1, df_2], ignore_index=True) +df = load_dataset("di-zhang-fdu/AIME_1983_2024")['train'].to_pandas() -df.rename(columns={'answer': 'ground_truth'}, inplace=True) -df['dataset'] = 'aime2025' +df_1_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() +df_2_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() +# df_aime2025 = pd.concat([df_1_aime2025, df_2_aime2025], ignore_index=True) +# df_aime2025.rename(columns={'answer': 'ground_truth'}, inplace=True) -breakpoint() -df.to_pickle('aime2025.pkl') 
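Across the reworked fetch scripts, the common target shape is a DataFrame with `question`, `ground_truth`, and `dataset` columns pickled into the dataset folder; the toy example below (invented rows, hypothetical `toy_math` tag) shows that convention in isolation.

```python
# Toy illustration of the output shape the fetch_* scripts converge on.
import os
import pandas as pd

dataset_folder = "dataset"  # matches the scripts' --dataset_folder default
os.makedirs(dataset_folder, exist_ok=True)

df = pd.DataFrame({"Question": ["What is 1 + 1?"], "Answer": ["2"]})
df = df.rename(columns={"Question": "question", "Answer": "ground_truth"})
df["dataset"] = "toy_math"

df.to_pickle(os.path.join(dataset_folder, "toy_math.pkl"))
print(df.columns.tolist())  # ['question', 'ground_truth', 'dataset']
```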
+df.rename(columns={'Answer': 'ground_truth', 'Question': 'question'}, inplace=True) +df.drop(columns=['Year', 'ID', 'Problem Number', 'Part'], inplace=True) +df['dataset'] = 'aime1983' + +df.to_pickle(os.path.join(args.dataset_folder, 'aime1983-2024.pkl')) diff --git a/language/gpt-oss/golden-datasets/fetch_gpqa.py b/language/gpt-oss/golden-datasets/fetch_gpqa.py index 74fbdca38d..79ba3525f4 100644 --- a/language/gpt-oss/golden-datasets/fetch_gpqa.py +++ b/language/gpt-oss/golden-datasets/fetch_gpqa.py @@ -1,13 +1,20 @@ from datasets import load_dataset import pandas as pd import random +import argparse +import os + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_folder", type=str, default="dataset") +args = parser.parse_args() + random.seed(42) # Login using e.g. `huggingface-cli login` to access this dataset ds_diamond = load_dataset("Idavidrein/gpqa", "gpqa_diamond") -ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") -ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") -ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") +# ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") +# ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") +# ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") df_diamond = ds_diamond['train'].to_pandas() # df_experts = ds_experts['train'].to_pandas() @@ -22,7 +29,7 @@ 'High-level domain', 'Incorrect Answer 1', 'Incorrect Answer 2', - 'Incorrect Answer 3']] + 'Incorrect Answer 3']].copy() for idx, row in df.iterrows(): options = [str(row[col]) for col in ['Incorrect Answer 1', @@ -45,5 +52,4 @@ inplace=True) df['dataset'] = 'gpqa' -breakpoint() -df.to_pickle('gpqa.pkl') +df.to_pickle(os.path.join(args.dataset_folder, 'gpqa_diamond.pkl')) diff --git a/language/gpt-oss/golden-datasets/fetch_healthbench.py b/language/gpt-oss/golden-datasets/fetch_healthbench.py index db1f7f0fae..861817ae6f 100644 --- a/language/gpt-oss/golden-datasets/fetch_healthbench.py +++ b/language/gpt-oss/golden-datasets/fetch_healthbench.py @@ -1,14 +1,17 @@ import os -import sys import glob import json import pandas as pd +import argparse +import os -# Replace this with your dataset folder path -dataset_folder = sys.argv[1] +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_folder", type=str, default="dataset") +parser.add_argument("--healthbench_folder", type=str, default="healthbench") +args = parser.parse_args() # Find all jsonl files (update the path and pattern to match actual data) -files = glob.glob(os.path.join(dataset_folder, "*.jsonl")) +files = glob.glob(os.path.join(args.healthbench_folder, "*.jsonl")) all_rows = [] all_columns = set() @@ -33,5 +36,4 @@ # Create DataFrame df = pd.DataFrame(all_rows, columns=all_columns) df['dataset'] = 'healthbench' -breakpoint() -df.to_pickle('healthbench.pkl') +df.to_pickle(os.path.join(args.dataset_folder, 'healthbench.pkl')) diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu.py b/language/gpt-oss/golden-datasets/fetch_mmlu.py index 82839fffb7..30d3d32132 100644 --- a/language/gpt-oss/golden-datasets/fetch_mmlu.py +++ b/language/gpt-oss/golden-datasets/fetch_mmlu.py @@ -1,11 +1,15 @@ import pandas as pd from datasets import load_dataset +import argparse +import os + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_folder", type=str, default="dataset") +args = parser.parse_args() ds = load_dataset("cais/mmlu", "all") df = ds['auxiliary_train'].to_pandas() -breakpoint() - for col in ['subject']: df.drop(col, axis=1, inplace=True) @@ -21,5 
+25,4 @@ df.loc[row.Index, 'question'] = question df.loc[row.Index, 'ground_truth'] = f"{chr(65+row.answer)}" -breakpoint() -df.to_pickle('mmlu.pkl') +df.to_pickle(os.path.join(args.dataset_folder, 'mmlu.pkl')) diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py b/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py deleted file mode 100644 index d3bd246914..0000000000 --- a/language/gpt-oss/golden-datasets/fetch_mmlu_pro.py +++ /dev/null @@ -1,29 +0,0 @@ -import pandas as pd -from datasets import load_dataset - -ds = load_dataset("TIGER-Lab/MMLU-Pro") -df_test = ds['test'].to_pandas() -df_validation = ds['validation'].to_pandas() - -df = pd.concat([df_validation, df_test], ignore_index=True) - -for col in ['answer_index', 'cot_content', 'category', 'src']: - df.drop(col, axis=1, inplace=True) - -df.rename( - columns={ - 'question': 'base_question', - 'answer': 'ground_truth'}, - inplace=True) -df['dataset'] = 'mmlu_pro' - -for row in df.itertuples(): - base_question = row.base_question - options = row.options - question = f"{base_question}" - for idx, option in enumerate(options): - question += f"\n{chr(65+idx)}) {option}" - df.loc[row.Index, 'question'] = question - -breakpoint() -df.to_pickle('mmlu_pro.pkl') diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 58a4b317b0..aabc0481d4 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -310,12 +310,12 @@ def process_row(args): Worker function to process a single row from the dataframe. Args: - args: tuple of (index, row, dataset_function_map) + args: tuple of (index, row, dataset_function_map, reasoning_effort) Returns: tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) """ - index, row, dataset_function_map = args + index, row, dataset_function_map, reasoning_effort = args dataset_name = row["dataset"] if dataset_name == "healthbench": @@ -327,7 +327,7 @@ def process_row(args): # Get the appropriate function based on dataset type if dataset_name in dataset_function_map: create_prompt_func = dataset_function_map[dataset_name] - convo, tokens = create_prompt_func(user_query) + convo, tokens = create_prompt_func(user_query, reasoning_effort) return (index, convo, tokens, dataset_name) else: error_msg = f"Unknown dataset '{dataset_name}' at index {index}" @@ -383,6 +383,7 @@ def process_row(args): # Create mapping from dataset names to prompt creation functions dataset_function_map = { 'aime1983': create_aime1983_prompt, + 'aime2025': create_aime1983_prompt, 'gpqa': create_gpqa_prompt, 'livecodebench': create_livecodebench_prompt, 'math500': create_math500_prompt, @@ -392,7 +393,7 @@ def process_row(args): } # Prepare data for parallel processing - process_args = [(index, row, dataset_function_map) + process_args = [(index, row, dataset_function_map, reasoning_effort) for index, row in df.iterrows()] # Don't use more processes than we have rows @@ -421,13 +422,6 @@ def process_row(args): successful_count = 0 error_count = 0 - # Set reference columns to None - df['ref_accuracy'] = None - df['ref_extracted_answer'] = None - df['ref_output'] = None - df['tok_ref_output'] = None - df['tok_ref_output_len'] = None - # Set templated text input to None (if it exists) if 'templated_text_input' in df.columns: df['templated_text_input'] = None @@ -482,10 +476,8 @@ def process_row(args): # Verify input and output have identical column lists input_columns = list(pd.read_pickle(args.data_file).columns) output_columns = 
list(df.columns) - if input_columns != output_columns: - print(f"WARNING: Column lists differ!") - print(f"Input columns: {input_columns}") - print(f"Output columns: {output_columns}") + print(f"Input columns: {input_columns}") + print(f"Output columns: {output_columns}") # Save the modified DataFrame as pickle print("Saving modified DataFrame to pickle...") @@ -498,4 +490,3 @@ def process_row(args): print(f" - Total processed: {successful_count + error_count} queries") print(f" - Modified DataFrame shape: {df.shape}") print(f" - Updated columns: tok_input, tok_input_len, text_input") - print(f" - Set to None: ref_accuracy, ref_extracted_answer, ref_output, tok_ref_output, tok_ref_output_len, templated_text_input") diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index e9ca471cb4..f86ed9de3a 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -5,4 +5,6 @@ numba>=0.51.0 pooch>=1.0 scikit-learn>=0.20.0 soxr>=0.3.2 -absl-py>=2.3.1 \ No newline at end of file +absl-py>=2.3.1 +lazy-loader>=0.1 +datasets>=2.0.0,<3.0.0 \ No newline at end of file From 534390c17ac6ce96fe9e7fc881b6210d4af67f5c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Oct 2025 07:04:19 +0000 Subject: [PATCH 058/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 161 ++++++++++-------- .../gpt-oss/golden-datasets/fetch_aime.py | 14 +- 2 files changed, 105 insertions(+), 70 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 2388ba2893..7403e2835e 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -182,7 +182,7 @@ def parse_aime_answer(text: str) -> Optional[int]: def parse_code(text: str) -> Optional[str]: """Parse code from ```python or plain ``` code block. - + Priority: 1. Last ```python block 2. Last plain ``` block (if it looks like Python code) @@ -195,7 +195,7 @@ def parse_code(text: str) -> Optional[str]: python_matches = list(re.finditer(r"```python(.*?)```", text, re.DOTALL)) if python_matches: return python_matches[-1].group(1).strip() - + # Fall back to plain ``` blocks plain_matches = list(re.finditer(r"```(.*?)```", text, re.DOTALL)) if plain_matches: @@ -204,7 +204,7 @@ def parse_code(text: str) -> Optional[str]: # Remove language tag if present (e.g., ```python\n or ```py\n) code = re.sub(r'^(?:python|py)\s*\n', '', code, flags=re.IGNORECASE) return code - + return None @@ -322,7 +322,7 @@ def load_lcb_benchmark() -> Dict[str, Any]: def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: """Evaluate LiveCodeBench code generation. - + Returns: bool: True if all tests passed, False otherwise """ @@ -330,9 +330,10 @@ def evaluate_livecodebench(code: Optional[str], question_id: str) -> bool: return result -def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tuple[bool, str]: +def evaluate_livecodebench_detailed( + code: Optional[str], question_id: str) -> Tuple[bool, str]: """Evaluate LiveCodeBench code generation with detailed results. 
- + Returns: Tuple[bool, str]: (passed, detailed_reason) - passed: True if all tests passed, False otherwise @@ -343,7 +344,7 @@ def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tu lcb_dir = os.path.abspath(os.path.join( os.path.dirname(__file__), "submodules", "LiveCodeBench")) - + try: benchmark_map = load_lcb_benchmark() except Exception as e: @@ -388,7 +389,7 @@ def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tu graded = extract_instance_results(instance_results) passed = graded and graded[0] and graded[0][0] - + # Try to extract detailed results detailed_reason = "" try: @@ -409,13 +410,13 @@ def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tu detailed_reason = f"Status: {result_info.status}" except Exception: pass - + if not detailed_reason: if passed: detailed_reason = "All tests passed" else: detailed_reason = "Failed one or more test cases" - + return passed, detailed_reason except Exception as e: @@ -426,9 +427,10 @@ def evaluate_livecodebench_detailed(code: Optional[str], question_id: str) -> Tu os.environ.pop('TQDM_DISABLE', None) -def evaluate_livecodebench_worker(args: Tuple[str, str]) -> Tuple[str, bool, str]: +def evaluate_livecodebench_worker( + args: Tuple[str, str]) -> Tuple[str, bool, str]: """Worker function for parallel LiveCodeBench evaluation. - + Returns: Tuple[str, bool, str]: (question_id, passed, detailed_reason) """ @@ -538,8 +540,10 @@ def parse_healthbench_json(json_string: str) -> dict: return json.loads(json_cleaned) except json.JSONDecodeError as e: logger.warning(f"JSON decoding failed: {e}") - logger.warning(f"Raw LLM response (first 500 chars): {json_string[:500]}") - logger.warning(f"Cleaned response (first 500 chars): {json_cleaned[:500]}") + logger.warning( + f"Raw LLM response (first 500 chars): {json_string[:500]}") + logger.warning( + f"Cleaned response (first 500 chars): {json_cleaned[:500]}") return {"explanation": "Failed to parse response", "criteria_met": False} @@ -617,7 +621,8 @@ def grade_healthbench_with_llm( # For local servers, use a dummy key if none provided if grader_api_key is None: grader_api_key = "dummy-key-for-local-server" - logger.info(f"Using local server at {grader_base_url}, no API key required") + logger.info( + f"Using local server at {grader_base_url}, no API key required") # Format conversation conversation_text = "" @@ -637,7 +642,7 @@ def grade_healthbench_with_llm( # Submit all requests concurrently for server-side batching grading_responses = [] - + def _grade_single_rubric(task_data): """Helper to grade a single rubric item.""" rubric_item, grading_prompt = task_data @@ -667,13 +672,18 @@ def _grade_single_rubric(task_data): "explanation": f"Error during grading: {e}", "criteria_met": False } - + # Use ThreadPoolExecutor to send all requests concurrently # The server can batch these together for efficient processing - # Default to sending all rubric items in parallel if max_workers not specified - num_workers = max_workers if max_workers is not None else len(grading_tasks) + # Default to sending all rubric items in parallel if max_workers not + # specified + num_workers = max_workers if max_workers is not None else len( + grading_tasks) with ThreadPoolExecutor(max_workers=num_workers) as executor: - grading_responses = list(executor.map(_grade_single_rubric, grading_tasks)) + grading_responses = list( + executor.map( + _grade_single_rubric, + grading_tasks)) # Calculate overall score score = 
calculate_healthbench_score(rubric_items, grading_responses) @@ -968,7 +978,8 @@ def process_livecodebench_parallel( idx = future_to_idx[future] try: - question_id, is_correct, detailed_reason = future.result(timeout=30) + question_id, is_correct, detailed_reason = future.result( + timeout=30) df.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 df.at[idx, 'evaluation_details'] = detailed_reason total_evaluated += 1 @@ -992,7 +1003,7 @@ def evaluate_healthbench_batch( max_workers: Optional[int] = None ) -> Dict[int, Tuple[float, str]]: """Evaluate all HealthBench rows with batched rubric grading across all rows. - + Args: df: DataFrame containing the data group_indices: Indices of rows to evaluate @@ -1000,14 +1011,14 @@ def evaluate_healthbench_batch( grader_base_url: Base URL for API grader_model: Model name max_workers: Max concurrent requests - + Returns: Dictionary mapping row index to (score, explanation) tuple """ # Set default base URL if not provided if grader_base_url is None: grader_base_url = "https://api.openai.com/v1" - + # Auto-detect backend based on URL if "nvidia.com" in grader_base_url.lower(): grader_backend = "nvidia" @@ -1017,49 +1028,54 @@ def evaluate_healthbench_batch( grader_backend = "openai" if grader_model is None: grader_model = "gpt-4o-mini" - + # Handle API key if grader_api_key is None: if grader_backend == "nvidia": grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") if not grader_api_key and "nvidia.com" in grader_base_url.lower(): - logger.warning("No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") - return {idx: (0.0, "Error: No NVIDIA NIM API key provided") for idx in group_indices} + logger.warning( + "No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") + return {idx: (0.0, "Error: No NVIDIA NIM API key provided") + for idx in group_indices} else: grader_api_key = os.environ.get("OPENAI_API_KEY") if not grader_api_key and "api.openai.com" in grader_base_url.lower(): - logger.warning("No OpenAI API key found. Set OPENAI_API_KEY environment variable.") - return {idx: (0.0, "Error: No OpenAI API key provided") for idx in group_indices} - + logger.warning( + "No OpenAI API key found. 
Set OPENAI_API_KEY environment variable.") + return {idx: (0.0, "Error: No OpenAI API key provided") + for idx in group_indices} + if grader_api_key is None: grader_api_key = "dummy-key-for-local-server" - logger.info(f"Using local server at {grader_base_url}, no API key required") - + logger.info( + f"Using local server at {grader_base_url}, no API key required") + # Prepare all grading tasks for all rows all_tasks = [] row_rubric_map = {} # Maps task_id to (row_idx, rubric_idx) task_id = 0 - + for idx in group_indices: row = df.loc[idx] extracted = row.get('extracted_answer') - + if extracted is None or pd.isna(extracted): row_rubric_map[f"row_{idx}_skip"] = (idx, None) continue - + # Extract rubrics and prompt rubrics = row.get('rubrics', []) if not rubrics: logger.warning(f"No rubrics found for row {idx}") row_rubric_map[f"row_{idx}_skip"] = (idx, None) continue - + rubric_items = [RubricItem.from_dict(r) for r in rubrics] prompt = row.get('prompt', []) if isinstance(prompt, str): prompt = [{"role": "user", "content": prompt}] - + # Format conversation conversation_text = "" for msg in prompt: @@ -1067,13 +1083,13 @@ def evaluate_healthbench_batch( content = msg.get("content", "") conversation_text += f"{role}: {content}\n\n" conversation_text += f"assistant: {extracted}" - + # Create grading tasks for all rubrics in this row for rubric_idx, rubric_item in enumerate(rubric_items): grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( "", conversation_text ).replace("", str(rubric_item)) - + all_tasks.append({ 'task_id': task_id, 'prompt': grading_prompt, @@ -1081,13 +1097,14 @@ def evaluate_healthbench_batch( }) row_rubric_map[task_id] = (idx, rubric_idx, rubric_item) task_id += 1 - + if not all_tasks: logger.warning("No grading tasks to process") return {} - - logger.info(f"Batching {len(all_tasks)} rubric grading requests across {len(group_indices)} rows") - + + logger.info( + f"Batching {len(all_tasks)} rubric grading requests across {len(group_indices)} rows") + # Define grading function def _grade_single_task(task): """Grade a single rubric item.""" @@ -1117,15 +1134,19 @@ def _grade_single_task(task): "explanation": f"Error during grading: {e}", "criteria_met": False } - + # Send all requests concurrently for server-side batching num_workers = max_workers if max_workers is not None else len(all_tasks) grading_results = {} - + with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = {executor.submit(_grade_single_task, task): task['task_id'] for task in all_tasks} - - for future in tqdm(as_completed(futures), total=len(futures), desc="Grading HealthBench (batched)"): + futures = { + executor.submit( + _grade_single_task, + task): task['task_id'] for task in all_tasks} + + for future in tqdm(as_completed(futures), total=len( + futures), desc="Grading HealthBench (batched)"): try: task_id, result = future.result(timeout=60) grading_results[task_id] = result @@ -1136,32 +1157,34 @@ def _grade_single_task(task): "explanation": f"Error during grading: {e}", "criteria_met": False } - + # Reconstruct results per row row_results = {} - rows_rubrics = {} # Group results by row: {row_idx: {rubric_idx: (rubric_item, grading_result)}} - + # Group results by row: {row_idx: {rubric_idx: (rubric_item, + # grading_result)}} + rows_rubrics = {} + for task_id, grading_result in grading_results.items(): if task_id not in row_rubric_map: continue - + row_idx, rubric_idx, rubric_item = row_rubric_map[task_id] - + if row_idx not in rows_rubrics: rows_rubrics[row_idx] = {} - + 
rows_rubrics[row_idx][rubric_idx] = (rubric_item, grading_result) - + # Calculate scores for each row for row_idx, rubric_data in rows_rubrics.items(): # Sort by rubric_idx to maintain correct order sorted_rubrics = sorted(rubric_data.items(), key=lambda x: x[0]) rubric_items = [item for _, (item, _) in sorted_rubrics] grading_responses = [response for _, (_, response) in sorted_rubrics] - + # Calculate overall score score = calculate_healthbench_score(rubric_items, grading_responses) - + # Create detailed explanation explanations = [] for rubric_item, response in zip(rubric_items, grading_responses): @@ -1170,17 +1193,19 @@ def _grade_single_task(task): explanations.append( f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" ) - - detailed_explanation = f"Score: {score:.2%}\n\n" + "\n\n".join(explanations) + + detailed_explanation = f"Score: {score:.2%}\n\n" + \ + "\n\n".join(explanations) row_results[row_idx] = (score, detailed_explanation) - + # Handle skipped rows for key, value in row_rubric_map.items(): - if isinstance(key, str) and key.startswith("row_") and key.endswith("_skip"): + if isinstance(key, str) and key.startswith( + "row_") and key.endswith("_skip"): row_idx = value[0] if row_idx not in row_results: row_results[row_idx] = (0.0, "Empty output or no rubrics") - + return row_results @@ -1206,10 +1231,11 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: raw_output = validate_text_input(row['model_output']) extracted = evaluator['parse'](raw_output) df_output.at[idx, 'extracted_answer'] = extracted - + # Set initial evaluation details for rows without extracted answers if extracted is None or pd.isna(extracted): - df_output.at[idx, 'evaluation_details'] = "No code extracted from model output" + df_output.at[idx, + 'evaluation_details'] = "No code extracted from model output" # Evaluate answers if 'livecodebench' in dataset_name.lower(): @@ -1219,7 +1245,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # HealthBench evaluation with LLM grading - batched across all rows total_score = 0.0 total_evaluated = 0 - + # Process all rows with batched grading results = evaluate_healthbench_batch( df_output, @@ -1229,7 +1255,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: grader_model=LLM_JUDGE_MODEL, max_workers=LLM_JUDGE_MAX_WORKERS ) - + # Store results for idx, (score, explanation) in results.items(): # Store score as percentage (0-100) @@ -1304,7 +1330,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, is_healthbench = False if 'dataset' in df_evaluated.columns: datasets = df_evaluated['dataset'].unique() - is_healthbench = any('healthbench' in str(ds).lower() for ds in datasets) + is_healthbench = any('healthbench' in str(ds).lower() + for ds in datasets) # Use appropriate metric name if is_healthbench: diff --git a/language/gpt-oss/golden-datasets/fetch_aime.py b/language/gpt-oss/golden-datasets/fetch_aime.py index bbe06372ab..757a0e303a 100644 --- a/language/gpt-oss/golden-datasets/fetch_aime.py +++ b/language/gpt-oss/golden-datasets/fetch_aime.py @@ -9,12 +9,20 @@ df = load_dataset("di-zhang-fdu/AIME_1983_2024")['train'].to_pandas() -df_1_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() -df_2_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() +df_1_aime2025 = load_dataset( + "opencompass/AIME2025", + "AIME2025-I")['test'].to_pandas() +df_2_aime2025 = load_dataset( + "opencompass/AIME2025", + "AIME2025-II")['test'].to_pandas() # df_aime2025 = 
pd.concat([df_1_aime2025, df_2_aime2025], ignore_index=True) # df_aime2025.rename(columns={'answer': 'ground_truth'}, inplace=True) -df.rename(columns={'Answer': 'ground_truth', 'Question': 'question'}, inplace=True) +df.rename( + columns={ + 'Answer': 'ground_truth', + 'Question': 'question'}, + inplace=True) df.drop(columns=['Year', 'ID', 'Problem Number', 'Part'], inplace=True) df['dataset'] = 'aime1983' From 78bf9718481d7e4e23ff849f4ed58f92dcb87c10 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 29 Oct 2025 07:04:41 +0000 Subject: [PATCH 059/165] add symlinks to gitmodules --- language/gpt-oss/submodules/LiveCodeBench | 1 + language/gpt-oss/submodules/prm800k | 1 + 2 files changed, 2 insertions(+) create mode 120000 language/gpt-oss/submodules/LiveCodeBench create mode 120000 language/gpt-oss/submodules/prm800k diff --git a/language/gpt-oss/submodules/LiveCodeBench b/language/gpt-oss/submodules/LiveCodeBench new file mode 120000 index 0000000000..d1e5c66592 --- /dev/null +++ b/language/gpt-oss/submodules/LiveCodeBench @@ -0,0 +1 @@ +../../deepseek-r1/submodules/LiveCodeBench \ No newline at end of file diff --git a/language/gpt-oss/submodules/prm800k b/language/gpt-oss/submodules/prm800k new file mode 120000 index 0000000000..1b078c3842 --- /dev/null +++ b/language/gpt-oss/submodules/prm800k @@ -0,0 +1 @@ +../../deepseek-r1/submodules/prm800k \ No newline at end of file From 115f49840a0def8c0ffb303ca372cd25391dc3cc Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 29 Oct 2025 07:05:14 +0000 Subject: [PATCH 060/165] add fetch_lcb.py --- language/gpt-oss/golden-datasets/fetch_lcb.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 language/gpt-oss/golden-datasets/fetch_lcb.py diff --git a/language/gpt-oss/golden-datasets/fetch_lcb.py b/language/gpt-oss/golden-datasets/fetch_lcb.py new file mode 100644 index 0000000000..9448791281 --- /dev/null +++ b/language/gpt-oss/golden-datasets/fetch_lcb.py @@ -0,0 +1,44 @@ +from datasets import load_dataset +import pickle +import os +import glob +import json +import argparse +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_folder", type=str, default="dataset") +parser.add_argument("--lcb_folder", type=str, default="lcb") +args = parser.parse_args() + +files = glob.glob(os.path.join(args.lcb_folder, "*.jsonl")) +all_rows = [] +all_columns = set() + +for file in files: + with open(file, "r", encoding="utf-8") as f: + for line in f: + row = json.loads(line) + all_columns.update(row.keys()) + all_rows.append(row) + +all_columns = list(all_columns) + +df = pd.DataFrame(all_rows, columns=all_columns) +df['dataset'] = 'livecodebench' +df.drop(columns=['private_test_cases', 'metadata', 'public_test_cases', 'contest_id', 'platform', 'difficulty', 'contest_date', 'question_title'], inplace=True) + +starter_prompt = """ +### Format: You will use the following starter code to write the solution to the problem and enclose your code within delimiters. 
+```python +<> +``` +""" + +for idx, row in df.iterrows(): + starter_code = row['starter_code'] + starter_prompt_filled = starter_prompt.replace('<>', starter_code) + df.loc[idx, 'question'] = df.loc[idx, 'question_content'] + starter_prompt_filled + +df.rename(columns={'question_id': 'ground_truth'}, inplace=True) +df.to_pickle(os.path.join(args.dataset_folder, 'lcb.pkl')) From 9571c11a30d6039297abf80a56588dd4515efb5f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Oct 2025 07:05:42 +0000 Subject: [PATCH 061/165] [Automated Commit] Format Codebase --- language/gpt-oss/golden-datasets/fetch_lcb.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/language/gpt-oss/golden-datasets/fetch_lcb.py b/language/gpt-oss/golden-datasets/fetch_lcb.py index 9448791281..755b33e8b5 100644 --- a/language/gpt-oss/golden-datasets/fetch_lcb.py +++ b/language/gpt-oss/golden-datasets/fetch_lcb.py @@ -26,7 +26,17 @@ df = pd.DataFrame(all_rows, columns=all_columns) df['dataset'] = 'livecodebench' -df.drop(columns=['private_test_cases', 'metadata', 'public_test_cases', 'contest_id', 'platform', 'difficulty', 'contest_date', 'question_title'], inplace=True) +df.drop( + columns=[ + 'private_test_cases', + 'metadata', + 'public_test_cases', + 'contest_id', + 'platform', + 'difficulty', + 'contest_date', + 'question_title'], + inplace=True) starter_prompt = """ ### Format: You will use the following starter code to write the solution to the problem and enclose your code within delimiters. @@ -37,8 +47,10 @@ for idx, row in df.iterrows(): starter_code = row['starter_code'] - starter_prompt_filled = starter_prompt.replace('<>', starter_code) - df.loc[idx, 'question'] = df.loc[idx, 'question_content'] + starter_prompt_filled + starter_prompt_filled = starter_prompt.replace( + '<>', starter_code) + df.loc[idx, 'question'] = df.loc[idx, + 'question_content'] + starter_prompt_filled df.rename(columns={'question_id': 'ground_truth'}, inplace=True) df.to_pickle(os.path.join(args.dataset_folder, 'lcb.pkl')) From f8244eeac8980491f2ee65acd9a2659366edcc6a Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 4 Nov 2025 21:00:38 +0000 Subject: [PATCH 062/165] updates --- language/gpt-oss/harmonize_inputs.py | 83 +++++++++++++++++++++++----- language/gpt-oss/plot.py | 77 ++++++++++++++++++-------- language/gpt-oss/setup_enroot.sh | 4 +- 3 files changed, 127 insertions(+), 37 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index aabc0481d4..4118368f74 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -281,7 +281,7 @@ def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): """ instructions = ( "Your task is to answer the following health-related query as a responsible assistant." - "Provide responses that are factually accurate, up-to-date, and consistent with current clinical guidelines. Adjust the detail, terminology, and tone based on the user’s apparent background (layperson or healthcare professional)." + "Provide responses that are factually accurate, up-to-date, and consistent with current clinical guidelines. Adjust the detail, terminology, and tone based on the user's apparent background (layperson or healthcare professional)." "If the evidence is limited or uncertain, clearly acknowledge this in your answer. When essential information is missing for a safe, thorough response, ask clarifying questions before proceeding." 
"Do not provide specific medical diagnoses or emergency recommendations unless explicitly appropriate and safe to do so." "All responses should promote user safety, accuracy, completeness, clear communication, and adherence to user instructions while maintaining clinical standards." @@ -305,6 +305,43 @@ def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): return _finalize_conversation(messages, None) +def create_arxiv_summarization_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): + """ + Creates a zero-shot prompt for arXiv paper summarization using Harmony format. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + instructions = ( + "You are an expert at reading and summarizing academic research papers. " + "Your task is to provide clear, concise, and accurate summaries of research papers. " + "Focus on the key contributions, methodology, results, and implications. " + "Structure your summary to be accessible while maintaining technical accuracy." + ) + + messages = _create_base_messages(reasoning_effort, instructions) + + return _finalize_conversation(messages, user_query) + + +def create_default_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): + """ + Creates a default zero-shot prompt for general problem solving using Harmony format. + This is used when no specific dataset is specified. + + Returns: + tuple: (conversation_object, token_list) ready for model completion + """ + instructions = ( + "You are a helpful AI assistant that solves user questions. " + "Provide a well-structured answer to the user's question." + ) + + messages = _create_base_messages(reasoning_effort, instructions) + + return _finalize_conversation(messages, user_query) + + def process_row(args): """ Worker function to process a single row from the dataframe. @@ -316,12 +353,26 @@ def process_row(args): tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) """ index, row, dataset_function_map, reasoning_effort = args - dataset_name = row["dataset"] + + # Check if dataset column exists, use default if not + if "dataset" in row: + dataset_name = row["dataset"] + else: + dataset_name = "default" if dataset_name == "healthbench": user_query = row["prompt"] else: - user_query = row["question"] + # Try to get question from common column names + if "question" in row: + user_query = row["question"] + elif "prompt" in row: + user_query = row["prompt"] + elif "query" in row: + user_query = row["query"] + else: + error_msg = f"No query column found (tried: question, prompt, query) at index {index}" + return (index, None, None, dataset_name, error_msg) try: # Get the appropriate function based on dataset type @@ -364,16 +415,20 @@ def process_row(args): # Filter by dataset if specified if args.dataset is not None: - original_len = len(df) - available_datasets = sorted(df['dataset'].unique().tolist()) - df = df[df['dataset'] == args.dataset].copy() - print( - f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") - if len(df) == 0: - print(f"ERROR: No rows found for dataset '{args.dataset}'") - print(f"Available datasets: {available_datasets}") - import sys - sys.exit(1) + if 'dataset' not in df.columns: + print(f"WARNING: No 'dataset' column found in dataframe. 
Cannot filter by dataset.") + print(f"All rows will be processed using the default prompt function.") + else: + original_len = len(df) + available_datasets = sorted(df['dataset'].unique().tolist()) + df = df[df['dataset'] == args.dataset].copy() + print( + f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") + if len(df) == 0: + print(f"ERROR: No rows found for dataset '{args.dataset}'") + print(f"Available datasets: {available_datasets}") + import sys + sys.exit(1) # Apply row limit if specified if args.max_rows is not None: @@ -384,12 +439,14 @@ def process_row(args): dataset_function_map = { 'aime1983': create_aime1983_prompt, 'aime2025': create_aime1983_prompt, + 'arxiv_summarization': create_arxiv_summarization_prompt, 'gpqa': create_gpqa_prompt, 'livecodebench': create_livecodebench_prompt, 'math500': create_math500_prompt, 'mmlu_pro': create_mmlu_prompt, 'mmlu': create_mmlu_prompt, 'healthbench': create_healthbench_prompt, + 'default': create_default_prompt, } # Prepare data for parallel processing diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index d24dbff870..1c5da36983 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -151,31 +151,53 @@ def main(): # Load data df = load_data(args.pkl_path) - # Filter for 100% accuracy - df_100 = df[df['prompt_accuracy'] == 100.0] - print( - f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") + # Check if dataset column exists + has_dataset = 'dataset' in df.columns + if not has_dataset: + print("\nNote: 'dataset' column not found - skipping per-dataset histograms") + # Add a dummy dataset column for compatibility with existing code + df['dataset'] = 'default' + + # Check if prompt_accuracy column exists + has_accuracy = 'prompt_accuracy' in df.columns + + if has_accuracy: + # Filter for 100% accuracy + df_100 = df[df['prompt_accuracy'] == 100.0].copy() + print( + f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") + else: + print("\nNote: 'prompt_accuracy' column not found - skipping accuracy-based histograms\n") + # Create empty dataframe with dataset column for consistency + df_100 = pd.DataFrame(columns=df.columns) print("=" * 60) print("CREATING ISL HISTOGRAMS") print("=" * 60) # 1. Per dataset ISL histogram - create_per_dataset_histogram( - df, 'tok_input_len', - 'Token Input Length (ISL)', - '1_per_dataset_ISL.png', - args.output_dir) + if has_dataset: + create_per_dataset_histogram( + df, 'tok_input_len', + 'Token Input Length (ISL)', + '1_per_dataset_ISL.png', + args.output_dir) + else: + print("Skipping per-dataset ISL: dataset column not found") # 2. Per dataset ISL histogram (accuracy == 100) - if len(df_100) > 0: + if has_dataset and has_accuracy and len(df_100) > 0: create_per_dataset_histogram( df_100, 'tok_input_len', 'Token Input Length (ISL) - 100% Accuracy', '2_per_dataset_ISL_acc100.png', args.output_dir) + elif not has_dataset: + print("Skipping per-dataset ISL (acc==100): dataset column not found") + elif not has_accuracy: + print("Skipping per-dataset ISL (acc==100): prompt_accuracy column not found") else: - print("Skipping per-dataset ISL (acc==100): no data") + print("Skipping per-dataset ISL (acc==100): no data with 100% accuracy") # 3. Full ISL histogram create_full_histogram( @@ -185,35 +207,44 @@ def main(): args.output_dir) # 4. 
Full ISL histogram (accuracy == 100) - if len(df_100) > 0: + if has_accuracy and len(df_100) > 0: create_full_histogram( df_100, 'tok_input_len', 'Token Input Length (ISL) - 100% Accuracy', '4_full_ISL_acc100.png', args.output_dir) + elif has_accuracy: + print("Skipping full ISL (acc==100): no data with 100% accuracy") else: - print("Skipping full ISL (acc==100): no data") + print("Skipping full ISL (acc==100): prompt_accuracy column not found") print("\n" + "=" * 60) print("CREATING OSL HISTOGRAMS") print("=" * 60) # 5. Per dataset OSL histogram - create_per_dataset_histogram( - df, 'tok_model_output_len', - 'Token Output Length (OSL)', - '5_per_dataset_OSL.png', - args.output_dir) + if has_dataset: + create_per_dataset_histogram( + df, 'tok_model_output_len', + 'Token Output Length (OSL)', + '5_per_dataset_OSL.png', + args.output_dir) + else: + print("Skipping per-dataset OSL: dataset column not found") # 6. Per dataset OSL histogram (accuracy == 100) - if len(df_100) > 0: + if has_dataset and has_accuracy and len(df_100) > 0: create_per_dataset_histogram( df_100, 'tok_model_output_len', 'Token Output Length (OSL) - 100% Accuracy', '6_per_dataset_OSL_acc100.png', args.output_dir) + elif not has_dataset: + print("Skipping per-dataset OSL (acc==100): dataset column not found") + elif not has_accuracy: + print("Skipping per-dataset OSL (acc==100): prompt_accuracy column not found") else: - print("Skipping per-dataset OSL (acc==100): no data") + print("Skipping per-dataset OSL (acc==100): no data with 100% accuracy") # 7. Full OSL histogram create_full_histogram( @@ -223,14 +254,16 @@ def main(): args.output_dir) # 8. Full OSL histogram (accuracy == 100) - if len(df_100) > 0: + if has_accuracy and len(df_100) > 0: create_full_histogram( df_100, 'tok_model_output_len', 'Token Output Length (OSL) - 100% Accuracy', '8_full_OSL_acc100.png', args.output_dir) + elif has_accuracy: + print("Skipping full OSL (acc==100): no data with 100% accuracy") else: - print("Skipping full OSL (acc==100): no data") + print("Skipping full OSL (acc==100): prompt_accuracy column not found") print(f"\n{'=' * 60}") print(f"All histograms saved to {args.output_dir}/") diff --git a/language/gpt-oss/setup_enroot.sh b/language/gpt-oss/setup_enroot.sh index 656d19b3dd..4e99f0bd84 100755 --- a/language/gpt-oss/setup_enroot.sh +++ b/language/gpt-oss/setup_enroot.sh @@ -22,7 +22,7 @@ while [[ $# -gt 0 ]]; do esac done -mkdir -p $sqsh_location +# mkdir -p $sqsh_location enroot import -o $sqsh_location/$sandbox_name.sqsh docker://$docker_image enroot create --name $sandbox_name $sqsh_location/$sandbox_name.sqsh -# enroot start --mount $(pwd):$(pwd) --root --rw $sandbox_name \ No newline at end of file +# enroot start --mount $(pwd):$(pwd) --root --rw $sandbox_name From 531c37c36dd1227c558e7e7a983f4e1f9932a9bb Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 5 Nov 2025 02:07:41 +0000 Subject: [PATCH 063/165] add pass@k; add spec decode option --- language/gpt-oss/eval_accuracy.py | 333 +++++++++++++++++++++--------- language/gpt-oss/run_server.sh | 25 ++- 2 files changed, 251 insertions(+), 107 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 7403e2835e..21dd0e2a2c 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -59,18 +59,48 @@ # ============================================================================= +def detect_pass_k(df: pd.DataFrame) -> int: + """Detect if DataFrame has pass@k format and return k. 
+ + Returns: + Number of passes (k) if pass@k format detected, otherwise 1 + """ + # Check for model_output_0, model_output_1, etc. + pass_k = 0 + while f'model_output_{pass_k}' in df.columns: + pass_k += 1 + + # If no _0 suffix found, check for single model_output column + if pass_k == 0 and 'model_output' in df.columns: + return 1 + + return pass_k + + def validate_dataframe(df: pd.DataFrame) -> None: """Validate input DataFrame has required columns.""" if not isinstance(df, pd.DataFrame): raise ValueError("Input must be a pandas DataFrame") - required_cols = [ - 'model_output', - 'dataset', - 'tok_model_output_len'] - missing_cols = [col for col in required_cols if col not in df.columns] - if missing_cols: - raise ValueError(f"Missing required columns: {missing_cols}") + # Detect pass@k format + pass_k = detect_pass_k(df) + + if pass_k == 0: + raise ValueError("No model_output columns found (expected 'model_output' or 'model_output_0', 'model_output_1', etc.)") + + # Check for dataset column + if 'dataset' not in df.columns: + raise ValueError("Missing required column: 'dataset'") + + # Check for tok_model_output_len (either single or with suffixes) + has_tok_len = False + if pass_k == 1: + has_tok_len = 'tok_model_output_len' in df.columns + else: + has_tok_len = all(f'tok_model_output_len_{i}' in df.columns for i in range(pass_k)) + + if not has_tok_len: + raise ValueError("Missing required tok_model_output_len column(s)") # Check for ground_truth or rubrics depending on dataset has_ground_truth = 'ground_truth' in df.columns @@ -940,13 +970,30 @@ def process_row(row: pd.Series) -> Dict[str, Any]: def process_livecodebench_parallel( - df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: - """Process LiveCodeBench items in parallel.""" + df: pd.DataFrame, + group_indices: pd.Index, + extracted_answer_col: str = 'extracted_answer', + prompt_accuracy_col: str = 'prompt_accuracy', + evaluation_details_col: str = 'evaluation_details', + pass_label: str = '') -> Tuple[int, int]: + """Process LiveCodeBench items in parallel. 
+ + Args: + df: DataFrame with data + group_indices: Indices to process + extracted_answer_col: Column name for extracted answers + prompt_accuracy_col: Column name for accuracy results + evaluation_details_col: Column name for evaluation details + pass_label: Label for logging (e.g., 'pass 0', 'pass 1') + + Returns: + Tuple of (correct_count, total_evaluated) + """ # Prepare work items work_items = [] for idx in group_indices: row = df.loc[idx] - extracted = row.get('extracted_answer') + extracted = row.get(extracted_answer_col) ground_truth = row.get('ground_truth') if extracted is not None and not pd.isna(ground_truth): @@ -956,13 +1003,14 @@ def process_livecodebench_parallel( return 0, 0 # Ensure evaluation_details column exists - if 'evaluation_details' not in df.columns: - df['evaluation_details'] = None + if evaluation_details_col not in df.columns: + df[evaluation_details_col] = None # Process in parallel max_workers = min(multiprocessing.cpu_count(), len(work_items)) + desc = f"Evaluating LiveCodeBench{' ' + pass_label if pass_label else ''}" logger.info( - f"Evaluating {len(work_items)} LiveCodeBench items with {max_workers} workers") + f"Evaluating {len(work_items)} LiveCodeBench items{' ' + pass_label if pass_label else ''} with {max_workers} workers") correct_count = 0 total_evaluated = 0 @@ -974,21 +1022,21 @@ def process_livecodebench_parallel( } for future in tqdm(as_completed(future_to_idx, timeout=1200), - total=len(future_to_idx), desc="Evaluating LiveCodeBench"): + total=len(future_to_idx), desc=desc): idx = future_to_idx[future] try: question_id, is_correct, detailed_reason = future.result( timeout=30) - df.at[idx, 'prompt_accuracy'] = 100.0 if is_correct else 0.0 - df.at[idx, 'evaluation_details'] = detailed_reason + df.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + df.at[idx, evaluation_details_col] = detailed_reason total_evaluated += 1 if is_correct: correct_count += 1 except Exception as e: - logger.error(f"Error evaluating row {idx}: {e}") - df.at[idx, 'prompt_accuracy'] = 0.0 - df.at[idx, 'evaluation_details'] = f"Error: {e}" + logger.error(f"Error evaluating row {idx}{' ' + pass_label if pass_label else ''}: {e}") + df.at[idx, prompt_accuracy_col] = 0.0 + df.at[idx, evaluation_details_col] = f"Error: {e}" total_evaluated += 1 return correct_count, total_evaluated @@ -1000,7 +1048,9 @@ def evaluate_healthbench_batch( grader_api_key: Optional[str] = None, grader_base_url: Optional[str] = None, grader_model: Optional[str] = None, - max_workers: Optional[int] = None + max_workers: Optional[int] = None, + extracted_answer_col: str = 'extracted_answer', + pass_label: str = '' ) -> Dict[int, Tuple[float, str]]: """Evaluate all HealthBench rows with batched rubric grading across all rows. 
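# Hedged sketch (not part of the patch): the pass@k column-naming convention the
# hunks above and below thread through the evaluators. With k == 1 the legacy
# unsuffixed column names are kept; with k > 1 every pass gets a numeric suffix.
# The helper name is an assumption used only for illustration.
def pass_columns(pass_num: int, pass_k: int) -> dict:
    suffix = f"_{pass_num}" if pass_k > 1 else ""
    return {
        "model_output": f"model_output{suffix}",
        "extracted_answer": f"extracted_answer{suffix}",
        "prompt_accuracy": f"prompt_accuracy{suffix}",
        "evaluation_details": f"evaluation_details{suffix}",
    }

# Example: pass_columns(1, 4)["prompt_accuracy"] -> "prompt_accuracy_1",
# while pass_columns(0, 1)["prompt_accuracy"] -> "prompt_accuracy".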
@@ -1011,6 +1061,8 @@ def evaluate_healthbench_batch( grader_base_url: Base URL for API grader_model: Model name max_workers: Max concurrent requests + extracted_answer_col: Column name for extracted answers (e.g., 'extracted_answer_0') + pass_label: Label for logging (e.g., 'pass 0') Returns: Dictionary mapping row index to (score, explanation) tuple @@ -1058,7 +1110,7 @@ def evaluate_healthbench_batch( for idx in group_indices: row = df.loc[idx] - extracted = row.get('extracted_answer') + extracted = row.get(extracted_answer_col) if extracted is None or pd.isna(extracted): row_rubric_map[f"row_{idx}_skip"] = (idx, None) @@ -1099,11 +1151,11 @@ def evaluate_healthbench_batch( task_id += 1 if not all_tasks: - logger.warning("No grading tasks to process") + logger.warning(f"No grading tasks to process{' for ' + pass_label if pass_label else ''}") return {} logger.info( - f"Batching {len(all_tasks)} rubric grading requests across {len(group_indices)} rows") + f"Batching {len(all_tasks)} rubric grading requests{' for ' + pass_label if pass_label else ''} across {len(group_indices)} rows") # Define grading function def _grade_single_task(task): @@ -1145,8 +1197,9 @@ def _grade_single_task(task): _grade_single_task, task): task['task_id'] for task in all_tasks} + desc = f"Grading HealthBench{' ' + pass_label if pass_label else ''} (batched)" for future in tqdm(as_completed(futures), total=len( - futures), desc="Grading HealthBench (batched)"): + futures), desc=desc): try: task_id, result = future.result(timeout=60) grading_results[task_id] = result @@ -1210,92 +1263,143 @@ def _grade_single_task(task): def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: - """Process entire dataframe with optimized batch processing.""" + """Process entire dataframe with optimized batch processing. + + Supports both single-pass and pass@k formats: + - Single-pass: model_output -> extracted_answer, prompt_accuracy + - Pass@k: model_output_0, model_output_1, ... -> extracted_answer_0, prompt_accuracy_0, ... + and aggregated prompt_accuracy = max(prompt_accuracy_0, prompt_accuracy_1, ...) 
+ """ validate_dataframe(df) df_output = df.copy() - df_output['extracted_answer'] = None - df_output['prompt_accuracy'] = 0.0 - df_output['evaluation_details'] = None # Add evaluation details column + + # Detect pass@k + pass_k = detect_pass_k(df) + logger.info(f"Detected pass@k format with k={pass_k}") + + # Initialize columns for each pass + for pass_num in range(pass_k): + suffix = f'_{pass_num}' if pass_k > 1 else '' + df_output[f'extracted_answer{suffix}'] = None + df_output[f'prompt_accuracy{suffix}'] = 0.0 + df_output[f'evaluation_details{suffix}'] = None + + # Add aggregated columns for pass@k + if pass_k > 1: + df_output['prompt_accuracy'] = 0.0 # Will be max of all passes + df_output['evaluation_details'] = None # Will aggregate details # Process by dataset for dataset_name, group_indices in tqdm(df_output.groupby('dataset').groups.items(), desc="Processing datasets"): evaluator = get_evaluator(dataset_name) - # Parse answers for all rows in this dataset - logger.info( - f"Processing {len(group_indices)} rows for dataset '{dataset_name}'") - for idx in group_indices: - row = df_output.loc[idx] - raw_output = validate_text_input(row['model_output']) - extracted = evaluator['parse'](raw_output) - df_output.at[idx, 'extracted_answer'] = extracted - - # Set initial evaluation details for rows without extracted answers - if extracted is None or pd.isna(extracted): - df_output.at[idx, - 'evaluation_details'] = "No code extracted from model output" - - # Evaluate answers - if 'livecodebench' in dataset_name.lower(): - correct_count, total_evaluated = process_livecodebench_parallel( - df_output, group_indices) - elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - batched across all rows - total_score = 0.0 - total_evaluated = 0 - - # Process all rows with batched grading - results = evaluate_healthbench_batch( - df_output, - group_indices, - grader_api_key=LLM_JUDGE_API_KEY, - grader_base_url=LLM_JUDGE_BASE_URL, - grader_model=LLM_JUDGE_MODEL, - max_workers=LLM_JUDGE_MAX_WORKERS - ) - - # Store results - for idx, (score, explanation) in results.items(): - # Store score as percentage (0-100) - df_output.at[idx, 'prompt_accuracy'] = score * 100.0 - # Store explanation in a new column if needed - if 'evaluation_details' not in df_output.columns: - df_output['evaluation_details'] = None - df_output.at[idx, 'evaluation_details'] = explanation - total_evaluated += 1 - total_score += score - else: - # Sequential evaluation for other datasets - correct_count = 0 - total_evaluated = 0 - + # Process each pass + for pass_num in range(pass_k): + suffix = f'_{pass_num}' if pass_k > 1 else '' + model_output_col = f'model_output{suffix}' + extracted_answer_col = f'extracted_answer{suffix}' + prompt_accuracy_col = f'prompt_accuracy{suffix}' + evaluation_details_col = f'evaluation_details{suffix}' + + logger.info( + f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") + + # Parse answers for all rows in this dataset for this pass for idx in group_indices: row = df_output.loc[idx] - extracted = row['extracted_answer'] - ground_truth = row.get('ground_truth') + raw_output = validate_text_input(row[model_output_col]) + extracted = evaluator['parse'](raw_output) + df_output.at[idx, extracted_answer_col] = extracted + + # Set initial evaluation details for rows without extracted answers + if extracted is None or pd.isna(extracted): + df_output.at[idx, evaluation_details_col] = "No answer extracted from model output" + + # Evaluate 
answers for this pass + pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' + + if 'livecodebench' in dataset_name.lower(): + correct_count, total_evaluated = process_livecodebench_parallel( + df_output, + group_indices, + extracted_answer_col=extracted_answer_col, + prompt_accuracy_col=prompt_accuracy_col, + evaluation_details_col=evaluation_details_col, + pass_label=pass_label_str + ) + elif 'healthbench' in dataset_name.lower(): + # HealthBench evaluation with LLM grading - batched across all rows + total_score = 0.0 + total_evaluated = 0 + + # Process all rows with batched grading for this pass + results = evaluate_healthbench_batch( + df_output, + group_indices, + grader_api_key=LLM_JUDGE_API_KEY, + grader_base_url=LLM_JUDGE_BASE_URL, + grader_model=LLM_JUDGE_MODEL, + max_workers=LLM_JUDGE_MAX_WORKERS, + extracted_answer_col=extracted_answer_col, + pass_label=pass_label_str + ) - if extracted is not None and not pd.isna(ground_truth): - is_correct = evaluator['evaluate'](extracted, ground_truth) - df_output.at[idx, - 'prompt_accuracy'] = 100.0 if is_correct else 0.0 + # Store results for this pass + for idx, (score, explanation) in results.items(): + # Store score as percentage (0-100) + df_output.at[idx, prompt_accuracy_col] = score * 100.0 + df_output.at[idx, evaluation_details_col] = explanation total_evaluated += 1 - if is_correct: - correct_count += 1 - - # Log results - if total_evaluated > 0: - if 'healthbench' in dataset_name.lower(): - # For HealthBench, report average score - avg_score = total_score / total_evaluated * 100 - logger.info( - f"{dataset_name} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + total_score += score else: - # For other datasets, report accuracy - accuracy = correct_count / total_evaluated * 100 - logger.info( - f"{dataset_name} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + # Sequential evaluation for other datasets + correct_count = 0 + total_evaluated = 0 + + for idx in group_indices: + row = df_output.loc[idx] + extracted = row[extracted_answer_col] + ground_truth = row.get('ground_truth') + + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate'](extracted, ground_truth) + df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + total_evaluated += 1 + if is_correct: + correct_count += 1 + + # Log results for this pass + if total_evaluated > 0: + if 'healthbench' in dataset_name.lower(): + # For HealthBench, report average score + avg_score = total_score / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + else: + # For other datasets, report accuracy + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + + # Aggregate results across all passes (take max) + if pass_k > 1: + logger.info(f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") + for idx in group_indices: + # Get all accuracy values for this row + accuracies = [] + for pass_num in range(pass_k): + acc = df_output.at[idx, f'prompt_accuracy_{pass_num}'] + accuracies.append(acc if not pd.isna(acc) else 0.0) + + # Set aggregated accuracy as max + max_accuracy = max(accuracies) + df_output.at[idx, 'prompt_accuracy'] = max_accuracy + + # Find which pass achieved max accuracy + max_pass = accuracies.index(max_accuracy) + 
df_output.at[idx, 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" return df_output @@ -1318,13 +1422,31 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, if logger is None: logger = logging.getLogger(__name__) + # Detect pass@k + pass_k = detect_pass_k(df_evaluated) + # Calculate statistics - evaluated = df_evaluated['extracted_answer'].notna().sum() - correct = (df_evaluated['prompt_accuracy'] > 0).sum() - accuracy = df_evaluated['prompt_accuracy'].mean() - - # tok_model_output_len is now a required column - mean_output_len = float(df_evaluated['tok_model_output_len'].mean()) + if pass_k > 1: + # For pass@k, use the aggregated prompt_accuracy (max across passes) + evaluated = df_evaluated['extracted_answer_0'].notna().sum() # Count from first pass + correct = (df_evaluated['prompt_accuracy'] > 0).sum() + accuracy = df_evaluated['prompt_accuracy'].mean() + + # Calculate average token length across all passes + all_output_lens = [] + for i in range(pass_k): + all_output_lens.extend(df_evaluated[f'tok_model_output_len_{i}'].tolist()) + mean_output_len = float(sum(all_output_lens) / len(all_output_lens)) if all_output_lens else 0.0 + else: + # Single pass format + suffix = '' if 'extracted_answer' in df_evaluated.columns else '_0' + evaluated = df_evaluated[f'extracted_answer{suffix}'].notna().sum() + correct = (df_evaluated[f'prompt_accuracy{suffix}'] > 0).sum() + accuracy = df_evaluated[f'prompt_accuracy{suffix}'].mean() + + # tok_model_output_len is now a required column + tok_len_col = 'tok_model_output_len' if 'tok_model_output_len' in df_evaluated.columns else 'tok_model_output_len_0' + mean_output_len = float(df_evaluated[tok_len_col].mean()) # Check if this is HealthBench dataset is_healthbench = False @@ -1346,6 +1468,13 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } + + if pass_k > 1: + results['pass_k'] = pass_k + # Also report individual pass accuracies + for i in range(pass_k): + pass_acc = df_evaluated[f'prompt_accuracy_{i}'].mean() + results[f'{metric_key}_pass_{i}'] = float(pass_acc) print("\nResults\n") print(results) diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/run_server.sh index 71746af6ff..2d164182d8 100755 --- a/language/gpt-oss/run_server.sh +++ b/language/gpt-oss/run_server.sh @@ -2,6 +2,7 @@ dp=1 model_path=openai/gpt-oss-120b +eagle_path="" while [[ $# -gt 0 ]]; do case $1 in @@ -13,6 +14,10 @@ while [[ $# -gt 0 ]]; do model_path=$2 shift 2 ;; + --eagle_path) + eagle_path=$2 + shift 2 + ;; *) echo "Unknown argument: $1" exit 1 @@ -20,9 +25,7 @@ while [[ $# -gt 0 ]]; do esac done -set -x; -python3 -m sglang.launch_server \ - --model-path $model_path \ +args=" --model-path $model_path \ --host 0.0.0.0 \ --port 30000 \ --tp-size=1 \ @@ -31,5 +34,17 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --chunked-prefill-size 16384 \ --ep-size=1 \ - --quantization mxfp4 \ - --stream-interval 500 \ No newline at end of file + --stream-interval 500 " + +if [ -n "$eagle_path" ]; then + args="$args --speculative-draft-model-path $eagle_path \ + --speculative-algorithm EAGLE3" +fi + +# --speculative-num-steps 1 \ +# --speculative-eagle-topk 1 \ +# --speculative-num-draft-tokens 3 \ + + +set -x; +python3 -m sglang.launch_server $args From 5e86d65711e21cb4511ad54efbb01a70a47e3e2b Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 5 Nov 2025 04:10:18 +0000 Subject: [PATCH 064/165] add openai client; add pass@k --- 
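# Hedged sketch (illustrative only): the pass@k fan-out this commit introduces in
# run_infer.py, reduced to its core idea. Each sample is expanded into k work
# items tagged with (sample_id, pass_num) so that k independent generations can
# later be grouped back into model_output_0 ... model_output_{k-1}. Function and
# variable names here are assumptions, not the file's actual identifiers, and a
# thread pool stands in for whatever worker pool the script really uses.
from concurrent.futures import ThreadPoolExecutor


def fan_out(samples, pass_k, send_fn, max_concurrency=8):
    # samples: iterable of (sample_id, payload); send_fn: payload -> response
    work = [(sample_id, pass_num, payload)
            for sample_id, payload in samples
            for pass_num in range(pass_k)]
    results = {}  # maps (sample_id, pass_num) -> response
    with ThreadPoolExecutor(max_workers=max_concurrency) as pool:
        futures = {pool.submit(send_fn, payload): (sample_id, pass_num)
                   for sample_id, pass_num, payload in work}
        for future, key in futures.items():
            results[key] = future.result()
    return results

# Usage: fan_out([(0, prompt_a), (1, prompt_b)], pass_k=2, send_fn=my_request_fn)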
language/gpt-oss/run_infer.py | 423 +++++++++++++++++++++++++--------- 1 file changed, 308 insertions(+), 115 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index a9c61d8c18..b20754125b 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -1,17 +1,21 @@ #!/usr/bin/env python3 """ -Script to send pre-tokenized requests to SGLang server. +Script to send pre-tokenized requests to inference server (SGLang or OpenAI). Usage: - python send_requests.py --input-tokens tokenized_data.pkl [options] + python run_infer.py --input-tokens tokenized_data.pkl [options] Arguments: --input-tokens Path to pickle file containing pre-tokenized data from harmony-tokens.py - --server-url SGLang server URL (default: http://umbriel-b200-145:30000) + --server-url Server URL (default: http://localhost:30000) --max-samples Maximum number of samples to process (default: all) --max-tokens Maximum tokens to generate per request (default: 100) - --max-concurrency Maximum number of concurrent requests (default: 128) + --max-concurrency Maximum number of concurrent requests (default: 256) --output Output pickle file for responses (optional) + --pass-k Number of inference passes per sample for pass@k strategy (default: 1) + --client-type Client type: sglang or openai (default: sglang) + --api-key API key for OpenAI client (optional) + --model Model name for OpenAI client (default: gpt-4) """ import requests @@ -24,6 +28,7 @@ import pandas as pd from tqdm import tqdm from transformers import AutoTokenizer +from openai import OpenAI # Set up logging logging.basicConfig( @@ -89,6 +94,73 @@ def send_request( return {"error": str(e)} +class OpenAIClient: + def __init__(self, + server_url: str = "https://api.openai.com/v1", + temperature: float = 0.001, + top_k: int = 1, + timeout: int = 1200, + api_key: str = None, + model: str = "gpt-4" + ): + self.base_url = server_url + self.temperature = temperature + self.top_k = top_k + self.timeout = timeout + self.model = model + # Initialize OpenAI client + self.client = OpenAI( + api_key=api_key, + base_url=server_url, + timeout=timeout + ) + + def send_request( + self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: + """Send a single request to the OpenAI-compatible server. + + Note: OpenAI API expects text input, so we need to decode the input_ids first. 
+ """ + try: + # Decode input_ids to text + tokenizer = get_tokenizer() + prompt_text = tokenizer.decode(input_ids, skip_special_tokens=False) + + # Make request to OpenAI-compatible API + response = self.client.completions.create( + model=self.model, + prompt=prompt_text, + max_tokens=max_tokens, + temperature=self.temperature, + top_p=1.0, # Use top_p instead of top_k for OpenAI + logprobs=0, # Request logprobs to get token IDs + ) + + # Convert OpenAI response to SGLang-compatible format + choice = response.choices[0] + output_text = choice.text + + # Tokenize the output to get output_ids + output_ids = tokenizer.encode(output_text, add_special_tokens=False) + + # Build compatible response format + result = { + "output_ids": output_ids, + "text": output_text, + "meta_info": { + "completion_tokens": response.usage.completion_tokens, + "prompt_tokens": response.usage.prompt_tokens, + "total_tokens": response.usage.total_tokens, + } + } + + return result + + except Exception as e: + logger.error(f"OpenAI request failed: {e}") + return {"error": str(e)} + + def load_tokenized_data(data_file: str) -> pd.DataFrame: """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" logger.info(f"Loading tokenized data from {data_file}") @@ -127,14 +199,23 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" - input_ids, max_tokens, server_url, sample_id, temperature, top_k, timeout = args_tuple - - # Create a new client for this process - client = SGLangClient( - server_url=server_url, - temperature=temperature, - top_k=top_k, - timeout=timeout) + input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, timeout, client_type, api_key, model = args_tuple + + # Create a new client for this process based on client_type + if client_type == "openai": + client = OpenAIClient( + server_url=server_url, + temperature=temperature, + top_k=top_k, + timeout=timeout, + api_key=api_key, + model=model) + else: # default to sglang + client = SGLangClient( + server_url=server_url, + temperature=temperature, + top_k=top_k, + timeout=timeout) try: # Track latency: time from request sent to response received @@ -142,34 +223,44 @@ def send_single_request(args_tuple): response = client.send_request(input_ids, max_tokens=max_tokens) end_time = time.time() latency = end_time - start_time - return sample_id, response, latency + return sample_id, pass_num, response, latency except Exception as e: - logger.error(f"Request {sample_id} failed: {e}") + logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") # Return None for latency on error - return sample_id, {"error": str(e)}, None + return sample_id, pass_num, {"error": str(e)}, None def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, - max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200): - """Send all requests to SGLang server in parallel using multiprocessing. + max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200, + client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", pass_k: int = 1): + """Send all requests to server in parallel using multiprocessing. 
+ + Args: + client_type: Type of client to use ("sglang" or "openai") + api_key: API key for OpenAI client (optional, will use env var if not provided) + model: Model name for OpenAI client + pass_k: Number of inference passes per sample for pass@k strategy Returns: - tuple: (responses, latencies) - List of responses and list of latencies in seconds + tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency """ num_samples = len(tokenized_df) + total_requests = num_samples * pass_k logger.info( - f"Sending {num_samples} requests to server with {max_concurrency} concurrent workers...") + f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers using {client_type} client...") - # Prepare arguments for multiprocessing - args_list = [ - (row['tok_input'], max_tokens, server_url, - idx, temperature, top_k, timeout) - for idx, row in tokenized_df.iterrows() - ] + # Prepare arguments for multiprocessing - create pass_k requests per sample + args_list = [] + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + args_list.append(( + row['tok_input'], max_tokens, server_url, + idx, pass_num, temperature, top_k, timeout, client_type, api_key, model + )) start_time = time.time() - with Pool(processes=min(max_concurrency, num_samples)) as pool: + with Pool(processes=min(max_concurrency, total_requests)) as pool: results = list(tqdm( pool.imap_unordered(send_single_request, args_list), total=len(args_list), @@ -177,109 +268,165 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, unit="request" )) - # Sort results by sample_id to maintain order - results.sort(key=lambda x: x[0]) - responses = [result[1] for result in results] - latencies = [result[2] for result in results] + # Group results by sample_id and pass_num + responses_by_pass = {} + latencies_by_pass = {} + for sample_id, pass_num, response, latency in results: + responses_by_pass[(sample_id, pass_num)] = response + latencies_by_pass[(sample_id, pass_num)] = latency total_time = time.time() - start_time logger.info( - f"Completed {num_samples} requests in {total_time:.2f} seconds") - logger.info(f"Average rate: {num_samples/total_time:.2f} requests/sec") + f"Completed {total_requests} requests in {total_time:.2f} seconds") + logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") # Log latency statistics - valid_latencies = [lat for lat in latencies if lat is not None] + valid_latencies = [lat for lat in latencies_by_pass.values() if lat is not None] if valid_latencies: avg_latency = sum(valid_latencies) / len(valid_latencies) min_latency = min(valid_latencies) max_latency = max(valid_latencies) logger.info(f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") - return responses, latencies + return responses_by_pass, latencies_by_pass def extract_response_ids( - responses: List[Dict[str, Any]], tokenized_df: pd.DataFrame) -> List[List[int]]: - """Extract response output_ids from SGLang responses.""" + responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: + """Extract response output_ids from SGLang responses for all passes. 
+ + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + tokenized_df: DataFrame with samples + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to output_ids list + """ logger.info("Extracting response output_ids...") - response_ids = [] - for i, (response, (_, row)) in enumerate(tqdm(zip(responses, tokenized_df.iterrows()), - total=len(responses), - desc="Extracting responses", - unit="response")): - response_id = [] - if "error" not in response and "output_ids" in response: - try: - # SGLang returns the generated token IDs in the 'output_ids' - # field - response_id = response["output_ids"] - except Exception as e: - logger.warning( - f"Failed to extract response for sample {i+1}: {e}") - response_ids.append(response_id) + response_ids_by_pass = {} + total_responses = len(tokenized_df) * pass_k + + with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + response = responses_by_pass.get((idx, pass_num), {}) + response_id = [] + if "error" not in response and "output_ids" in response: + try: + # SGLang returns the generated token IDs in the 'output_ids' field + response_id = response["output_ids"] + except Exception as e: + logger.warning( + f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") + response_ids_by_pass[(idx, pass_num)] = response_id + pbar.update(1) logger.info("Response output_ids extraction complete") - return response_ids + return response_ids_by_pass -def detokenize_output_ids(response_ids: List[List[int]]) -> List[str]: - """Detokenize output_ids back to text using AutoTokenizer.""" +def detokenize_output_ids(response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: + """Detokenize output_ids back to text using AutoTokenizer for all passes. 
+ + Args: + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to detokenized text + """ logger.info("Detokenizing output_ids to text...") tokenizer = get_tokenizer() - detokenized_texts = [] + detokenized_texts_by_pass = {} - for i, token_ids in enumerate( - tqdm(response_ids, desc="Detokenizing outputs", unit="output")): + for (sample_id, pass_num), token_ids in tqdm( + response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): try: # Detokenize the token IDs back to text text = tokenizer.decode(token_ids, skip_special_tokens=True) - detokenized_texts.append(text) + detokenized_texts_by_pass[(sample_id, pass_num)] = text except Exception as e: logger.warning( - f"Failed to detokenize output for sample {i+1}: {e}") - detokenized_texts.append("") + f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") + detokenized_texts_by_pass[(sample_id, pass_num)] = "" logger.info("Output detokenization complete") - return detokenized_texts + return detokenized_texts_by_pass -def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int]], - detokenized_texts: List[str], latencies: List[float], - tokenized_df: pd.DataFrame, output_file: str = None) -> pd.DataFrame: - """Save all responses to DataFrame and optionally to pickle file.""" +def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], + response_ids_by_pass: Dict[tuple, List[int]], + detokenized_texts_by_pass: Dict[tuple, str], + latencies_by_pass: Dict[tuple, float], + tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: + """Save all responses to DataFrame and optionally to pickle file. + + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text + latencies_by_pass: Dict mapping (sample_id, pass_num) to latency + tokenized_df: Original DataFrame with samples + pass_k: Number of passes per sample + output_file: Optional output pickle file + + Returns: + DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) + """ logger.info("Processing responses and updating DataFrame...") # Work with the original DataFrame result_df = tokenized_df.copy() - # Overwrite existing columns with server response data - result_df['model_output'] = detokenized_texts # Detokenized text output - # Original output_ids from SGLang - result_df['tok_model_output'] = response_ids - result_df['tok_model_output_len'] = [ - len(token_ids) for token_ids in response_ids] # Length of output_ids - result_df['infer_time'] = latencies # E2E latency in seconds + # Create columns for each pass with _0, _1, _2, ... 
suffixes + for pass_num in range(pass_k): + # Lists to store data for this pass + model_outputs = [] + tok_model_outputs = [] + tok_model_output_lens = [] + infer_times = [] + + for idx in tokenized_df.index: + key = (idx, pass_num) + detokenized_text = detokenized_texts_by_pass.get(key, "") + response_ids = response_ids_by_pass.get(key, []) + latency = latencies_by_pass.get(key, None) + + model_outputs.append(detokenized_text) + tok_model_outputs.append(response_ids) + tok_model_output_lens.append(len(response_ids)) + infer_times.append(latency) + + # Add columns with suffixes + result_df[f'model_output_{pass_num}'] = model_outputs + result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs + result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens + result_df[f'infer_time_{pass_num}'] = infer_times # Calculate output token lengths for logging - output_token_lengths = [] - for i, (response, response_ids) in enumerate( - zip(responses, response_ids)): - try: - output_token_length = response["meta_info"]["completion_tokens"] if "meta_info" in response else len( - response_ids) - output_token_lengths.append(output_token_length) - except Exception as e: - logger.warning( - f"Failed to calculate output tokens for sample {i+1}: {e}") - output_token_lengths.append(len(response_ids)) + all_output_token_lengths = [] + for idx in tokenized_df.index: + for pass_num in range(pass_k): + key = (idx, pass_num) + response = responses_by_pass.get(key, {}) + response_ids = response_ids_by_pass.get(key, []) + try: + output_token_length = response.get("meta_info", {}).get("completion_tokens", len(response_ids)) + all_output_token_lengths.append(output_token_length) + except Exception as e: + logger.warning( + f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") + all_output_token_lengths.append(len(response_ids)) logger.info(f"Updated DataFrame with shape: {result_df.shape}") - logger.info( - f"Updated columns: model_output, tok_model_output, tok_model_output_len, infer_time") - logger.info( - f"Average output token length: {sum(output_token_lengths)/len(output_token_lengths):.1f}") + new_columns = [f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] + logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") + if all_output_token_lengths: + logger.info( + f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") # Save to pickle file if output_file is provided if output_file: @@ -293,37 +440,47 @@ def save_responses(responses: List[Dict[str, Any]], response_ids: List[List[int] def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = None, max_tokens: int = 100, max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, - timeout: int = 1200) -> pd.DataFrame: - """Main processing function that handles requests and response extraction.""" + timeout: int = 1200, client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", + pass_k: int = 1) -> pd.DataFrame: + """Main processing function that handles requests and response extraction. 
+ + Args: + pass_k: Number of inference passes per sample for pass@k strategy + """ # Step 1: Limit samples if specified if max_samples is not None: tokenized_df = tokenized_df.head(max_samples) logger.info(f"Limited to first {max_samples} samples") - # Step 2: Send all requests in parallel - responses, latencies = send_requests_parallel( + # Step 2: Send all requests in parallel (k passes per sample) + responses_by_pass, latencies_by_pass = send_requests_parallel( tokenized_df, server_url, max_tokens, max_concurrency, temperature, top_k, - timeout) + timeout, + client_type, + api_key, + model, + pass_k) - # Step 3: Extract response output_ids - response_ids = extract_response_ids(responses, tokenized_df) + # Step 3: Extract response output_ids for all passes + response_ids_by_pass = extract_response_ids(responses_by_pass, tokenized_df, pass_k) - # Step 4: Detokenize output_ids to text for model_output - detokenized_texts = detokenize_output_ids(response_ids) + # Step 4: Detokenize output_ids to text for model_output for all passes + detokenized_texts_by_pass = detokenize_output_ids(response_ids_by_pass, pass_k) # Step 5: Save all results and return DataFrame result_df = save_responses( - responses, - response_ids, - detokenized_texts, - latencies, + responses_by_pass, + response_ids_by_pass, + detokenized_texts_by_pass, + latencies_by_pass, tokenized_df, + pass_k, output_file) return result_df @@ -331,41 +488,62 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, def main(): parser = argparse.ArgumentParser( - description="Send pre-tokenized requests to SGLang server") + description="Send pre-tokenized requests to inference server (SGLang or OpenAI)") parser.add_argument("--input-tokens", required=True, help="Path to pickle file containing pre-tokenized data from harmony-tokens.py") parser.add_argument("--server-url", default="http://localhost:30000", - help="SGLang server URL (default: http://localhost:30000)") + help="Server URL (default: http://localhost:30000 for SGLang, https://api.openai.com/v1 for OpenAI)") parser.add_argument("--max-samples", type=int, default=None, help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate per request") parser.add_argument("--max-concurrency", type=int, default=256, - help="Maximum number of concurrent requests (default: 128)") + help="Maximum number of concurrent requests (default: 256)") parser.add_argument("--output", default=None, help="Output pickle file for responses (optional)") + parser.add_argument("--pass-k", type=int, default=1, + help="Number of inference passes per sample for pass@k strategy (default: 1)") parser.add_argument("--temperature", type=float, default=0.001, help="Temperature for sampling (default: 0.001)") parser.add_argument("--top-k", type=int, default=1, help="Top-k for sampling (default: 1)") parser.add_argument("--timeout", type=int, default=1200, help="Timeout for requests (default: 1200)") + parser.add_argument("--client-type", choices=["sglang", "openai"], default="sglang", + help="Client type to use: sglang or openai (default: sglang)") + parser.add_argument("--api-key", default=None, + help="API key for OpenAI client (optional, will use OPENAI_API_KEY env var if not provided)") + parser.add_argument("--model", default="gpt-4", + help="Model name for OpenAI client (default: gpt-4)") args = parser.parse_args() # Test connection - logger.info(f"Testing server connection to {args.server_url}...") - 
test_client = SGLangClient( - server_url=args.server_url, - temperature=args.temperature, - top_k=args.top_k, - timeout=args.timeout) + logger.info(f"Testing server connection to {args.server_url} using {args.client_type} client...") + if args.client_type == "openai": + test_client = OpenAIClient( + server_url=args.server_url, + temperature=args.temperature, + top_k=args.top_k, + timeout=args.timeout, + api_key=args.api_key, + model=args.model) + else: + test_client = SGLangClient( + server_url=args.server_url, + temperature=args.temperature, + top_k=args.top_k, + timeout=args.timeout) + test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") - logger.error("Make sure your SGLang server is running. Try:") - logger.error( - " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") + if args.client_type == "sglang": + logger.error("Make sure your SGLang server is running. Try:") + logger.error( + " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") + else: + logger.error("Make sure your OpenAI API key is valid and the server URL is correct.") return logger.info("Server connection successful") @@ -380,15 +558,30 @@ def main(): output_file=args.output, temperature=args.temperature, top_k=args.top_k, - timeout=args.timeout) + timeout=args.timeout, + client_type=args.client_type, + api_key=args.api_key, + model=args.model, + pass_k=args.pass_k) # Print summary logger.info(f"\nProcessing completed:") logger.info(f" - Total samples processed: {len(result_df)}") + logger.info(f" - Number of passes per sample: {args.pass_k}") logger.info( f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - logger.info( - f" - Average output text length: {result_df['tok_model_output_len'].mean():.1f}") + + # Calculate average output length across all passes + if args.pass_k == 1: + avg_output_len = result_df['tok_model_output_len_0'].mean() + logger.info(f" - Average output token length: {avg_output_len:.1f}") + else: + all_output_lens = [] + for i in range(args.pass_k): + all_output_lens.extend(result_df[f'tok_model_output_len_{i}'].tolist()) + avg_output_len = sum(all_output_lens) / len(all_output_lens) if all_output_lens else 0 + logger.info(f" - Average output token length (across all passes): {avg_output_len:.1f}") + if args.output: logger.info(f" - Results saved to: {args.output}") else: From 8c028396a29e19011d3e3a91899deccecb6901e7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 5 Nov 2025 04:11:03 +0000 Subject: [PATCH 065/165] [Automated Commit] Format Codebase --- language/gpt-oss/run_infer.py | 100 ++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 42 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index b20754125b..c1fb84a59c 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -118,14 +118,15 @@ def __init__(self, def send_request( self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: """Send a single request to the OpenAI-compatible server. - + Note: OpenAI API expects text input, so we need to decode the input_ids first. 
""" try: # Decode input_ids to text tokenizer = get_tokenizer() - prompt_text = tokenizer.decode(input_ids, skip_special_tokens=False) - + prompt_text = tokenizer.decode( + input_ids, skip_special_tokens=False) + # Make request to OpenAI-compatible API response = self.client.completions.create( model=self.model, @@ -135,14 +136,15 @@ def send_request( top_p=1.0, # Use top_p instead of top_k for OpenAI logprobs=0, # Request logprobs to get token IDs ) - + # Convert OpenAI response to SGLang-compatible format choice = response.choices[0] output_text = choice.text - + # Tokenize the output to get output_ids - output_ids = tokenizer.encode(output_text, add_special_tokens=False) - + output_ids = tokenizer.encode( + output_text, add_special_tokens=False) + # Build compatible response format result = { "output_ids": output_ids, @@ -153,9 +155,9 @@ def send_request( "total_tokens": response.usage.total_tokens, } } - + return result - + except Exception as e: logger.error(f"OpenAI request failed: {e}") return {"error": str(e)} @@ -234,13 +236,13 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200, client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", pass_k: int = 1): """Send all requests to server in parallel using multiprocessing. - + Args: client_type: Type of client to use ("sglang" or "openai") api_key: API key for OpenAI client (optional, will use env var if not provided) model: Model name for OpenAI client pass_k: Number of inference passes per sample for pass@k strategy - + Returns: tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency """ @@ -279,14 +281,16 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, logger.info( f"Completed {total_requests} requests in {total_time:.2f} seconds") logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") - + # Log latency statistics - valid_latencies = [lat for lat in latencies_by_pass.values() if lat is not None] + valid_latencies = [ + lat for lat in latencies_by_pass.values() if lat is not None] if valid_latencies: avg_latency = sum(valid_latencies) / len(valid_latencies) min_latency = min(valid_latencies) max_latency = max(valid_latencies) - logger.info(f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") + logger.info( + f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") return responses_by_pass, latencies_by_pass @@ -294,12 +298,12 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, def extract_response_ids( responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: """Extract response output_ids from SGLang responses for all passes. 
- + Args: responses_by_pass: Dict mapping (sample_id, pass_num) to response tokenized_df: DataFrame with samples pass_k: Number of passes per sample - + Returns: Dict mapping (sample_id, pass_num) to output_ids list """ @@ -307,7 +311,7 @@ def extract_response_ids( response_ids_by_pass = {} total_responses = len(tokenized_df) * pass_k - + with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: for idx, row in tokenized_df.iterrows(): for pass_num in range(pass_k): @@ -315,7 +319,8 @@ def extract_response_ids( response_id = [] if "error" not in response and "output_ids" in response: try: - # SGLang returns the generated token IDs in the 'output_ids' field + # SGLang returns the generated token IDs in the + # 'output_ids' field response_id = response["output_ids"] except Exception as e: logger.warning( @@ -327,13 +332,14 @@ def extract_response_ids( return response_ids_by_pass -def detokenize_output_ids(response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: +def detokenize_output_ids( + response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: """Detokenize output_ids back to text using AutoTokenizer for all passes. - + Args: response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids pass_k: Number of passes per sample - + Returns: Dict mapping (sample_id, pass_num) to detokenized text """ @@ -357,13 +363,13 @@ def detokenize_output_ids(response_ids_by_pass: Dict[tuple, List[int]], pass_k: return detokenized_texts_by_pass -def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], +def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], response_ids_by_pass: Dict[tuple, List[int]], - detokenized_texts_by_pass: Dict[tuple, str], - latencies_by_pass: Dict[tuple, float], + detokenized_texts_by_pass: Dict[tuple, str], + latencies_by_pass: Dict[tuple, float], tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: """Save all responses to DataFrame and optionally to pickle file. - + Args: responses_by_pass: Dict mapping (sample_id, pass_num) to response response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids @@ -372,7 +378,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: Original DataFrame with samples pass_k: Number of passes per sample output_file: Optional output pickle file - + Returns: DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) 
""" @@ -388,18 +394,18 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], tok_model_outputs = [] tok_model_output_lens = [] infer_times = [] - + for idx in tokenized_df.index: key = (idx, pass_num) detokenized_text = detokenized_texts_by_pass.get(key, "") response_ids = response_ids_by_pass.get(key, []) latency = latencies_by_pass.get(key, None) - + model_outputs.append(detokenized_text) tok_model_outputs.append(response_ids) tok_model_output_lens.append(len(response_ids)) infer_times.append(latency) - + # Add columns with suffixes result_df[f'model_output_{pass_num}'] = model_outputs result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs @@ -414,7 +420,9 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], response = responses_by_pass.get(key, {}) response_ids = response_ids_by_pass.get(key, []) try: - output_token_length = response.get("meta_info", {}).get("completion_tokens", len(response_ids)) + output_token_length = response.get( + "meta_info", {}).get( + "completion_tokens", len(response_ids)) all_output_token_lengths.append(output_token_length) except Exception as e: logger.warning( @@ -422,7 +430,8 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], all_output_token_lengths.append(len(response_ids)) logger.info(f"Updated DataFrame with shape: {result_df.shape}") - new_columns = [f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] + new_columns = [ + f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") if all_output_token_lengths: logger.info( @@ -443,7 +452,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, timeout: int = 1200, client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", pass_k: int = 1) -> pd.DataFrame: """Main processing function that handles requests and response extraction. 
- + Args: pass_k: Number of inference passes per sample for pass@k strategy """ @@ -468,10 +477,12 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, pass_k) # Step 3: Extract response output_ids for all passes - response_ids_by_pass = extract_response_ids(responses_by_pass, tokenized_df, pass_k) + response_ids_by_pass = extract_response_ids( + responses_by_pass, tokenized_df, pass_k) # Step 4: Detokenize output_ids to text for model_output for all passes - detokenized_texts_by_pass = detokenize_output_ids(response_ids_by_pass, pass_k) + detokenized_texts_by_pass = detokenize_output_ids( + response_ids_by_pass, pass_k) # Step 5: Save all results and return DataFrame result_df = save_responses( @@ -519,7 +530,8 @@ def main(): args = parser.parse_args() # Test connection - logger.info(f"Testing server connection to {args.server_url} using {args.client_type} client...") + logger.info( + f"Testing server connection to {args.server_url} using {args.client_type} client...") if args.client_type == "openai": test_client = OpenAIClient( server_url=args.server_url, @@ -534,7 +546,7 @@ def main(): temperature=args.temperature, top_k=args.top_k, timeout=args.timeout) - + test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") @@ -543,7 +555,8 @@ def main(): logger.error( " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") else: - logger.error("Make sure your OpenAI API key is valid and the server URL is correct.") + logger.error( + "Make sure your OpenAI API key is valid and the server URL is correct.") return logger.info("Server connection successful") @@ -570,7 +583,7 @@ def main(): logger.info(f" - Number of passes per sample: {args.pass_k}") logger.info( f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - + # Calculate average output length across all passes if args.pass_k == 1: avg_output_len = result_df['tok_model_output_len_0'].mean() @@ -578,10 +591,13 @@ def main(): else: all_output_lens = [] for i in range(args.pass_k): - all_output_lens.extend(result_df[f'tok_model_output_len_{i}'].tolist()) - avg_output_len = sum(all_output_lens) / len(all_output_lens) if all_output_lens else 0 - logger.info(f" - Average output token length (across all passes): {avg_output_len:.1f}") - + all_output_lens.extend( + result_df[f'tok_model_output_len_{i}'].tolist()) + avg_output_len = sum(all_output_lens) / \ + len(all_output_lens) if all_output_lens else 0 + logger.info( + f" - Average output token length (across all passes): {avg_output_len:.1f}") + if args.output: logger.info(f" - Results saved to: {args.output}") else: From be4109ec2dafff875d55e72cc7776b57d6d32a7c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 4 Nov 2025 23:18:13 -0800 Subject: [PATCH 066/165] restrict lcb to v5 --- language/gpt-oss/golden-datasets/fetch_lcb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/golden-datasets/fetch_lcb.py b/language/gpt-oss/golden-datasets/fetch_lcb.py index 755b33e8b5..a0b3e659aa 100644 --- a/language/gpt-oss/golden-datasets/fetch_lcb.py +++ b/language/gpt-oss/golden-datasets/fetch_lcb.py @@ -11,7 +11,9 @@ parser.add_argument("--lcb_folder", type=str, default="lcb") args = parser.parse_args() -files = glob.glob(os.path.join(args.lcb_folder, "*.jsonl")) +files = ["test.jsonl", "test2.jsonl", "test3.jsonl", "test4.jsonl", "test5.jsonl"] +files = 
[os.path.join(args.lcb_folder, file) for file in files] + all_rows = [] all_columns = set() From c0b9ef3d2aa8287de18939eb18c664d117540435 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 5 Nov 2025 07:19:01 +0000 Subject: [PATCH 067/165] [Automated Commit] Format Codebase --- language/gpt-oss/golden-datasets/fetch_lcb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/golden-datasets/fetch_lcb.py b/language/gpt-oss/golden-datasets/fetch_lcb.py index a0b3e659aa..5b6bf513e3 100644 --- a/language/gpt-oss/golden-datasets/fetch_lcb.py +++ b/language/gpt-oss/golden-datasets/fetch_lcb.py @@ -11,7 +11,12 @@ parser.add_argument("--lcb_folder", type=str, default="lcb") args = parser.parse_args() -files = ["test.jsonl", "test2.jsonl", "test3.jsonl", "test4.jsonl", "test5.jsonl"] +files = [ + "test.jsonl", + "test2.jsonl", + "test3.jsonl", + "test4.jsonl", + "test5.jsonl"] files = [os.path.join(args.lcb_folder, file) for file in files] all_rows = [] From dc54e980e9b5839b26b75b8c8e0c268b0896aaa4 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 5 Nov 2025 09:12:17 +0000 Subject: [PATCH 068/165] lcb optimizations --- language/gpt-oss/eval_accuracy.py | 256 +++++++++++++++++------------- 1 file changed, 148 insertions(+), 108 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 21dd0e2a2c..fa888a33da 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -975,7 +975,8 @@ def process_livecodebench_parallel( extracted_answer_col: str = 'extracted_answer', prompt_accuracy_col: str = 'prompt_accuracy', evaluation_details_col: str = 'evaluation_details', - pass_label: str = '') -> Tuple[int, int]: + pass_label: str = '', + executor: Optional[ProcessPoolExecutor] = None) -> Tuple[int, int]: """Process LiveCodeBench items in parallel. 
Args: @@ -985,6 +986,7 @@ def process_livecodebench_parallel( prompt_accuracy_col: Column name for accuracy results evaluation_details_col: Column name for evaluation details pass_label: Label for logging (e.g., 'pass 0', 'pass 1') + executor: Optional ProcessPoolExecutor to reuse (for performance) Returns: Tuple of (correct_count, total_evaluated) @@ -1007,7 +1009,7 @@ def process_livecodebench_parallel( df[evaluation_details_col] = None # Process in parallel - max_workers = min(multiprocessing.cpu_count(), len(work_items)) + max_workers = min(multiprocessing.cpu_count(), len(work_items), 64) desc = f"Evaluating LiveCodeBench{' ' + pass_label if pass_label else ''}" logger.info( f"Evaluating {len(work_items)} LiveCodeBench items{' ' + pass_label if pass_label else ''} with {max_workers} workers") @@ -1015,7 +1017,13 @@ def process_livecodebench_parallel( correct_count = 0 total_evaluated = 0 - with ProcessPoolExecutor(max_workers=max_workers) as executor: + # Determine whether to create new executor or reuse existing one + should_close_executor = False + if executor is None: + executor = ProcessPoolExecutor(max_workers=max_workers) + should_close_executor = True + + try: future_to_idx = { executor.submit(evaluate_livecodebench_worker, (code, question_id)): idx for idx, code, question_id in work_items @@ -1038,6 +1046,10 @@ def process_livecodebench_parallel( df.at[idx, prompt_accuracy_col] = 0.0 df.at[idx, evaluation_details_col] = f"Error: {e}" total_evaluated += 1 + finally: + # Only close if we created it + if should_close_executor: + executor.shutdown(wait=True) return correct_count, total_evaluated @@ -1290,118 +1302,146 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: df_output['prompt_accuracy'] = 0.0 # Will be max of all passes df_output['evaluation_details'] = None # Will aggregate details - # Process by dataset - for dataset_name, group_indices in tqdm(df_output.groupby('dataset').groups.items(), - desc="Processing datasets"): - evaluator = get_evaluator(dataset_name) - - # Process each pass - for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' - model_output_col = f'model_output{suffix}' - extracted_answer_col = f'extracted_answer{suffix}' - prompt_accuracy_col = f'prompt_accuracy{suffix}' - evaluation_details_col = f'evaluation_details{suffix}' - - logger.info( - f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") - - # Parse answers for all rows in this dataset for this pass - for idx in group_indices: - row = df_output.loc[idx] - raw_output = validate_text_input(row[model_output_col]) - extracted = evaluator['parse'](raw_output) - df_output.at[idx, extracted_answer_col] = extracted - - # Set initial evaluation details for rows without extracted answers - if extracted is None or pd.isna(extracted): - df_output.at[idx, evaluation_details_col] = "No answer extracted from model output" - - # Evaluate answers for this pass - pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' + # Check if we have LiveCodeBench datasets to evaluate + has_livecodebench = any('livecodebench' in str(ds).lower() + for ds in df_output['dataset'].unique()) + + # Pre-load LiveCodeBench benchmark and create shared process pool for all LCB evaluations + lcb_executor = None + if has_livecodebench: + try: + logger.info("Pre-loading LiveCodeBench benchmark for worker processes...") + # Load benchmark in main process before forking - workers will inherit via copy-on-write + _ = load_lcb_benchmark() + logger.info("LiveCodeBench 
benchmark loaded successfully") - if 'livecodebench' in dataset_name.lower(): - correct_count, total_evaluated = process_livecodebench_parallel( - df_output, - group_indices, - extracted_answer_col=extracted_answer_col, - prompt_accuracy_col=prompt_accuracy_col, - evaluation_details_col=evaluation_details_col, - pass_label=pass_label_str - ) - elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - batched across all rows - total_score = 0.0 - total_evaluated = 0 - - # Process all rows with batched grading for this pass - results = evaluate_healthbench_batch( - df_output, - group_indices, - grader_api_key=LLM_JUDGE_API_KEY, - grader_base_url=LLM_JUDGE_BASE_URL, - grader_model=LLM_JUDGE_MODEL, - max_workers=LLM_JUDGE_MAX_WORKERS, - extracted_answer_col=extracted_answer_col, - pass_label=pass_label_str - ) - - # Store results for this pass - for idx, (score, explanation) in results.items(): - # Store score as percentage (0-100) - df_output.at[idx, prompt_accuracy_col] = score * 100.0 - df_output.at[idx, evaluation_details_col] = explanation - total_evaluated += 1 - total_score += score - else: - # Sequential evaluation for other datasets - correct_count = 0 - total_evaluated = 0 + # Create a single process pool for all LCB evaluations + max_workers = multiprocessing.cpu_count() + lcb_executor = ProcessPoolExecutor(max_workers=max_workers) + logger.info(f"Created shared ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") + except Exception as e: + logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}") + logger.warning("Will fall back to per-evaluation loading") + try: + # Process by dataset + for dataset_name, group_indices in tqdm(df_output.groupby('dataset').groups.items(), + desc="Processing datasets"): + evaluator = get_evaluator(dataset_name) + + # Process each pass + for pass_num in range(pass_k): + suffix = f'_{pass_num}' if pass_k > 1 else '' + model_output_col = f'model_output{suffix}' + extracted_answer_col = f'extracted_answer{suffix}' + prompt_accuracy_col = f'prompt_accuracy{suffix}' + evaluation_details_col = f'evaluation_details{suffix}' + + logger.info( + f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") + + # Parse answers for all rows in this dataset for this pass for idx in group_indices: row = df_output.loc[idx] - extracted = row[extracted_answer_col] - ground_truth = row.get('ground_truth') + raw_output = validate_text_input(row[model_output_col]) + extracted = evaluator['parse'](raw_output) + df_output.at[idx, extracted_answer_col] = extracted + + # Set initial evaluation details for rows without extracted answers + if extracted is None or pd.isna(extracted): + df_output.at[idx, evaluation_details_col] = "No answer extracted from model output" - if extracted is not None and not pd.isna(ground_truth): - is_correct = evaluator['evaluate'](extracted, ground_truth) - df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + # Evaluate answers for this pass + pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' + + if 'livecodebench' in dataset_name.lower(): + correct_count, total_evaluated = process_livecodebench_parallel( + df_output, + group_indices, + extracted_answer_col=extracted_answer_col, + prompt_accuracy_col=prompt_accuracy_col, + evaluation_details_col=evaluation_details_col, + pass_label=pass_label_str, + executor=lcb_executor # Reuse shared executor + ) + elif 'healthbench' in dataset_name.lower(): + # HealthBench evaluation with LLM grading - 
batched across all rows + total_score = 0.0 + total_evaluated = 0 + + # Process all rows with batched grading for this pass + results = evaluate_healthbench_batch( + df_output, + group_indices, + grader_api_key=LLM_JUDGE_API_KEY, + grader_base_url=LLM_JUDGE_BASE_URL, + grader_model=LLM_JUDGE_MODEL, + max_workers=LLM_JUDGE_MAX_WORKERS, + extracted_answer_col=extracted_answer_col, + pass_label=pass_label_str + ) + + # Store results for this pass + for idx, (score, explanation) in results.items(): + # Store score as percentage (0-100) + df_output.at[idx, prompt_accuracy_col] = score * 100.0 + df_output.at[idx, evaluation_details_col] = explanation total_evaluated += 1 - if is_correct: - correct_count += 1 - - # Log results for this pass - if total_evaluated > 0: - if 'healthbench' in dataset_name.lower(): - # For HealthBench, report average score - avg_score = total_score / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + total_score += score else: - # For other datasets, report accuracy - accuracy = correct_count / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") - - # Aggregate results across all passes (take max) - if pass_k > 1: - logger.info(f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") - for idx in group_indices: - # Get all accuracy values for this row - accuracies = [] - for pass_num in range(pass_k): - acc = df_output.at[idx, f'prompt_accuracy_{pass_num}'] - accuracies.append(acc if not pd.isna(acc) else 0.0) - - # Set aggregated accuracy as max - max_accuracy = max(accuracies) - df_output.at[idx, 'prompt_accuracy'] = max_accuracy - - # Find which pass achieved max accuracy - max_pass = accuracies.index(max_accuracy) - df_output.at[idx, 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" - - return df_output + # Sequential evaluation for other datasets + correct_count = 0 + total_evaluated = 0 + + for idx in group_indices: + row = df_output.loc[idx] + extracted = row[extracted_answer_col] + ground_truth = row.get('ground_truth') + + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate'](extracted, ground_truth) + df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + total_evaluated += 1 + if is_correct: + correct_count += 1 + + # Log results for this pass + if total_evaluated > 0: + if 'healthbench' in dataset_name.lower(): + # For HealthBench, report average score + avg_score = total_score / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + else: + # For other datasets, report accuracy + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + + # Aggregate results across all passes (take max) + if pass_k > 1: + logger.info(f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") + for idx in group_indices: + # Get all accuracy values for this row + accuracies = [] + for pass_num in range(pass_k): + acc = df_output.at[idx, f'prompt_accuracy_{pass_num}'] + accuracies.append(acc if not pd.isna(acc) else 0.0) + + # Set aggregated accuracy as max + max_accuracy = max(accuracies) + df_output.at[idx, 'prompt_accuracy'] = max_accuracy + + # Find 
which pass achieved max accuracy + max_pass = accuracies.index(max_accuracy) + df_output.at[idx, 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" + + return df_output + finally: + # Clean up shared LiveCodeBench executor + if lcb_executor is not None: + logger.info("Shutting down shared LiveCodeBench ProcessPoolExecutor") + lcb_executor.shutdown(wait=True) # ============================================================================= From 41cac5adfff9e983cfd47db62120af8e4ae87e7b Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 5 Nov 2025 09:14:27 +0000 Subject: [PATCH 069/165] remove openai client --- language/gpt-oss/harmonize_inputs.py | 4 - language/gpt-oss/run_infer.py | 168 +++++---------------------- 2 files changed, 28 insertions(+), 144 deletions(-) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index 4118368f74..e2e3f198c3 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -479,10 +479,6 @@ def process_row(args): successful_count = 0 error_count = 0 - # Set templated text input to None (if it exists) - if 'templated_text_input' in df.columns: - df['templated_text_input'] = None - # Initialize columns for harmony tokenized input df['tok_input'] = None df['tok_input_len'] = None diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index c1fb84a59c..823a22e258 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -1,21 +1,18 @@ #!/usr/bin/env python3 """ -Script to send pre-tokenized requests to inference server (SGLang or OpenAI). +Script to send pre-tokenized requests to SGLang server. Usage: python run_infer.py --input-tokens tokenized_data.pkl [options] Arguments: --input-tokens Path to pickle file containing pre-tokenized data from harmony-tokens.py - --server-url Server URL (default: http://localhost:30000) + --server-url SGLang server URL (default: http://localhost:30000) --max-samples Maximum number of samples to process (default: all) --max-tokens Maximum tokens to generate per request (default: 100) --max-concurrency Maximum number of concurrent requests (default: 256) --output Output pickle file for responses (optional) --pass-k Number of inference passes per sample for pass@k strategy (default: 1) - --client-type Client type: sglang or openai (default: sglang) - --api-key API key for OpenAI client (optional) - --model Model name for OpenAI client (default: gpt-4) """ import requests @@ -28,7 +25,6 @@ import pandas as pd from tqdm import tqdm from transformers import AutoTokenizer -from openai import OpenAI # Set up logging logging.basicConfig( @@ -94,75 +90,6 @@ def send_request( return {"error": str(e)} -class OpenAIClient: - def __init__(self, - server_url: str = "https://api.openai.com/v1", - temperature: float = 0.001, - top_k: int = 1, - timeout: int = 1200, - api_key: str = None, - model: str = "gpt-4" - ): - self.base_url = server_url - self.temperature = temperature - self.top_k = top_k - self.timeout = timeout - self.model = model - # Initialize OpenAI client - self.client = OpenAI( - api_key=api_key, - base_url=server_url, - timeout=timeout - ) - - def send_request( - self, input_ids: List[int], max_tokens: int = 100) -> Dict[str, Any]: - """Send a single request to the OpenAI-compatible server. - - Note: OpenAI API expects text input, so we need to decode the input_ids first. 
- """ - try: - # Decode input_ids to text - tokenizer = get_tokenizer() - prompt_text = tokenizer.decode( - input_ids, skip_special_tokens=False) - - # Make request to OpenAI-compatible API - response = self.client.completions.create( - model=self.model, - prompt=prompt_text, - max_tokens=max_tokens, - temperature=self.temperature, - top_p=1.0, # Use top_p instead of top_k for OpenAI - logprobs=0, # Request logprobs to get token IDs - ) - - # Convert OpenAI response to SGLang-compatible format - choice = response.choices[0] - output_text = choice.text - - # Tokenize the output to get output_ids - output_ids = tokenizer.encode( - output_text, add_special_tokens=False) - - # Build compatible response format - result = { - "output_ids": output_ids, - "text": output_text, - "meta_info": { - "completion_tokens": response.usage.completion_tokens, - "prompt_tokens": response.usage.prompt_tokens, - "total_tokens": response.usage.total_tokens, - } - } - - return result - - except Exception as e: - logger.error(f"OpenAI request failed: {e}") - return {"error": str(e)} - - def load_tokenized_data(data_file: str) -> pd.DataFrame: """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" logger.info(f"Loading tokenized data from {data_file}") @@ -201,23 +128,14 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" - input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, timeout, client_type, api_key, model = args_tuple - - # Create a new client for this process based on client_type - if client_type == "openai": - client = OpenAIClient( - server_url=server_url, - temperature=temperature, - top_k=top_k, - timeout=timeout, - api_key=api_key, - model=model) - else: # default to sglang - client = SGLangClient( - server_url=server_url, - temperature=temperature, - top_k=top_k, - timeout=timeout) + input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, timeout = args_tuple + + # Create a new client for this process + client = SGLangClient( + server_url=server_url, + temperature=temperature, + top_k=top_k, + timeout=timeout) try: # Track latency: time from request sent to response received @@ -234,13 +152,10 @@ def send_single_request(args_tuple): def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200, - client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", pass_k: int = 1): - """Send all requests to server in parallel using multiprocessing. - + pass_k: int = 1): + """Send all requests to SGLang server in parallel using multiprocessing. 
+ Args: - client_type: Type of client to use ("sglang" or "openai") - api_key: API key for OpenAI client (optional, will use env var if not provided) - model: Model name for OpenAI client pass_k: Number of inference passes per sample for pass@k strategy Returns: @@ -249,7 +164,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, num_samples = len(tokenized_df) total_requests = num_samples * pass_k logger.info( - f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers using {client_type} client...") + f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers...") # Prepare arguments for multiprocessing - create pass_k requests per sample args_list = [] @@ -257,7 +172,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, for pass_num in range(pass_k): args_list.append(( row['tok_input'], max_tokens, server_url, - idx, pass_num, temperature, top_k, timeout, client_type, api_key, model + idx, pass_num, temperature, top_k, timeout )) start_time = time.time() @@ -449,8 +364,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = None, max_tokens: int = 100, max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, - timeout: int = 1200, client_type: str = "sglang", api_key: str = None, model: str = "gpt-4", - pass_k: int = 1) -> pd.DataFrame: + timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: """Main processing function that handles requests and response extraction. Args: @@ -471,9 +385,6 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, temperature, top_k, timeout, - client_type, - api_key, - model, pass_k) # Step 3: Extract response output_ids for all passes @@ -499,11 +410,11 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, def main(): parser = argparse.ArgumentParser( - description="Send pre-tokenized requests to inference server (SGLang or OpenAI)") + description="Send pre-tokenized requests to SGLang server") parser.add_argument("--input-tokens", required=True, help="Path to pickle file containing pre-tokenized data from harmony-tokens.py") parser.add_argument("--server-url", default="http://localhost:30000", - help="Server URL (default: http://localhost:30000 for SGLang, https://api.openai.com/v1 for OpenAI)") + help="SGLang server URL (default: http://localhost:30000)") parser.add_argument("--max-samples", type=int, default=None, help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, @@ -520,43 +431,23 @@ def main(): help="Top-k for sampling (default: 1)") parser.add_argument("--timeout", type=int, default=1200, help="Timeout for requests (default: 1200)") - parser.add_argument("--client-type", choices=["sglang", "openai"], default="sglang", - help="Client type to use: sglang or openai (default: sglang)") - parser.add_argument("--api-key", default=None, - help="API key for OpenAI client (optional, will use OPENAI_API_KEY env var if not provided)") - parser.add_argument("--model", default="gpt-4", - help="Model name for OpenAI client (default: gpt-4)") args = parser.parse_args() # Test connection - logger.info( - f"Testing server connection to {args.server_url} using {args.client_type} client...") - if args.client_type == "openai": - test_client = 
OpenAIClient( - server_url=args.server_url, - temperature=args.temperature, - top_k=args.top_k, - timeout=args.timeout, - api_key=args.api_key, - model=args.model) - else: - test_client = SGLangClient( - server_url=args.server_url, - temperature=args.temperature, - top_k=args.top_k, - timeout=args.timeout) - + logger.info(f"Testing server connection to {args.server_url}...") + test_client = SGLangClient( + server_url=args.server_url, + temperature=args.temperature, + top_k=args.top_k, + timeout=args.timeout) + test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") - if args.client_type == "sglang": - logger.error("Make sure your SGLang server is running. Try:") - logger.error( - " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") - else: - logger.error( - "Make sure your OpenAI API key is valid and the server URL is correct.") + logger.error("Make sure your SGLang server is running. Try:") + logger.error( + " python -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.98 --tp 8") return logger.info("Server connection successful") @@ -572,9 +463,6 @@ def main(): temperature=args.temperature, top_k=args.top_k, timeout=args.timeout, - client_type=args.client_type, - api_key=args.api_key, - model=args.model, pass_k=args.pass_k) # Print summary From 06a5387ddb14511293a1900a57bf85bcbec73eb9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 5 Nov 2025 09:14:59 +0000 Subject: [PATCH 070/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 116 ++++++++++++++++----------- language/gpt-oss/harmonize_inputs.py | 8 +- language/gpt-oss/run_infer.py | 4 +- 3 files changed, 75 insertions(+), 53 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index fa888a33da..66b10ef919 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -61,7 +61,7 @@ def detect_pass_k(df: pd.DataFrame) -> int: """Detect if DataFrame has pass@k format and return k. 
- + Returns: Number of passes (k) if pass@k format detected, otherwise 1 """ @@ -69,11 +69,11 @@ def detect_pass_k(df: pd.DataFrame) -> int: pass_k = 0 while f'model_output_{pass_k}' in df.columns: pass_k += 1 - + # If no _0 suffix found, check for single model_output column if pass_k == 0 and 'model_output' in df.columns: return 1 - + return pass_k @@ -84,21 +84,23 @@ def validate_dataframe(df: pd.DataFrame) -> None: # Detect pass@k format pass_k = detect_pass_k(df) - + if pass_k == 0: - raise ValueError("No model_output columns found (expected 'model_output' or 'model_output_0', 'model_output_1', etc.)") - + raise ValueError( + "No model_output columns found (expected 'model_output' or 'model_output_0', 'model_output_1', etc.)") + # Check for dataset column if 'dataset' not in df.columns: raise ValueError("Missing required column: 'dataset'") - + # Check for tok_model_output_len (either single or with suffixes) has_tok_len = False if pass_k == 1: has_tok_len = 'tok_model_output_len' in df.columns else: - has_tok_len = all(f'tok_model_output_len_{i}' in df.columns for i in range(pass_k)) - + has_tok_len = all( + f'tok_model_output_len_{i}' in df.columns for i in range(pass_k)) + if not has_tok_len: raise ValueError("Missing required tok_model_output_len column(s)") @@ -970,7 +972,7 @@ def process_row(row: pd.Series) -> Dict[str, Any]: def process_livecodebench_parallel( - df: pd.DataFrame, + df: pd.DataFrame, group_indices: pd.Index, extracted_answer_col: str = 'extracted_answer', prompt_accuracy_col: str = 'prompt_accuracy', @@ -978,7 +980,7 @@ def process_livecodebench_parallel( pass_label: str = '', executor: Optional[ProcessPoolExecutor] = None) -> Tuple[int, int]: """Process LiveCodeBench items in parallel. - + Args: df: DataFrame with data group_indices: Indices to process @@ -987,7 +989,7 @@ def process_livecodebench_parallel( evaluation_details_col: Column name for evaluation details pass_label: Label for logging (e.g., 'pass 0', 'pass 1') executor: Optional ProcessPoolExecutor to reuse (for performance) - + Returns: Tuple of (correct_count, total_evaluated) """ @@ -1042,7 +1044,8 @@ def process_livecodebench_parallel( if is_correct: correct_count += 1 except Exception as e: - logger.error(f"Error evaluating row {idx}{' ' + pass_label if pass_label else ''}: {e}") + logger.error( + f"Error evaluating row {idx}{' ' + pass_label if pass_label else ''}: {e}") df.at[idx, prompt_accuracy_col] = 0.0 df.at[idx, evaluation_details_col] = f"Error: {e}" total_evaluated += 1 @@ -1163,7 +1166,8 @@ def evaluate_healthbench_batch( task_id += 1 if not all_tasks: - logger.warning(f"No grading tasks to process{' for ' + pass_label if pass_label else ''}") + logger.warning( + f"No grading tasks to process{' for ' + pass_label if pass_label else ''}") return {} logger.info( @@ -1276,7 +1280,7 @@ def _grade_single_task(task): def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: """Process entire dataframe with optimized batch processing. - + Supports both single-pass and pass@k formats: - Single-pass: model_output -> extracted_answer, prompt_accuracy - Pass@k: model_output_0, model_output_1, ... -> extracted_answer_0, prompt_accuracy_0, ... 
@@ -1285,40 +1289,44 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: validate_dataframe(df) df_output = df.copy() - + # Detect pass@k pass_k = detect_pass_k(df) logger.info(f"Detected pass@k format with k={pass_k}") - + # Initialize columns for each pass for pass_num in range(pass_k): suffix = f'_{pass_num}' if pass_k > 1 else '' df_output[f'extracted_answer{suffix}'] = None df_output[f'prompt_accuracy{suffix}'] = 0.0 df_output[f'evaluation_details{suffix}'] = None - + # Add aggregated columns for pass@k if pass_k > 1: df_output['prompt_accuracy'] = 0.0 # Will be max of all passes df_output['evaluation_details'] = None # Will aggregate details # Check if we have LiveCodeBench datasets to evaluate - has_livecodebench = any('livecodebench' in str(ds).lower() - for ds in df_output['dataset'].unique()) - - # Pre-load LiveCodeBench benchmark and create shared process pool for all LCB evaluations + has_livecodebench = any('livecodebench' in str(ds).lower() + for ds in df_output['dataset'].unique()) + + # Pre-load LiveCodeBench benchmark and create shared process pool for all + # LCB evaluations lcb_executor = None if has_livecodebench: try: - logger.info("Pre-loading LiveCodeBench benchmark for worker processes...") - # Load benchmark in main process before forking - workers will inherit via copy-on-write + logger.info( + "Pre-loading LiveCodeBench benchmark for worker processes...") + # Load benchmark in main process before forking - workers will + # inherit via copy-on-write _ = load_lcb_benchmark() logger.info("LiveCodeBench benchmark loaded successfully") - + # Create a single process pool for all LCB evaluations max_workers = multiprocessing.cpu_count() lcb_executor = ProcessPoolExecutor(max_workers=max_workers) - logger.info(f"Created shared ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") + logger.info( + f"Created shared ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") except Exception as e: logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}") logger.warning("Will fall back to per-evaluation loading") @@ -1336,10 +1344,10 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: extracted_answer_col = f'extracted_answer{suffix}' prompt_accuracy_col = f'prompt_accuracy{suffix}' evaluation_details_col = f'evaluation_details{suffix}' - + logger.info( f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") - + # Parse answers for all rows in this dataset for this pass for idx in group_indices: row = df_output.loc[idx] @@ -1347,16 +1355,18 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: extracted = evaluator['parse'](raw_output) df_output.at[idx, extracted_answer_col] = extracted - # Set initial evaluation details for rows without extracted answers + # Set initial evaluation details for rows without extracted + # answers if extracted is None or pd.isna(extracted): - df_output.at[idx, evaluation_details_col] = "No answer extracted from model output" + df_output.at[idx, + evaluation_details_col] = "No answer extracted from model output" # Evaluate answers for this pass pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' - + if 'livecodebench' in dataset_name.lower(): correct_count, total_evaluated = process_livecodebench_parallel( - df_output, + df_output, group_indices, extracted_answer_col=extracted_answer_col, prompt_accuracy_col=prompt_accuracy_col, @@ -1365,7 +1375,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: executor=lcb_executor # Reuse shared executor ) 
elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - batched across all rows + # HealthBench evaluation with LLM grading - batched across + # all rows total_score = 0.0 total_evaluated = 0 @@ -1399,8 +1410,10 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: ground_truth = row.get('ground_truth') if extracted is not None and not pd.isna(ground_truth): - is_correct = evaluator['evaluate'](extracted, ground_truth) - df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + is_correct = evaluator['evaluate']( + extracted, ground_truth) + df_output.at[idx, + prompt_accuracy_col] = 100.0 if is_correct else 0.0 total_evaluated += 1 if is_correct: correct_count += 1 @@ -1417,30 +1430,33 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: accuracy = correct_count / total_evaluated * 100 logger.info( f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") - + # Aggregate results across all passes (take max) if pass_k > 1: - logger.info(f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") + logger.info( + f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") for idx in group_indices: # Get all accuracy values for this row accuracies = [] for pass_num in range(pass_k): acc = df_output.at[idx, f'prompt_accuracy_{pass_num}'] accuracies.append(acc if not pd.isna(acc) else 0.0) - + # Set aggregated accuracy as max max_accuracy = max(accuracies) df_output.at[idx, 'prompt_accuracy'] = max_accuracy - + # Find which pass achieved max accuracy max_pass = accuracies.index(max_accuracy) - df_output.at[idx, 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" + df_output.at[idx, + 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" return df_output finally: # Clean up shared LiveCodeBench executor if lcb_executor is not None: - logger.info("Shutting down shared LiveCodeBench ProcessPoolExecutor") + logger.info( + "Shutting down shared LiveCodeBench ProcessPoolExecutor") lcb_executor.shutdown(wait=True) @@ -1464,26 +1480,30 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, # Detect pass@k pass_k = detect_pass_k(df_evaluated) - + # Calculate statistics if pass_k > 1: # For pass@k, use the aggregated prompt_accuracy (max across passes) - evaluated = df_evaluated['extracted_answer_0'].notna().sum() # Count from first pass + # Count from first pass + evaluated = df_evaluated['extracted_answer_0'].notna().sum() correct = (df_evaluated['prompt_accuracy'] > 0).sum() accuracy = df_evaluated['prompt_accuracy'].mean() - + # Calculate average token length across all passes all_output_lens = [] for i in range(pass_k): - all_output_lens.extend(df_evaluated[f'tok_model_output_len_{i}'].tolist()) - mean_output_len = float(sum(all_output_lens) / len(all_output_lens)) if all_output_lens else 0.0 + all_output_lens.extend( + df_evaluated[f'tok_model_output_len_{i}'].tolist()) + mean_output_len = float( + sum(all_output_lens) / + len(all_output_lens)) if all_output_lens else 0.0 else: # Single pass format suffix = '' if 'extracted_answer' in df_evaluated.columns else '_0' evaluated = df_evaluated[f'extracted_answer{suffix}'].notna().sum() correct = (df_evaluated[f'prompt_accuracy{suffix}'] > 0).sum() accuracy = df_evaluated[f'prompt_accuracy{suffix}'].mean() - + # tok_model_output_len is now a required column tok_len_col = 'tok_model_output_len' if 'tok_model_output_len' in df_evaluated.columns else 
'tok_model_output_len_0' mean_output_len = float(df_evaluated[tok_len_col].mean()) @@ -1508,7 +1528,7 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } - + if pass_k > 1: results['pass_k'] = pass_k # Also report individual pass accuracies diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index e2e3f198c3..d433d731c4 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -305,7 +305,8 @@ def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): return _finalize_conversation(messages, None) -def create_arxiv_summarization_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): +def create_arxiv_summarization_prompt( + user_query, reasoning_effort=ReasoningEffort.HIGH): """ Creates a zero-shot prompt for arXiv paper summarization using Harmony format. @@ -353,7 +354,7 @@ def process_row(args): tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) """ index, row, dataset_function_map, reasoning_effort = args - + # Check if dataset column exists, use default if not if "dataset" in row: dataset_name = row["dataset"] @@ -416,7 +417,8 @@ def process_row(args): # Filter by dataset if specified if args.dataset is not None: if 'dataset' not in df.columns: - print(f"WARNING: No 'dataset' column found in dataframe. Cannot filter by dataset.") + print( + f"WARNING: No 'dataset' column found in dataframe. Cannot filter by dataset.") print(f"All rows will be processed using the default prompt function.") else: original_len = len(df) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index 823a22e258..f30205f99e 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -154,7 +154,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200, pass_k: int = 1): """Send all requests to SGLang server in parallel using multiprocessing. 
- + Args: pass_k: Number of inference passes per sample for pass@k strategy @@ -441,7 +441,7 @@ def main(): temperature=args.temperature, top_k=args.top_k, timeout=args.timeout) - + test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") From d927fb396bcab0f9459d76604e2567bcd11abd59 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 11:41:58 -0800 Subject: [PATCH 071/165] rename --- language/gpt-oss/{golden-datasets => data}/fetch_aime.py | 0 language/gpt-oss/{golden-datasets => data}/fetch_gpqa.py | 0 language/gpt-oss/{golden-datasets => data}/fetch_healthbench.py | 0 language/gpt-oss/{golden-datasets => data}/fetch_lcb.py | 0 language/gpt-oss/{golden-datasets => data}/fetch_mmlu.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{golden-datasets => data}/fetch_aime.py (100%) rename language/gpt-oss/{golden-datasets => data}/fetch_gpqa.py (100%) rename language/gpt-oss/{golden-datasets => data}/fetch_healthbench.py (100%) rename language/gpt-oss/{golden-datasets => data}/fetch_lcb.py (100%) rename language/gpt-oss/{golden-datasets => data}/fetch_mmlu.py (100%) diff --git a/language/gpt-oss/golden-datasets/fetch_aime.py b/language/gpt-oss/data/fetch_aime.py similarity index 100% rename from language/gpt-oss/golden-datasets/fetch_aime.py rename to language/gpt-oss/data/fetch_aime.py diff --git a/language/gpt-oss/golden-datasets/fetch_gpqa.py b/language/gpt-oss/data/fetch_gpqa.py similarity index 100% rename from language/gpt-oss/golden-datasets/fetch_gpqa.py rename to language/gpt-oss/data/fetch_gpqa.py diff --git a/language/gpt-oss/golden-datasets/fetch_healthbench.py b/language/gpt-oss/data/fetch_healthbench.py similarity index 100% rename from language/gpt-oss/golden-datasets/fetch_healthbench.py rename to language/gpt-oss/data/fetch_healthbench.py diff --git a/language/gpt-oss/golden-datasets/fetch_lcb.py b/language/gpt-oss/data/fetch_lcb.py similarity index 100% rename from language/gpt-oss/golden-datasets/fetch_lcb.py rename to language/gpt-oss/data/fetch_lcb.py diff --git a/language/gpt-oss/golden-datasets/fetch_mmlu.py b/language/gpt-oss/data/fetch_mmlu.py similarity index 100% rename from language/gpt-oss/golden-datasets/fetch_mmlu.py rename to language/gpt-oss/data/fetch_mmlu.py From c2fda5e3057993afbdcf60f4aeb01746ceb41acf Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 11:43:38 -0800 Subject: [PATCH 072/165] remove mmlu, healthbench --- language/gpt-oss/data/fetch_healthbench.py | 39 ---------------------- language/gpt-oss/data/fetch_mmlu.py | 28 ---------------- 2 files changed, 67 deletions(-) delete mode 100644 language/gpt-oss/data/fetch_healthbench.py delete mode 100644 language/gpt-oss/data/fetch_mmlu.py diff --git a/language/gpt-oss/data/fetch_healthbench.py b/language/gpt-oss/data/fetch_healthbench.py deleted file mode 100644 index 861817ae6f..0000000000 --- a/language/gpt-oss/data/fetch_healthbench.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import glob -import json -import pandas as pd -import argparse -import os - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset_folder", type=str, default="dataset") -parser.add_argument("--healthbench_folder", type=str, default="healthbench") -args = parser.parse_args() - -# Find all jsonl files (update the path and pattern to match actual data) -files = glob.glob(os.path.join(args.healthbench_folder, "*.jsonl")) - -all_rows = [] 
-all_columns = set() - -# First pass: gather all columns -for file in files: - with open(file, "r", encoding="utf-8") as f: - for line in f: - row = json.loads(line) - all_columns.update(row.keys()) - -all_columns = list(all_columns) - -# Second pass: load rows, filling missing keys with None -for file in files: - with open(file, "r", encoding="utf-8") as f: - for line in f: - row = json.loads(line) - row_filled = {col: row.get(col, None) for col in all_columns} - all_rows.append(row_filled) - -# Create DataFrame -df = pd.DataFrame(all_rows, columns=all_columns) -df['dataset'] = 'healthbench' -df.to_pickle(os.path.join(args.dataset_folder, 'healthbench.pkl')) diff --git a/language/gpt-oss/data/fetch_mmlu.py b/language/gpt-oss/data/fetch_mmlu.py deleted file mode 100644 index 30d3d32132..0000000000 --- a/language/gpt-oss/data/fetch_mmlu.py +++ /dev/null @@ -1,28 +0,0 @@ -import pandas as pd -from datasets import load_dataset -import argparse -import os - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset_folder", type=str, default="dataset") -args = parser.parse_args() - -ds = load_dataset("cais/mmlu", "all") -df = ds['auxiliary_train'].to_pandas() - -for col in ['subject']: - df.drop(col, axis=1, inplace=True) - -df.rename(columns={'question': 'base_question'}, inplace=True) -df['dataset'] = 'mmlu' - -for row in df.itertuples(): - base_question = row.base_question - options = row.choices - question = f"{base_question}" - for idx, option in enumerate(options): - question += f"\n{chr(65+idx)}) {option}" - df.loc[row.Index, 'question'] = question - df.loc[row.Index, 'ground_truth'] = f"{chr(65+row.answer)}" - -df.to_pickle(os.path.join(args.dataset_folder, 'mmlu.pkl')) From c5c389b86c0bf8d08b9a501bce0f93dfe13f7d0c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 20:59:28 +0000 Subject: [PATCH 073/165] add fetch_all.py --- language/gpt-oss/data/fetch_all.py | 178 +++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 language/gpt-oss/data/fetch_all.py diff --git a/language/gpt-oss/data/fetch_all.py b/language/gpt-oss/data/fetch_all.py new file mode 100644 index 0000000000..1e96253ba1 --- /dev/null +++ b/language/gpt-oss/data/fetch_all.py @@ -0,0 +1,178 @@ +from datasets import load_dataset +import pandas as pd +import random +import argparse +import os +import json + +parser = argparse.ArgumentParser(description="Fetch and combine AIME, GPQA, and LiveCodeBench datasets") +parser.add_argument("--output_path", type=str, default="./combined_dataset.pkl", help="Full path to output pickle file") +parser.add_argument("--lcb_folder", type=str, default="lcb", help="Folder containing LiveCodeBench repo cloned from HuggingFace") +args = parser.parse_args() + +# Ensure output folder exists +output_dir = os.path.dirname(args.output_path) +if output_dir: + os.makedirs(output_dir, exist_ok=True) + +random.seed(42) + +print("=" * 80) +print("Fetching datasets...") +print("=" * 80) + +# ============================================================================ +# 1. 
FETCH AIME DATASET +# ============================================================================ +print("\n[1/3] Fetching AIME dataset...") +df_aime = load_dataset("di-zhang-fdu/AIME_1983_2024")['train'].to_pandas() + +# Optional: AIME 2025 datasets +# df_1_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() +# df_2_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() +# df_aime2025 = pd.concat([df_1_aime2025, df_2_aime2025], ignore_index=True) +# df_aime2025.rename(columns={'answer': 'ground_truth'}, inplace=True) + +df_aime.rename( + columns={ + 'Answer': 'ground_truth', + 'Question': 'question'}, + inplace=True) +df_aime.drop(columns=['Year', 'ID', 'Problem Number', 'Part'], inplace=True) +df_aime['dataset'] = 'aime1983' + +print(f" ✓ AIME dataset loaded: {len(df_aime)} samples") + +# ============================================================================ +# 2. FETCH GPQA DATASET +# ============================================================================ +print("\n[2/3] Fetching GPQA dataset...") + +# Note: Login using `huggingface-cli login` to access this dataset if needed +ds_diamond = load_dataset("Idavidrein/gpqa", "gpqa_diamond") + +# Optional: Other GPQA variants +# ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") +# ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") +# ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") + +df_gpqa = ds_diamond['train'].to_pandas() +# df_experts = ds_experts['train'].to_pandas() +# df_main = ds_main['train'].to_pandas() +# df_extended = ds_extended['train'].to_pandas() + +# df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) +df_gpqa = df_gpqa[['Question', + 'Correct Answer', + 'High-level domain', + 'Incorrect Answer 1', + 'Incorrect Answer 2', + 'Incorrect Answer 3']].copy() + +# Format questions with multiple choice options +for idx, row in df_gpqa.iterrows(): + options = [str(row[col]) for col in ['Incorrect Answer 1', + 'Incorrect Answer 2', 'Incorrect Answer 3']] + options.append(str(row['Correct Answer'])) + random.shuffle(options) + answer_idx = options.index(str(row['Correct Answer'])) + + options = [option.strip() for option in options] + answer = chr(65 + answer_idx) + + question = f"{row['Question']}\nA) {options[0]}\nB) {options[1]}\nC) {options[2]}\nD) {options[3]}" + df_gpqa.loc[idx, 'Question'] = question + df_gpqa.loc[idx, 'ground_truth'] = answer + +df_gpqa.rename( + columns={ + 'High-level domain': 'domain', + 'Question': 'question'}, + inplace=True) +df_gpqa['dataset'] = 'gpqa' + +print(f" ✓ GPQA dataset loaded: {len(df_gpqa)} samples") + +# ============================================================================ +# 3. 
FETCH LIVECODEBENCH DATASET +# ============================================================================ +print("\n[3/3] Fetching LiveCodeBench dataset...") +files = [ + "test.jsonl", + "test2.jsonl", + "test3.jsonl", + "test4.jsonl", + "test5.jsonl"] +files = [os.path.join(args.lcb_folder, file) for file in files] + +all_rows = [] +all_columns = set() + +for file in files: + if not os.path.exists(file): + raise FileNotFoundError(f"Error: File not found: {file}") + + with open(file, "r", encoding="utf-8") as f: + for line in f: + row = json.loads(line) + all_columns.update(row.keys()) + all_rows.append(row) + +all_columns = list(all_columns) + +df_lcb = pd.DataFrame(all_rows, columns=all_columns) +df_lcb['dataset'] = 'livecodebench' +df_lcb.drop( + columns=[ + 'private_test_cases', + 'metadata', + 'public_test_cases', + 'contest_id', + 'platform', + 'difficulty', + 'contest_date', + 'question_title'], + inplace=True, + errors='ignore') # Use errors='ignore' in case some columns don't exist + +starter_prompt = """ +### Format: You will use the following starter code to write the solution to the problem and enclose your code within delimiters. +```python +<> +``` +""" + +for idx, row in df_lcb.iterrows(): + starter_code = row['starter_code'] + starter_prompt_filled = starter_prompt.replace( + '<>', starter_code) + df_lcb.loc[idx, 'question'] = df_lcb.loc[idx, + 'question_content'] + starter_prompt_filled + +df_lcb.rename(columns={'question_id': 'ground_truth'}, inplace=True) + +print(f" ✓ LiveCodeBench dataset loaded: {len(df_lcb)} samples") + +# ============================================================================ +# 4. COMBINE ALL DATASETS +# ============================================================================ +print("\n" + "=" * 80) +print("Combining datasets...") +print("=" * 80) + +# Combine all dataframes +df_combined = pd.concat([df_aime, df_gpqa, df_lcb], ignore_index=True) + +print(f"\nCombined dataset statistics:") +print(f" • Total samples: {len(df_combined)}") +print(f" • AIME samples: {len(df_aime)}") +print(f" • GPQA samples: {len(df_gpqa)}") +print(f" • LiveCodeBench samples: {len(df_lcb)}") +print(f"\nDataset columns: {list(df_combined.columns)}") + +# Save combined dataset +df_combined.to_pickle(args.output_path) + +print(f"\n✓ Combined dataset saved to: {args.output_path}") +print("=" * 80) + From 341c750cb03d56d4210a5d2db2c028af894d649c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 21:10:39 +0000 Subject: [PATCH 074/165] updates --- language/gpt-oss/data/fetch_aime.py | 29 ------------- language/gpt-oss/data/fetch_gpqa.py | 55 ------------------------- language/gpt-oss/data/fetch_lcb.py | 63 ----------------------------- language/gpt-oss/setup_enroot.sh | 6 +-- 4 files changed, 3 insertions(+), 150 deletions(-) delete mode 100644 language/gpt-oss/data/fetch_aime.py delete mode 100644 language/gpt-oss/data/fetch_gpqa.py delete mode 100644 language/gpt-oss/data/fetch_lcb.py diff --git a/language/gpt-oss/data/fetch_aime.py b/language/gpt-oss/data/fetch_aime.py deleted file mode 100644 index 757a0e303a..0000000000 --- a/language/gpt-oss/data/fetch_aime.py +++ /dev/null @@ -1,29 +0,0 @@ -from datasets import load_dataset -import pandas as pd -import argparse -import os - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset_folder", type=str, default="dataset") -args = parser.parse_args() - -df = load_dataset("di-zhang-fdu/AIME_1983_2024")['train'].to_pandas() - -df_1_aime2025 = load_dataset( - "opencompass/AIME2025", - 
"AIME2025-I")['test'].to_pandas() -df_2_aime2025 = load_dataset( - "opencompass/AIME2025", - "AIME2025-II")['test'].to_pandas() -# df_aime2025 = pd.concat([df_1_aime2025, df_2_aime2025], ignore_index=True) -# df_aime2025.rename(columns={'answer': 'ground_truth'}, inplace=True) - -df.rename( - columns={ - 'Answer': 'ground_truth', - 'Question': 'question'}, - inplace=True) -df.drop(columns=['Year', 'ID', 'Problem Number', 'Part'], inplace=True) -df['dataset'] = 'aime1983' - -df.to_pickle(os.path.join(args.dataset_folder, 'aime1983-2024.pkl')) diff --git a/language/gpt-oss/data/fetch_gpqa.py b/language/gpt-oss/data/fetch_gpqa.py deleted file mode 100644 index 79ba3525f4..0000000000 --- a/language/gpt-oss/data/fetch_gpqa.py +++ /dev/null @@ -1,55 +0,0 @@ -from datasets import load_dataset -import pandas as pd -import random -import argparse -import os - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset_folder", type=str, default="dataset") -args = parser.parse_args() - -random.seed(42) - -# Login using e.g. `huggingface-cli login` to access this dataset -ds_diamond = load_dataset("Idavidrein/gpqa", "gpqa_diamond") -# ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") -# ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") -# ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") - -df_diamond = ds_diamond['train'].to_pandas() -# df_experts = ds_experts['train'].to_pandas() -# df_main = ds_main['train'].to_pandas() -# df_extended = ds_extended['train'].to_pandas() - - -# df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) -df = df_diamond -df = df[['Question', - 'Correct Answer', - 'High-level domain', - 'Incorrect Answer 1', - 'Incorrect Answer 2', - 'Incorrect Answer 3']].copy() - -for idx, row in df.iterrows(): - options = [str(row[col]) for col in ['Incorrect Answer 1', - 'Incorrect Answer 2', 'Incorrect Answer 3']] - options.append(str(row['Correct Answer'])) - random.shuffle(options) - answer_idx = options.index(str(row['Correct Answer'])) - - options = [option.strip() for option in options] - answer = chr(65 + answer_idx) - - question = f"{row['Question']}\nA) {options[0]}\nB) {options[1]}\nC) {options[2]}\nD) {options[3]}" - df.loc[idx, 'Question'] = question - df.loc[idx, 'ground_truth'] = answer - -df.rename( - columns={ - 'High-level domain': 'domain', - 'Question': 'question'}, - inplace=True) -df['dataset'] = 'gpqa' - -df.to_pickle(os.path.join(args.dataset_folder, 'gpqa_diamond.pkl')) diff --git a/language/gpt-oss/data/fetch_lcb.py b/language/gpt-oss/data/fetch_lcb.py deleted file mode 100644 index 5b6bf513e3..0000000000 --- a/language/gpt-oss/data/fetch_lcb.py +++ /dev/null @@ -1,63 +0,0 @@ -from datasets import load_dataset -import pickle -import os -import glob -import json -import argparse -import pandas as pd - -parser = argparse.ArgumentParser() -parser.add_argument("--dataset_folder", type=str, default="dataset") -parser.add_argument("--lcb_folder", type=str, default="lcb") -args = parser.parse_args() - -files = [ - "test.jsonl", - "test2.jsonl", - "test3.jsonl", - "test4.jsonl", - "test5.jsonl"] -files = [os.path.join(args.lcb_folder, file) for file in files] - -all_rows = [] -all_columns = set() - -for file in files: - with open(file, "r", encoding="utf-8") as f: - for line in f: - row = json.loads(line) - all_columns.update(row.keys()) - all_rows.append(row) - -all_columns = list(all_columns) - -df = pd.DataFrame(all_rows, columns=all_columns) -df['dataset'] = 'livecodebench' -df.drop( - columns=[ - 
'private_test_cases', - 'metadata', - 'public_test_cases', - 'contest_id', - 'platform', - 'difficulty', - 'contest_date', - 'question_title'], - inplace=True) - -starter_prompt = """ -### Format: You will use the following starter code to write the solution to the problem and enclose your code within delimiters. -```python -<> -``` -""" - -for idx, row in df.iterrows(): - starter_code = row['starter_code'] - starter_prompt_filled = starter_prompt.replace( - '<>', starter_code) - df.loc[idx, 'question'] = df.loc[idx, - 'question_content'] + starter_prompt_filled - -df.rename(columns={'question_id': 'ground_truth'}, inplace=True) -df.to_pickle(os.path.join(args.dataset_folder, 'lcb.pkl')) diff --git a/language/gpt-oss/setup_enroot.sh b/language/gpt-oss/setup_enroot.sh index 4e99f0bd84..c534ded13e 100755 --- a/language/gpt-oss/setup_enroot.sh +++ b/language/gpt-oss/setup_enroot.sh @@ -1,8 +1,8 @@ #!/bin/bash sqsh_location=$(readlink -f $(dirname $0))/sqsh_files -sandbox_name=sglang -docker_image=lmsysorg/sglang:dev-cu13 +sandbox_name=sglang_v0.5.4.post2 +docker_image=lmsysorg/sglang:v0.5.4.post2 while [[ $# -gt 0 ]]; do case $1 in @@ -22,7 +22,7 @@ while [[ $# -gt 0 ]]; do esac done -# mkdir -p $sqsh_location +mkdir -p $sqsh_location enroot import -o $sqsh_location/$sandbox_name.sqsh docker://$docker_image enroot create --name $sandbox_name $sqsh_location/$sandbox_name.sqsh # enroot start --mount $(pwd):$(pwd) --root --rw $sandbox_name From ef142a626cb7cd526ceda87ad2f9c02d23d2e5f2 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 13:12:41 -0800 Subject: [PATCH 075/165] add preprocess --- language/gpt-oss/preprocess.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 language/gpt-oss/preprocess.sh diff --git a/language/gpt-oss/preprocess.sh b/language/gpt-oss/preprocess.sh new file mode 100755 index 0000000000..8796c80af3 --- /dev/null +++ b/language/gpt-oss/preprocess.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +git clone https://huggingface.co/datasets/livecodebench/code_generation_lite data/lcb +python3 data/fetch_all.py --output_path data/accuracy_eval_raw.pkl --lcb_folder data/lcb +python3 harmonize_inputs.py --data-file data/accuracy_eval_raw.pkl --output-file data/accuracy_eval_tokenized.pkl --reasoning-effort high --num-processes 32 From 834175fbe25d67d6f03a983919e1ea456d54f69c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 21:22:39 +0000 Subject: [PATCH 076/165] update README --- language/gpt-oss/README.md | 107 +++++++-------------------------- language/gpt-oss/preprocess.sh | 2 + language/gpt-oss/run_server.sh | 2 + 3 files changed, 25 insertions(+), 86 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 3f1fd85e19..4dff1ea501 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -6,108 +6,43 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) * Dataset: For now, we are re-using the `deepseek-r1` dataset. -## Preprocessing considerations -* For all other LLMs in MLPerf, tokenization has not been part of the benchmark and has been assumed a static preprocessing step -* With the introduction of OpenAI's [harmony format](https://github.com/openai/harmony/tree/main) - the format must be agreed upon for consistent benchmarking. 
-* Knobs:
-  - reasoning_effort: HIGH
+## Environment setup
+Work on the reference implementation is done using the sglang container `lmsysorg/sglang:v0.5.4.post2`. For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated.
 
-An input to the `gpt-oss` model is first formatted into a `conversation` - an ordered list of messages.
-Each message has:
-* `role`: The sender of the message
-* `content`
-* `channel`: (choices `final/analysis/commentary`, we use `final` only in prompts)
-
-
-### Preamble:
-Each converstation starts with a message from `System` and `Developer` respectively.
-```json
-    "messages": [
-        {
-            "role": "system",
-            "content": "model_identity=' ...
-                reasoning_effort=
-                channel_config=ChannelConfig(
-                    valid_channels=['analysis', 'commentary', 'final',
-                    channel_required=True
-                )
-                tools=None
-                ...."
-        },
-        {
-            "role": "developer",
-            "content": "system_prompt"
-        },
-```
-
-### Multi-shot examples
-Some queries may have multi-shot examples. For these, the `User` and `Assistant` roles are assigned.
-```json
-        {
-            "role": "user",
-            "content": "example_question"
-        },
-        {
-            "role": "assistant",
-            "content": "example_answer",
-            "channel": "final"
-        },
+## Fetch accuracy eval dataset
+To create the accuracy_eval pkl file:
+```bash
+$ ./preprocess.sh
 ```
 
-### Lastly, user query
-```json
-        {
-            "role": "user",
-            "content": "actual question"
-        }
-```
+This does the following:
+- clones `https://huggingface.co/datasets/livecodebench/code_generation_lite` under `data/lcb`
+- creates a `data/accuracy_eval_raw.pkl` with `aime1983-2024, gpqa_diamond, lcb-v1_v5` samples.
+- converts the prompts into harmony format and tokenizes them under `data/accuracy_eval_tokenized.pkl` using `HIGH` reasoning effort.
+  - This step uses multiprocessing with a default of 32 parallel workers (hardcoded). Please reduce this if you see `pyo3_runtime.PanicException` errors.
 
 ## Running the reference implementation: SGLang
-[`SGLang`](https://github.com/sgl-project/sglang) is the framework of choice to run the reference implementation.
-
-### Fetch the docker image
-SGLang docker image will be used: `lmsysorg/sglang:v0.5.3rc1`. Steps below are to be run in an environment from this image
-
-### Preprocess the dataset
-```bash
-python3 harmonize_inputs.py \
-    --data-file mlperf_dsr1_fp8_ref_eval.pkl \
-    --num-processes 32 \
-    --output-file out/mlperf_gptoss_inputs.pkl \
-    --reasoning-effort low|medium|high
-```
 
 ### Run the server
 ```bash
-python3 -m sglang.launch_server \
-    --model-path openai/gpt-oss-120b \
-    --host 0.0.0.0 \
-    --port 30000 \
-    --tp-size=1 \
-    --data-parallel-size=$dp \
-    --max-running-requests 256 \
-    --mem-fraction-static 0.85 \
-    --chunked-prefill-size 16384 \
-    --ep-size=1 \
-    --quantization mxfp4 \
-    --stream-interval 50
+./run_server.sh \
+    --model_path path_to_gpt_oss_120b_model \ # optional, defaults to fetching from HF
+    --dp N # optional, defaults to 1. Set this to number of accelerators
 ```
+The script uses `python3 -m sglang.launch_server` to instantiate the model, with `tp=pp=ep=1`, and `dp` as specified.
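+
+For reference, a minimal sketch of the underlying launch command (illustrative only, reusing the defaults from the manual command previously documented here; `run_server.sh` remains authoritative for the exact flags it passes):
+```bash
+python3 -m sglang.launch_server \
+    --model-path openai/gpt-oss-120b \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --tp-size=1 \
+    --data-parallel-size=$dp
+```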
### Run the inference ```bash python3 run_infer.py \ - --input-tokens out/mlperf_gptoss_inputs.pkl \ - --max-tokens 20480 \ + --input-tokens data/accuracy_eval_tokenized.pkl \ + --max-tokens 32768 \ --max-concurrency 4096 \ - --output out/mlperf_gptoss_inferred.pkl + --timeout 2400 \ + --output data/accuracy_eval_inferred.pkl \ + --pass-k 5 ``` ### Evaluate the responses -We use the `deepseek-r1` evaluation environment to evaluate the model responses and calculate accuracy - ```bash -cd $repo_root/language/deepseek-r1 -./launch_docker.sh --backend sglang -setup.sh -(.venv_sglang) $ python3 eval_accuracy.py --input-file mlperf_gptoss_inferred.pkl +python3 eval_accuracy.py --input-file data/accuracy_eval_inferred.pkl ``` \ No newline at end of file diff --git a/language/gpt-oss/preprocess.sh b/language/gpt-oss/preprocess.sh index 8796c80af3..e3842c184b 100755 --- a/language/gpt-oss/preprocess.sh +++ b/language/gpt-oss/preprocess.sh @@ -1,5 +1,7 @@ #!/bin/bash +pip install -r requirements.txt + git clone https://huggingface.co/datasets/livecodebench/code_generation_lite data/lcb python3 data/fetch_all.py --output_path data/accuracy_eval_raw.pkl --lcb_folder data/lcb python3 harmonize_inputs.py --data-file data/accuracy_eval_raw.pkl --output-file data/accuracy_eval_tokenized.pkl --reasoning-effort high --num-processes 32 diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/run_server.sh index 2d164182d8..fe02d68682 100755 --- a/language/gpt-oss/run_server.sh +++ b/language/gpt-oss/run_server.sh @@ -1,5 +1,7 @@ #!/bin/bash +pip install -r requirements.txt + dp=1 model_path=openai/gpt-oss-120b eagle_path="" From e4470813d9bd89f2a0e82b6a814158cdb59e3bc6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 10 Nov 2025 21:25:19 +0000 Subject: [PATCH 077/165] [Automated Commit] Format Codebase --- language/gpt-oss/data/fetch_all.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/language/gpt-oss/data/fetch_all.py b/language/gpt-oss/data/fetch_all.py index 1e96253ba1..81413c229a 100644 --- a/language/gpt-oss/data/fetch_all.py +++ b/language/gpt-oss/data/fetch_all.py @@ -5,9 +5,18 @@ import os import json -parser = argparse.ArgumentParser(description="Fetch and combine AIME, GPQA, and LiveCodeBench datasets") -parser.add_argument("--output_path", type=str, default="./combined_dataset.pkl", help="Full path to output pickle file") -parser.add_argument("--lcb_folder", type=str, default="lcb", help="Folder containing LiveCodeBench repo cloned from HuggingFace") +parser = argparse.ArgumentParser( + description="Fetch and combine AIME, GPQA, and LiveCodeBench datasets") +parser.add_argument( + "--output_path", + type=str, + default="./combined_dataset.pkl", + help="Full path to output pickle file") +parser.add_argument( + "--lcb_folder", + type=str, + default="lcb", + help="Folder containing LiveCodeBench repo cloned from HuggingFace") args = parser.parse_args() # Ensure output folder exists @@ -63,16 +72,16 @@ # df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) df_gpqa = df_gpqa[['Question', - 'Correct Answer', - 'High-level domain', - 'Incorrect Answer 1', - 'Incorrect Answer 2', - 'Incorrect Answer 3']].copy() + 'Correct Answer', + 'High-level domain', + 'Incorrect Answer 1', + 'Incorrect Answer 2', + 'Incorrect Answer 3']].copy() # Format questions with multiple choice options for idx, row in df_gpqa.iterrows(): options = [str(row[col]) for col in ['Incorrect Answer 1', - 'Incorrect Answer 2', 'Incorrect Answer 
3']] + 'Incorrect Answer 2', 'Incorrect Answer 3']] options.append(str(row['Correct Answer'])) random.shuffle(options) answer_idx = options.index(str(row['Correct Answer'])) @@ -147,7 +156,7 @@ starter_prompt_filled = starter_prompt.replace( '<>', starter_code) df_lcb.loc[idx, 'question'] = df_lcb.loc[idx, - 'question_content'] + starter_prompt_filled + 'question_content'] + starter_prompt_filled df_lcb.rename(columns={'question_id': 'ground_truth'}, inplace=True) @@ -175,4 +184,3 @@ print(f"\n✓ Combined dataset saved to: {args.output_path}") print("=" * 80) - From a1e668aea77852bb696be4a685d1e83b6d3eb955 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 14:53:29 -0800 Subject: [PATCH 078/165] add top-p option --- language/gpt-oss/run_infer.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/run_infer.py index f30205f99e..049ef4112f 100644 --- a/language/gpt-oss/run_infer.py +++ b/language/gpt-oss/run_infer.py @@ -52,12 +52,14 @@ def __init__(self, server_url: str = "http://localhost:30000", temperature: float = 0.001, top_k: int = 1, + top_p: float = 1.0, timeout: int = 1200 ): self.base_url = server_url self.session = requests.Session() self.temperature = temperature self.top_k = top_k + self.top_p = top_p self.timeout = timeout def send_request( @@ -70,6 +72,7 @@ def send_request( "max_new_tokens": max_tokens, "temperature": self.temperature, "top_k": self.top_k, + "top_p": self.top_p, } } @@ -128,13 +131,14 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: def send_single_request(args_tuple): """Send a single request - used by multiprocessing pool.""" - input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, timeout = args_tuple + input_ids, max_tokens, server_url, sample_id, pass_num, temperature, top_k, top_p, timeout = args_tuple # Create a new client for this process client = SGLangClient( server_url=server_url, temperature=temperature, top_k=top_k, + top_p=top_p, timeout=timeout) try: @@ -151,7 +155,7 @@ def send_single_request(args_tuple): def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, - max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, timeout: int = 1200, + max_tokens: int = 100, max_concurrency: int = 128, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200, pass_k: int = 1): """Send all requests to SGLang server in parallel using multiprocessing. @@ -172,7 +176,7 @@ def send_requests_parallel(tokenized_df: pd.DataFrame, server_url: str, for pass_num in range(pass_k): args_list.append(( row['tok_input'], max_tokens, server_url, - idx, pass_num, temperature, top_k, timeout + idx, pass_num, temperature, top_k, top_p, timeout )) start_time = time.time() @@ -363,7 +367,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, + max_concurrency: int = 128, output_file: str = None, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: """Main processing function that handles requests and response extraction. 
@@ -384,6 +388,7 @@ def process_requests(tokenized_df: pd.DataFrame, server_url: str, max_concurrency, temperature, top_k, + top_p, timeout, pass_k) @@ -429,6 +434,8 @@ def main(): help="Temperature for sampling (default: 0.001)") parser.add_argument("--top-k", type=int, default=1, help="Top-k for sampling (default: 1)") + parser.add_argument("--top-p", type=float, default=1.0, + help="Top-p for sampling (default: 1.0)") parser.add_argument("--timeout", type=int, default=1200, help="Timeout for requests (default: 1200)") @@ -440,6 +447,7 @@ def main(): server_url=args.server_url, temperature=args.temperature, top_k=args.top_k, + top_p=args.top_p, timeout=args.timeout) test_response = test_client.send_request(input_ids=[1, 2, 3], max_tokens=5) @@ -462,6 +470,7 @@ def main(): output_file=args.output, temperature=args.temperature, top_k=args.top_k, + top_p=args.top_p, timeout=args.timeout, pass_k=args.pass_k) From 3db5bc2371630731921e7699ba0dd136016dd192 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 14:53:56 -0800 Subject: [PATCH 079/165] add summarize_eval --- language/gpt-oss/summarize_eval.py | 232 +++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 language/gpt-oss/summarize_eval.py diff --git a/language/gpt-oss/summarize_eval.py b/language/gpt-oss/summarize_eval.py new file mode 100644 index 0000000000..3418b19712 --- /dev/null +++ b/language/gpt-oss/summarize_eval.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Summarize evaluation results from eval_accuracy.py output. + +Reads an evaluated pickle file and prints a summary of results by dataset, +including per-pass statistics and aggregated pass@k results. +""" + +import argparse +import pickle +import sys +from pathlib import Path +from typing import Dict, Any, List +import pandas as pd + + +def detect_pass_k(df: pd.DataFrame) -> int: + """Detect if DataFrame has pass@k format and return k. + + Returns: + Number of passes (k) if pass@k format detected, otherwise 1 + """ + # Check for model_output_0, model_output_1, etc. + pass_k = 0 + while f'model_output_{pass_k}' in df.columns: + pass_k += 1 + + # If no _0 suffix found, check for single model_output column + if pass_k == 0 and 'model_output' in df.columns: + return 1 + + return pass_k + + +def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, pass_num: int = None, pass_k: int = 1) -> Dict[str, Any]: + """Calculate statistics for a specific dataset and pass. 
+ + Args: + df: DataFrame with evaluation results + dataset_name: Name of the dataset to filter + pass_num: Pass number (None for aggregated results) + pass_k: Total number of passes (for aggregated results) + + Returns: + Dictionary with statistics + """ + # Filter to this dataset + dataset_df = df[df['dataset'] == dataset_name] + + # Determine column suffixes + if pass_num is None: + # Aggregated results + accuracy_col = 'prompt_accuracy' if 'prompt_accuracy' in dataset_df.columns else 'prompt_accuracy_0' + + # For aggregated pass@k, count answered as any sample with at least one extracted answer + if pass_k > 1: + # Check if any pass has an extracted answer + answered_mask = pd.Series([False] * len(dataset_df), index=dataset_df.index) + for i in range(pass_k): + col = f'extracted_answer_{i}' + if col in dataset_df.columns: + answered_mask |= dataset_df[col].notna() + answered = answered_mask.sum() + else: + extracted_col = 'extracted_answer' if 'extracted_answer' in dataset_df.columns else 'extracted_answer_0' + answered = dataset_df[extracted_col].notna().sum() + else: + # Specific pass + suffix = f'_{pass_num}' + extracted_col = f'extracted_answer{suffix}' + accuracy_col = f'prompt_accuracy{suffix}' + answered = dataset_df[extracted_col].notna().sum() + + # Calculate statistics + total = len(dataset_df) + correct = (dataset_df[accuracy_col] > 0).sum() + + # Calculate percentage (correct / total) + if total > 0: + pct_correct = (correct / total) * 100 + else: + pct_correct = 0.0 + + # Calculate mean accuracy (handles HealthBench partial scores) + mean_accuracy = dataset_df[accuracy_col].mean() + + return { + 'dataset': dataset_name, + 'total': int(total), + 'answered': int(answered), + 'correct': int(correct), + 'pct_correct': float(pct_correct), + 'mean_accuracy': float(mean_accuracy), + } + + +def print_summary_table(stats_list: List[Dict[str, Any]], title: str = "Summary"): + """Print a formatted summary table. + + Args: + stats_list: List of statistics dictionaries + title: Title for the table + """ + print(f"\n{'=' * 85}") + print(f"{title}") + print('=' * 85) + print(f"{'Dataset':<20} {'Total':>8} {'Answered':>10} {'Correct':>10} {'Accuracy':>12}") + print('-' * 85) + + for stats in stats_list: + dataset_name = stats['dataset'] + total = stats['total'] + answered = stats['answered'] + correct = stats['correct'] + pct_correct = stats['pct_correct'] + + # Format the row + print(f"{dataset_name:<20} {total:>8} {answered:>10} {correct:>10} {pct_correct:>11.2f}%") + + # Print totals + if len(stats_list) > 1: + total_samples = sum(s['total'] for s in stats_list) + total_answered = sum(s['answered'] for s in stats_list) + total_correct = sum(s['correct'] for s in stats_list) + overall_pct = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 + + print('-' * 85) + print(f"{'OVERALL':<20} {total_samples:>8} {total_answered:>10} {total_correct:>10} {overall_pct:>11.2f}%") + + print('=' * 85) + + +def summarize_evaluation(pickle_path: str): + """Load and summarize evaluation results. 
+ + Args: + pickle_path: Path to evaluated pickle file + """ + # Load the pickle file + print(f"Loading evaluation results from: {pickle_path}") + with open(pickle_path, 'rb') as f: + df = pickle.load(f) + + print(f"Loaded {len(df)} samples") + + # Detect pass@k format + pass_k = detect_pass_k(df) + print(f"Detected format: pass@{pass_k}" if pass_k > 1 else "Detected format: single-pass") + + # Get list of datasets + datasets = sorted(df['dataset'].unique()) + print(f"Datasets found: {', '.join(datasets)}") + + # Calculate statistics for each dataset + if pass_k > 1: + # Show per-pass statistics + for pass_num in range(pass_k): + stats_list = [] + for dataset in datasets: + stats = calculate_dataset_stats(df, dataset, pass_num=pass_num, pass_k=pass_k) + stats_list.append(stats) + + print_summary_table(stats_list, title=f"Pass {pass_num} Results") + + # Show aggregated (pass@k) statistics + print("\n") + stats_list = [] + for dataset in datasets: + stats = calculate_dataset_stats(df, dataset, pass_num=None, pass_k=pass_k) + stats_list.append(stats) + + print_summary_table(stats_list, title=f"Aggregated Pass@{pass_k} Results (Max Across Passes)") + else: + # Single pass - just show the results + stats_list = [] + for dataset in datasets: + stats = calculate_dataset_stats(df, dataset, pass_num=None, pass_k=pass_k) + stats_list.append(stats) + + print_summary_table(stats_list, title="Evaluation Results") + + # Print column information for reference + print("\nColumn Information:") + print(f" - Total: Total number of samples in the dataset") + if pass_k > 1: + print(f" - Answered: Number of samples with at least one extracted answer across all passes") + else: + print(f" - Answered: Number of samples with extracted answers") + print(f" - Correct: Number of correct answers (accuracy > 0)") + print(f" - Accuracy: Percentage of total samples that were correct (correct / total)") + + if pass_k > 1: + print(f"\nPass@{pass_k} Note:") + print(f" - Per-pass results show individual pass performance") + print(f" - Aggregated results show the maximum accuracy achieved across all {pass_k} passes") + print(f" - A sample is considered correct if ANY of the {pass_k} attempts were correct") + print(f" - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer") + + +def main(): + parser = argparse.ArgumentParser( + description="Summarize evaluation results by dataset") + parser.add_argument("input_file", + help="Path to evaluated pickle file from eval_accuracy.py") + parser.add_argument("--json", action="store_true", + help="Output results in JSON format (for programmatic use)") + + args = parser.parse_args() + + # Check if input file exists + if not Path(args.input_file).exists(): + print(f"Error: Input file not found: {args.input_file}", file=sys.stderr) + sys.exit(1) + + # Check if file has _evaluated suffix (warn if not) + if "_evaluated" not in args.input_file: + print(f"Warning: Input file does not contain '_evaluated' suffix. 
" + f"Make sure this is an evaluated pickle file from eval_accuracy.py", + file=sys.stderr) + + try: + summarize_evaluation(args.input_file) + except Exception as e: + print(f"Error processing file: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() + From 3ca2cc19094bd5d0d75c07609ea5170ca3e979f1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 10 Nov 2025 22:54:21 +0000 Subject: [PATCH 080/165] [Automated Commit] Format Codebase --- language/gpt-oss/summarize_eval.py | 105 +++++++++++++++++------------ 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/language/gpt-oss/summarize_eval.py b/language/gpt-oss/summarize_eval.py index 3418b19712..aa6e880f97 100644 --- a/language/gpt-oss/summarize_eval.py +++ b/language/gpt-oss/summarize_eval.py @@ -32,7 +32,8 @@ def detect_pass_k(df: pd.DataFrame) -> int: return pass_k -def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, pass_num: int = None, pass_k: int = 1) -> Dict[str, Any]: +def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, + pass_num: int = None, pass_k: int = 1) -> Dict[str, Any]: """Calculate statistics for a specific dataset and pass. Args: @@ -46,16 +47,19 @@ def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, pass_num: int = """ # Filter to this dataset dataset_df = df[df['dataset'] == dataset_name] - + # Determine column suffixes if pass_num is None: # Aggregated results accuracy_col = 'prompt_accuracy' if 'prompt_accuracy' in dataset_df.columns else 'prompt_accuracy_0' - - # For aggregated pass@k, count answered as any sample with at least one extracted answer + + # For aggregated pass@k, count answered as any sample with at least one + # extracted answer if pass_k > 1: # Check if any pass has an extracted answer - answered_mask = pd.Series([False] * len(dataset_df), index=dataset_df.index) + answered_mask = pd.Series( + [False] * len(dataset_df), + index=dataset_df.index) for i in range(pass_k): col = f'extracted_answer_{i}' if col in dataset_df.columns: @@ -70,20 +74,20 @@ def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, pass_num: int = extracted_col = f'extracted_answer{suffix}' accuracy_col = f'prompt_accuracy{suffix}' answered = dataset_df[extracted_col].notna().sum() - + # Calculate statistics total = len(dataset_df) correct = (dataset_df[accuracy_col] > 0).sum() - + # Calculate percentage (correct / total) if total > 0: pct_correct = (correct / total) * 100 else: pct_correct = 0.0 - + # Calculate mean accuracy (handles HealthBench partial scores) mean_accuracy = dataset_df[accuracy_col].mean() - + return { 'dataset': dataset_name, 'total': int(total), @@ -94,7 +98,8 @@ def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, pass_num: int = } -def print_summary_table(stats_list: List[Dict[str, Any]], title: str = "Summary"): +def print_summary_table( + stats_list: List[Dict[str, Any]], title: str = "Summary"): """Print a formatted summary table. 
Args: @@ -106,27 +111,31 @@ def print_summary_table(stats_list: List[Dict[str, Any]], title: str = "Summary" print('=' * 85) print(f"{'Dataset':<20} {'Total':>8} {'Answered':>10} {'Correct':>10} {'Accuracy':>12}") print('-' * 85) - + for stats in stats_list: dataset_name = stats['dataset'] total = stats['total'] answered = stats['answered'] correct = stats['correct'] pct_correct = stats['pct_correct'] - + # Format the row - print(f"{dataset_name:<20} {total:>8} {answered:>10} {correct:>10} {pct_correct:>11.2f}%") - + print( + f"{dataset_name:<20} {total:>8} {answered:>10} {correct:>10} {pct_correct:>11.2f}%") + # Print totals if len(stats_list) > 1: total_samples = sum(s['total'] for s in stats_list) total_answered = sum(s['answered'] for s in stats_list) total_correct = sum(s['correct'] for s in stats_list) - overall_pct = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 - + overall_pct = ( + total_correct / + total_samples * + 100) if total_samples > 0 else 0.0 + print('-' * 85) print(f"{'OVERALL':<20} {total_samples:>8} {total_answered:>10} {total_correct:>10} {overall_pct:>11.2f}%") - + print('=' * 85) @@ -140,45 +149,51 @@ def summarize_evaluation(pickle_path: str): print(f"Loading evaluation results from: {pickle_path}") with open(pickle_path, 'rb') as f: df = pickle.load(f) - + print(f"Loaded {len(df)} samples") - + # Detect pass@k format pass_k = detect_pass_k(df) - print(f"Detected format: pass@{pass_k}" if pass_k > 1 else "Detected format: single-pass") - + print(f"Detected format: pass@{pass_k}" if pass_k > + 1 else "Detected format: single-pass") + # Get list of datasets datasets = sorted(df['dataset'].unique()) print(f"Datasets found: {', '.join(datasets)}") - + # Calculate statistics for each dataset if pass_k > 1: # Show per-pass statistics for pass_num in range(pass_k): stats_list = [] for dataset in datasets: - stats = calculate_dataset_stats(df, dataset, pass_num=pass_num, pass_k=pass_k) + stats = calculate_dataset_stats( + df, dataset, pass_num=pass_num, pass_k=pass_k) stats_list.append(stats) - + print_summary_table(stats_list, title=f"Pass {pass_num} Results") - + # Show aggregated (pass@k) statistics print("\n") stats_list = [] for dataset in datasets: - stats = calculate_dataset_stats(df, dataset, pass_num=None, pass_k=pass_k) + stats = calculate_dataset_stats( + df, dataset, pass_num=None, pass_k=pass_k) stats_list.append(stats) - - print_summary_table(stats_list, title=f"Aggregated Pass@{pass_k} Results (Max Across Passes)") + + print_summary_table( + stats_list, + title=f"Aggregated Pass@{pass_k} Results (Max Across Passes)") else: # Single pass - just show the results stats_list = [] for dataset in datasets: - stats = calculate_dataset_stats(df, dataset, pass_num=None, pass_k=pass_k) + stats = calculate_dataset_stats( + df, dataset, pass_num=None, pass_k=pass_k) stats_list.append(stats) - + print_summary_table(stats_list, title="Evaluation Results") - + # Print column information for reference print("\nColumn Information:") print(f" - Total: Total number of samples in the dataset") @@ -188,36 +203,41 @@ def summarize_evaluation(pickle_path: str): print(f" - Answered: Number of samples with extracted answers") print(f" - Correct: Number of correct answers (accuracy > 0)") print(f" - Accuracy: Percentage of total samples that were correct (correct / total)") - + if pass_k > 1: print(f"\nPass@{pass_k} Note:") print(f" - Per-pass results show individual pass performance") - print(f" - Aggregated results show the maximum accuracy achieved across all 
{pass_k} passes") - print(f" - A sample is considered correct if ANY of the {pass_k} attempts were correct") - print(f" - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer") + print( + f" - Aggregated results show the maximum accuracy achieved across all {pass_k} passes") + print( + f" - A sample is considered correct if ANY of the {pass_k} attempts were correct") + print( + f" - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer") def main(): parser = argparse.ArgumentParser( description="Summarize evaluation results by dataset") - parser.add_argument("input_file", + parser.add_argument("input_file", help="Path to evaluated pickle file from eval_accuracy.py") parser.add_argument("--json", action="store_true", help="Output results in JSON format (for programmatic use)") - + args = parser.parse_args() - + # Check if input file exists if not Path(args.input_file).exists(): - print(f"Error: Input file not found: {args.input_file}", file=sys.stderr) + print( + f"Error: Input file not found: {args.input_file}", + file=sys.stderr) sys.exit(1) - + # Check if file has _evaluated suffix (warn if not) if "_evaluated" not in args.input_file: print(f"Warning: Input file does not contain '_evaluated' suffix. " - f"Make sure this is an evaluated pickle file from eval_accuracy.py", + f"Make sure this is an evaluated pickle file from eval_accuracy.py", file=sys.stderr) - + try: summarize_evaluation(args.input_file) except Exception as e: @@ -229,4 +249,3 @@ def main(): if __name__ == "__main__": main() - From 09cd3f7c7c5abdef8a3d504b4db63c13ccd88287 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 7 Nov 2025 17:43:49 +0000 Subject: [PATCH 081/165] add trtllm infer script --- language/gpt-oss/run_infer_trtllm.py | 581 +++++++++++++++++++++++++++ 1 file changed, 581 insertions(+) create mode 100644 language/gpt-oss/run_infer_trtllm.py diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py new file mode 100644 index 0000000000..d920c44608 --- /dev/null +++ b/language/gpt-oss/run_infer_trtllm.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python3 +""" +Script to send pre-tokenized requests to TensorRT-LLM server via OpenAI endpoint. 
+ +Usage: + python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] + +Arguments: + --input-tokens Path to pickle file containing pre-tokenized data from harmony-tokens.py + --server-url TensorRT-LLM server URL (default: localhost:8000) + --max-samples Maximum number of samples to process (default: all) + --max-tokens Maximum tokens to generate per request (default: 100) + --max-concurrency Maximum number of concurrent requests (default: 256) + --output Output pickle file for responses (optional) + --pass-k Number of inference passes per sample for pass@k strategy (default: 1) +""" + +import asyncio +import argparse +import time +import logging +from typing import List, Dict, Any, Tuple +import pandas as pd +from tqdm import tqdm +from transformers import AutoTokenizer +import httpx +from openai import AsyncOpenAI + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Initialize tokenizer +MODEL_NAME = "openai/gpt-oss-120b" +tokenizer = None + + +def get_tokenizer(): + """Get or initialize the tokenizer.""" + global tokenizer + if tokenizer is None: + logger.info(f"Loading tokenizer for {MODEL_NAME}...") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + logger.info("Tokenizer loaded successfully") + return tokenizer + + +class TRTLLMClient: + """Client for TensorRT-LLM server using OpenAI-compatible endpoint.""" + + def __init__(self, + server_url: str = "localhost:8000", + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + max_concurrency: int = 256, + timeout: int = 1200): + self.server_url = server_url + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + self.max_concurrency = max_concurrency + self.timeout = timeout + self.model_name = MODEL_NAME + + # Initialize async OpenAI client + self.http_client = None + self.async_client = None + self.concurrency_semaphore = None + + async def initialize(self): + """Initialize OpenAI client and HTTP client.""" + # Create semaphore for concurrency control + self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) + + # Setup HTTP client with proper connection limits for high concurrency + self.http_client = httpx.AsyncClient( + timeout=httpx.Timeout(self.timeout), + limits=httpx.Limits( + max_keepalive_connections=self.max_concurrency * 2, + max_connections=self.max_concurrency * 2, + ), + http2=True + ) + + # Setup OpenAI client with the configured HTTP client + self.async_client = AsyncOpenAI( + api_key='dummy', # TensorRT-LLM server doesn't require real API key + base_url=f"http://{self.server_url}/v1/", + timeout=self.timeout, + max_retries=10, + http_client=self.http_client, + ) + + async def send_request( + self, input_ids: List[int], max_tokens: int = 100, + sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: + """Send a single request to the TensorRT-LLM server. 
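# Editor's sketch (illustrative, not part of the patch): the concurrency
# pattern used by initialize()/send_request() above in miniature. A shared
# asyncio.Semaphore caps in-flight requests at max_concurrency even though all
# tasks are created up front; fetch() is a stand-in for the real server call.
import asyncio

async def fetch(sem: asyncio.Semaphore, i: int) -> int:
    async with sem:                # at most max_concurrency coroutines enter here
        await asyncio.sleep(0.01)  # placeholder for the awaited completion request
        return i

async def run_all(n: int, max_concurrency: int = 256) -> list:
    sem = asyncio.Semaphore(max_concurrency)
    return await asyncio.gather(*(fetch(sem, i) for i in range(n)))

# asyncio.run(run_all(1000)) issues 1000 requests, at most 256 at a time.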
+ + Args: + input_ids: List of input token IDs + max_tokens: Maximum tokens to generate + sample_id: Sample identifier + pass_num: Pass number for pass@k strategy + + Returns: + Tuple of (sample_id, pass_num, response, latency) + """ + # Prepare generation parameters using OpenAI chat completions format + gen_params = { + "model": self.model_name, + "max_tokens": max_tokens, + "temperature": self.temperature, + "top_p": self.top_p, + "stream": False, + "messages": [], # Empty messages for token-based input + "extra_body": { + # TensorRT-LLM specific parameters passed in extra_body + "prompt_token_ids": input_ids, + "min_tokens": 1, + "top_k": self.top_k, + }, + } + + try: + # Track latency: time from request sent to response received + start_time = time.time() + + # Use semaphore for concurrency control + async with self.concurrency_semaphore: + completion = await self.async_client.chat.completions.create(**gen_params) + + end_time = time.time() + latency = end_time - start_time + + # Extract response text + response_text = completion.choices[0].message.content + + # Tokenize the response to get output_ids (similar to SGLang format) + tokenizer = get_tokenizer() + output_ids = tokenizer.encode(response_text, add_special_tokens=False) + + # Format response similar to SGLang format for compatibility + response = { + "output_ids": output_ids, + "text": response_text, + "meta_info": { + "completion_tokens": len(output_ids), + } + } + + return sample_id, pass_num, response, latency + + except Exception as e: + logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") + return sample_id, pass_num, {"error": str(e)}, None + + async def shutdown(self): + """Clean up resources.""" + if self.http_client: + await self.http_client.aclose() + + +def load_tokenized_data(data_file: str) -> pd.DataFrame: + """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" + logger.info(f"Loading tokenized data from {data_file}") + + # Load DataFrame from pickle + df = pd.read_pickle(data_file) + logger.info(f"Loaded DataFrame with shape: {df.shape}") + + # Check if tok_input column exists and has valid data + if 'tok_input' in df.columns: + # Check for any None values in tok_input (indicating failed tokenization) + failed_mask = df['tok_input'].isna() + failed_count = failed_mask.sum() + + if failed_count > 0: + failed_indices = df[failed_mask].index.unique() + error_msg = f"Found {failed_count} failed tokenized samples at indices: {failed_indices.tolist()}" + logger.error(error_msg) + raise AssertionError(error_msg) + + # Check first sample + first_tokens = df.iloc[0]['tok_input'] + if isinstance(first_tokens, list): + logger.info(f"First sample token length: {len(first_tokens)}") + else: + logger.warning("tok_input column exists but first sample is not a list") + + logger.info(f"All {len(df)} samples were successfully tokenized") + else: + logger.warning("No 'tok_input' column found in DataFrame") + + return df + + +async def send_requests_async( + tokenized_df: pd.DataFrame, server_url: str, + max_tokens: int = 100, max_concurrency: int = 256, + temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, + timeout: int = 1200, pass_k: int = 1): + """Send all requests to TensorRT-LLM server asynchronously. 
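# Editor's sketch (assumption-labelled, not part of the patch): the input
# validation performed by load_tokenized_data() above, reduced to a reusable
# check. The column name and pickle layout are taken from the script; the
# helper name is the editor's own.
import pandas as pd

def require_column(df: pd.DataFrame, col: str = "tok_input") -> None:
    """Raise if the column is missing or contains NaN entries."""
    if col not in df.columns:
        raise ValueError(f"DataFrame must contain '{col}' column")
    missing = df[col].isna()
    if missing.any():
        raise AssertionError(
            f"Found {int(missing.sum())} rows with missing '{col}' "
            f"at indices: {df[missing].index.tolist()[:10]}")

# require_column(pd.read_pickle("tokenized_data.pkl"))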
+ + Args: + pass_k: Number of inference passes per sample for pass@k strategy + + Returns: + tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency + """ + num_samples = len(tokenized_df) + total_requests = num_samples * pass_k + logger.info( + f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers...") + + # Initialize client + client = TRTLLMClient( + server_url=server_url, + temperature=temperature, + top_k=top_k, + top_p=top_p, + max_concurrency=max_concurrency, + timeout=timeout + ) + await client.initialize() + + # Prepare all tasks - create pass_k requests per sample + tasks = [] + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + task = client.send_request( + row['tok_input'], + max_tokens=max_tokens, + sample_id=idx, + pass_num=pass_num + ) + tasks.append(task) + + start_time = time.time() + + # Execute all tasks concurrently with progress bar + results = [] + for coro in tqdm( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Sending requests", + unit="request"): + result = await coro + results.append(result) + + # Group results by sample_id and pass_num + responses_by_pass = {} + latencies_by_pass = {} + for sample_id, pass_num, response, latency in results: + responses_by_pass[(sample_id, pass_num)] = response + latencies_by_pass[(sample_id, pass_num)] = latency + + total_time = time.time() - start_time + logger.info( + f"Completed {total_requests} requests in {total_time:.2f} seconds") + logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") + + # Log latency statistics + valid_latencies = [lat for lat in latencies_by_pass.values() if lat is not None] + if valid_latencies: + avg_latency = sum(valid_latencies) / len(valid_latencies) + min_latency = min(valid_latencies) + max_latency = max(valid_latencies) + logger.info( + f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") + + # Shutdown client + await client.shutdown() + + return responses_by_pass, latencies_by_pass + + +def extract_response_ids( + responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: + """Extract response output_ids from TensorRT-LLM responses for all passes. 
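# Editor's sketch (illustrative only): collecting results keyed by
# (sample_id, pass_num) as requests finish, in the same spirit as
# send_requests_async() above. Here each task resolves to a 3-tuple for
# brevity; the real code carries latency as a fourth element.
import asyncio
from tqdm import tqdm

async def gather_keyed(tasks: list) -> dict:
    """tasks: coroutines resolving to (sample_id, pass_num, result)."""
    out = {}
    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), unit="req"):
        sample_id, pass_num, result = await coro
        out[(sample_id, pass_num)] = result
    return out

# Usage idea:
# out = asyncio.run(gather_keyed([worker(i, p) for i in ids for p in range(k)]))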
+ + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + tokenized_df: DataFrame with samples + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to output_ids list + """ + logger.info("Extracting response output_ids...") + + response_ids_by_pass = {} + total_responses = len(tokenized_df) * pass_k + + with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + response = responses_by_pass.get((idx, pass_num), {}) + response_id = [] + if "error" not in response and "output_ids" in response: + try: + # TensorRT-LLM returns the generated token IDs in the 'output_ids' field + response_id = response["output_ids"] + except Exception as e: + logger.warning( + f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") + response_ids_by_pass[(idx, pass_num)] = response_id + pbar.update(1) + + logger.info("Response output_ids extraction complete") + return response_ids_by_pass + + +def detokenize_output_ids( + response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: + """Detokenize output_ids back to text using AutoTokenizer for all passes. + + Args: + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to detokenized text + """ + logger.info("Detokenizing output_ids to text...") + + tokenizer = get_tokenizer() + detokenized_texts_by_pass = {} + + for (sample_id, pass_num), token_ids in tqdm( + response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): + try: + # Detokenize the token IDs back to text + text = tokenizer.decode(token_ids, skip_special_tokens=True) + detokenized_texts_by_pass[(sample_id, pass_num)] = text + except Exception as e: + logger.warning( + f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") + detokenized_texts_by_pass[(sample_id, pass_num)] = "" + + logger.info("Output detokenization complete") + return detokenized_texts_by_pass + + +def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], + response_ids_by_pass: Dict[tuple, List[int]], + detokenized_texts_by_pass: Dict[tuple, str], + latencies_by_pass: Dict[tuple, float], + tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: + """Save all responses to DataFrame and optionally to pickle file. + + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text + latencies_by_pass: Dict mapping (sample_id, pass_num) to latency + tokenized_df: Original DataFrame with samples + pass_k: Number of passes per sample + output_file: Optional output pickle file + + Returns: + DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) + """ + logger.info("Processing responses and updating DataFrame...") + + # Work with the original DataFrame + result_df = tokenized_df.copy() + + # Create columns for each pass with _0, _1, _2, ... 
suffixes + for pass_num in range(pass_k): + # Lists to store data for this pass + model_outputs = [] + tok_model_outputs = [] + tok_model_output_lens = [] + infer_times = [] + + for idx in tokenized_df.index: + key = (idx, pass_num) + detokenized_text = detokenized_texts_by_pass.get(key, "") + response_ids = response_ids_by_pass.get(key, []) + latency = latencies_by_pass.get(key, None) + + model_outputs.append(detokenized_text) + tok_model_outputs.append(response_ids) + tok_model_output_lens.append(len(response_ids)) + infer_times.append(latency) + + # Add columns with suffixes + result_df[f'model_output_{pass_num}'] = model_outputs + result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs + result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens + result_df[f'infer_time_{pass_num}'] = infer_times + + # Calculate output token lengths for logging + all_output_token_lengths = [] + for idx in tokenized_df.index: + for pass_num in range(pass_k): + key = (idx, pass_num) + response = responses_by_pass.get(key, {}) + response_ids = response_ids_by_pass.get(key, []) + try: + output_token_length = response.get( + "meta_info", {}).get( + "completion_tokens", len(response_ids)) + all_output_token_lengths.append(output_token_length) + except Exception as e: + logger.warning( + f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") + all_output_token_lengths.append(len(response_ids)) + + logger.info(f"Updated DataFrame with shape: {result_df.shape}") + new_columns = [ + f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] + logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") + if all_output_token_lengths: + logger.info( + f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") + + # Save to pickle file if output_file is provided + if output_file: + logger.info(f"Saving responses to {output_file}...") + result_df.to_pickle(output_file) + logger.info(f"Responses saved to {output_file}") + + return result_df + + +async def process_requests_async(tokenized_df: pd.DataFrame, server_url: str, + max_samples: int = None, max_tokens: int = 100, + max_concurrency: int = 256, output_file: str = None, + temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, + timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: + """Main processing function that handles requests and response extraction. 
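# Editor's sketch (toy illustration, not part of the patch): the per-pass
# column layout that save_responses() builds above. Results are keyed by
# (sample_id, pass_num) and written out with the _0, _1, ... suffix
# convention used throughout this series.
import pandas as pd

def add_pass_columns(df: pd.DataFrame, outputs: dict, pass_k: int) -> pd.DataFrame:
    """outputs maps (sample_id, pass_num) -> list of output token ids."""
    out = df.copy()
    for p in range(pass_k):
        toks = [outputs.get((i, p), []) for i in df.index]
        out[f"tok_model_output_{p}"] = toks
        out[f"tok_model_output_len_{p}"] = [len(t) for t in toks]
    return out

# add_pass_columns(pd.DataFrame(index=[0, 1]),
#                  {(0, 0): [1, 2], (1, 0): [3], (0, 1): [4, 5, 6]}, pass_k=2)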
+ + Args: + pass_k: Number of inference passes per sample for pass@k strategy + """ + + # Step 1: Limit samples if specified + if max_samples is not None: + tokenized_df = tokenized_df.head(max_samples) + logger.info(f"Limited to first {max_samples} samples") + + # Step 2: Send all requests asynchronously (k passes per sample) + responses_by_pass, latencies_by_pass = await send_requests_async( + tokenized_df, + server_url, + max_tokens, + max_concurrency, + temperature, + top_k, + top_p, + timeout, + pass_k) + + # Step 3: Extract response output_ids for all passes + response_ids_by_pass = extract_response_ids( + responses_by_pass, tokenized_df, pass_k) + + # Step 4: Detokenize output_ids to text for model_output for all passes + detokenized_texts_by_pass = detokenize_output_ids( + response_ids_by_pass, pass_k) + + # Step 5: Save all results and return DataFrame + result_df = save_responses( + responses_by_pass, + response_ids_by_pass, + detokenized_texts_by_pass, + latencies_by_pass, + tokenized_df, + pass_k, + output_file) + + return result_df + + +def main(): + parser = argparse.ArgumentParser( + description="Send pre-tokenized requests to TensorRT-LLM server via OpenAI endpoint") + parser.add_argument("--input-tokens", required=True, + help="Path to pickle file containing pre-tokenized data from harmony-tokens.py") + parser.add_argument("--server-url", default="localhost:8000", + help="TensorRT-LLM server URL (default: localhost:8000)") + parser.add_argument("--max-samples", type=int, default=None, + help="Maximum number of samples to process (default: all)") + parser.add_argument("--max-tokens", type=int, default=100, + help="Maximum tokens to generate per request") + parser.add_argument("--max-concurrency", type=int, default=256, + help="Maximum number of concurrent requests (default: 256)") + parser.add_argument("--output", default=None, + help="Output pickle file for responses (optional)") + parser.add_argument("--pass-k", type=int, default=1, + help="Number of inference passes per sample for pass@k strategy (default: 1)") + parser.add_argument("--temperature", type=float, default=0.001, + help="Temperature for sampling (default: 0.001)") + parser.add_argument("--top-k", type=int, default=1, + help="Top-k for sampling (default: 1)") + parser.add_argument("--top-p", type=float, default=1.0, + help="Top-p for sampling (default: 1.0)") + parser.add_argument("--timeout", type=int, default=1200, + help="Timeout for requests (default: 1200)") + + args = parser.parse_args() + + # Test connection + async def test_connection(): + logger.info(f"Testing server connection to {args.server_url}...") + client = TRTLLMClient( + server_url=args.server_url, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + max_concurrency=1, + timeout=args.timeout + ) + await client.initialize() + + try: + _, _, test_response, _ = await client.send_request( + input_ids=[1, 2, 3], max_tokens=5, sample_id=0, pass_num=0) + if "error" in test_response: + logger.error(f"Server connection failed: {test_response['error']}") + logger.error("Make sure your TensorRT-LLM server is running with OpenAI endpoint enabled.") + return False + logger.info("Server connection successful") + return True + finally: + await client.shutdown() + + # Run connection test + if not asyncio.run(test_connection()): + return + + # Load pre-tokenized data + tokenized_df = load_tokenized_data(args.input_tokens) + + # Process requests and get result DataFrame + result_df = asyncio.run(process_requests_async( + tokenized_df, 
args.server_url, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, + output_file=args.output, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + timeout=args.timeout, + pass_k=args.pass_k)) + + # Print summary + logger.info(f"\nProcessing completed:") + logger.info(f" - Total samples processed: {len(result_df)}") + logger.info(f" - Number of passes per sample: {args.pass_k}") + logger.info( + f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") + + # Calculate average output length across all passes + if args.pass_k == 1: + avg_output_len = result_df['tok_model_output_len_0'].mean() + logger.info(f" - Average output token length: {avg_output_len:.1f}") + else: + all_output_lens = [] + for i in range(args.pass_k): + all_output_lens.extend( + result_df[f'tok_model_output_len_{i}'].tolist()) + avg_output_len = sum(all_output_lens) / \ + len(all_output_lens) if all_output_lens else 0 + logger.info( + f" - Average output token length (across all passes): {avg_output_len:.1f}") + + if args.output: + logger.info(f" - Results saved to: {args.output}") + else: + logger.info(" - Results returned as DataFrame (not saved to file)") + + +if __name__ == "__main__": + main() + From 1aad3966c10e1f4aea987efc4e75118f5658fbbf Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 7 Nov 2025 22:31:13 +0000 Subject: [PATCH 082/165] fixes --- language/gpt-oss/run_infer_trtllm.py | 58 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py index d920c44608..2a75d42448 100644 --- a/language/gpt-oss/run_infer_trtllm.py +++ b/language/gpt-oss/run_infer_trtllm.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 """ -Script to send pre-tokenized requests to TensorRT-LLM server via OpenAI endpoint. +Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint. Usage: python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] Arguments: - --input-tokens Path to pickle file containing pre-tokenized data from harmony-tokens.py + --input-tokens Path to pickle file containing data with text_input column from harmony-tokens.py --server-url TensorRT-LLM server URL (default: localhost:8000) --max-samples Maximum number of samples to process (default: all) --max-tokens Maximum tokens to generate per request (default: 100) @@ -95,12 +95,12 @@ async def initialize(self): ) async def send_request( - self, input_ids: List[int], max_tokens: int = 100, + self, prompt: str, max_tokens: int = 100, sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: """Send a single request to the TensorRT-LLM server. 
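# Editor's sketch (standalone illustration): the plain-text completions call
# this patch switches to, with retries, semaphores and response
# re-tokenization stripped away. The endpoint URL is a placeholder; whether a
# particular TensorRT-LLM deployment honours the extra_body keys (min_tokens,
# top_k) is an assumption carried over from the patch itself.
import asyncio
from openai import AsyncOpenAI

async def complete(prompt: str) -> str:
    client = AsyncOpenAI(api_key="dummy", base_url="http://localhost:8000/v1/")
    resp = await client.completions.create(
        model="openai/gpt-oss-120b",
        prompt=prompt,
        max_tokens=64,
        temperature=0.001,
        top_p=1.0,
        extra_body={"min_tokens": 1, "top_k": 1},
    )
    return resp.choices[0].text

# print(asyncio.run(complete("Say hello.")))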
Args: - input_ids: List of input token IDs + prompt: Text prompt to send max_tokens: Maximum tokens to generate sample_id: Sample identifier pass_num: Pass number for pass@k strategy @@ -108,17 +108,16 @@ async def send_request( Returns: Tuple of (sample_id, pass_num, response, latency) """ - # Prepare generation parameters using OpenAI chat completions format + # Prepare generation parameters using OpenAI completions format (as per TensorRT-LLM docs) gen_params = { "model": self.model_name, + "prompt": prompt, "max_tokens": max_tokens, "temperature": self.temperature, "top_p": self.top_p, "stream": False, - "messages": [], # Empty messages for token-based input "extra_body": { - # TensorRT-LLM specific parameters passed in extra_body - "prompt_token_ids": input_ids, + # TensorRT-LLM specific parameters "min_tokens": 1, "top_k": self.top_k, }, @@ -130,13 +129,13 @@ async def send_request( # Use semaphore for concurrency control async with self.concurrency_semaphore: - completion = await self.async_client.chat.completions.create(**gen_params) + completion = await self.async_client.completions.create(**gen_params) end_time = time.time() latency = end_time - start_time - # Extract response text - response_text = completion.choices[0].message.content + # Extract response text from completions format + response_text = completion.choices[0].text # Tokenize the response to get output_ids (similar to SGLang format) tokenizer = get_tokenizer() @@ -164,35 +163,36 @@ async def shutdown(self): def load_tokenized_data(data_file: str) -> pd.DataFrame: - """Load pre-tokenized data from pickle file produced by harmony-tokens.py.""" - logger.info(f"Loading tokenized data from {data_file}") + """Load data from pickle file produced by harmony-tokens.py.""" + logger.info(f"Loading data from {data_file}") # Load DataFrame from pickle df = pd.read_pickle(data_file) logger.info(f"Loaded DataFrame with shape: {df.shape}") - # Check if tok_input column exists and has valid data - if 'tok_input' in df.columns: - # Check for any None values in tok_input (indicating failed tokenization) - failed_mask = df['tok_input'].isna() + # Check if text_input column exists and has valid data + if 'text_input' in df.columns: + # Check for any None values in text_input + failed_mask = df['text_input'].isna() failed_count = failed_mask.sum() if failed_count > 0: failed_indices = df[failed_mask].index.unique() - error_msg = f"Found {failed_count} failed tokenized samples at indices: {failed_indices.tolist()}" + error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}" logger.error(error_msg) raise AssertionError(error_msg) # Check first sample - first_tokens = df.iloc[0]['tok_input'] - if isinstance(first_tokens, list): - logger.info(f"First sample token length: {len(first_tokens)}") + first_text = df.iloc[0]['text_input'] + if isinstance(first_text, str): + logger.info(f"First sample text length: {len(first_text)} characters") else: - logger.warning("tok_input column exists but first sample is not a list") + logger.warning("text_input column exists but first sample is not a string") - logger.info(f"All {len(df)} samples were successfully tokenized") + logger.info(f"All {len(df)} samples have valid text_input") else: - logger.warning("No 'tok_input' column found in DataFrame") + logger.error("No 'text_input' column found in DataFrame") + raise ValueError("DataFrame must contain 'text_input' column") return df @@ -231,7 +231,7 @@ async def send_requests_async( for idx, row in 
tokenized_df.iterrows(): for pass_num in range(pass_k): task = client.send_request( - row['tok_input'], + row['text_input'], max_tokens=max_tokens, sample_id=idx, pass_num=pass_num @@ -478,9 +478,9 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_url: str, def main(): parser = argparse.ArgumentParser( - description="Send pre-tokenized requests to TensorRT-LLM server via OpenAI endpoint") + description="Send text prompts to TensorRT-LLM server via OpenAI completions endpoint") parser.add_argument("--input-tokens", required=True, - help="Path to pickle file containing pre-tokenized data from harmony-tokens.py") + help="Path to pickle file containing data with text_input column from harmony-tokens.py") parser.add_argument("--server-url", default="localhost:8000", help="TensorRT-LLM server URL (default: localhost:8000)") parser.add_argument("--max-samples", type=int, default=None, @@ -519,7 +519,7 @@ async def test_connection(): try: _, _, test_response, _ = await client.send_request( - input_ids=[1, 2, 3], max_tokens=5, sample_id=0, pass_num=0) + prompt="Test", max_tokens=5, sample_id=0, pass_num=0) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") logger.error("Make sure your TensorRT-LLM server is running with OpenAI endpoint enabled.") @@ -579,3 +579,5 @@ async def test_connection(): if __name__ == "__main__": main() + + From a6e05f26641e5c0d7d706a0c5ab70d46f007175a Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 8 Nov 2025 01:38:40 +0000 Subject: [PATCH 083/165] add round-robin for multi-dp --- language/gpt-oss/run_infer_trtllm.py | 127 ++++++++++++++++++--------- 1 file changed, 85 insertions(+), 42 deletions(-) diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py index 2a75d42448..7531af5f79 100644 --- a/language/gpt-oss/run_infer_trtllm.py +++ b/language/gpt-oss/run_infer_trtllm.py @@ -1,18 +1,26 @@ #!/usr/bin/env python3 """ Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint. +Supports round-robin load balancing across multiple server endpoints. 
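# Editor's sketch (illustrative): the round-robin selection this patch
# introduces, reduced to its core. itertools.cycle behaves like the modular
# index used by _get_next_client(); the URLs below are placeholders for the
# comma-separated --server-url value.
from itertools import cycle

server_urls = [u.strip() for u in "localhost:8000,localhost:8001".split(",")]
_next_server = cycle(server_urls)

def pick_server() -> str:
    # Each call returns the next endpoint, wrapping around after the last one.
    return next(_next_server)

# pick_server() -> 'localhost:8000', then 'localhost:8001', then 'localhost:8000', ...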
Usage: python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] Arguments: --input-tokens Path to pickle file containing data with text_input column from harmony-tokens.py - --server-url TensorRT-LLM server URL (default: localhost:8000) + --server-url TensorRT-LLM server URL(s) - comma-separated for round-robin (e.g., "localhost:8000,localhost:8001") --max-samples Maximum number of samples to process (default: all) --max-tokens Maximum tokens to generate per request (default: 100) --max-concurrency Maximum number of concurrent requests (default: 256) --output Output pickle file for responses (optional) --pass-k Number of inference passes per sample for pass@k strategy (default: 1) + +Examples: + # Single server + python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000 + + # Multiple servers with round-robin + python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002 """ import asyncio @@ -32,6 +40,10 @@ format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Suppress verbose HTTP logs from httpx and openai +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("openai").setLevel(logging.WARNING) + # Initialize tokenizer MODEL_NAME = "openai/gpt-oss-120b" tokenizer = None @@ -48,16 +60,22 @@ def get_tokenizer(): class TRTLLMClient: - """Client for TensorRT-LLM server using OpenAI-compatible endpoint.""" + """Client for TensorRT-LLM server using OpenAI-compatible endpoint with round-robin support.""" def __init__(self, - server_url: str = "localhost:8000", + server_urls: List[str] = None, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, max_concurrency: int = 256, timeout: int = 1200): - self.server_url = server_url + # Support multiple server URLs for round-robin load balancing + if server_urls is None: + server_urls = ["localhost:8000"] + self.server_urls = server_urls + self.num_servers = len(server_urls) + self.current_server_index = 0 + self.temperature = temperature self.top_k = top_k self.top_p = top_p @@ -65,39 +83,54 @@ def __init__(self, self.timeout = timeout self.model_name = MODEL_NAME - # Initialize async OpenAI client - self.http_client = None - self.async_client = None + # Initialize async OpenAI clients (one per server) + self.http_clients = [] + self.async_clients = [] self.concurrency_semaphore = None + logger.info(f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") + async def initialize(self): - """Initialize OpenAI client and HTTP client.""" + """Initialize OpenAI clients for all servers.""" # Create semaphore for concurrency control self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) - # Setup HTTP client with proper connection limits for high concurrency - self.http_client = httpx.AsyncClient( - timeout=httpx.Timeout(self.timeout), - limits=httpx.Limits( - max_keepalive_connections=self.max_concurrency * 2, - max_connections=self.max_concurrency * 2, - ), - http2=True - ) + # Create HTTP and OpenAI clients for each server + for server_url in self.server_urls: + # Setup HTTP client with proper connection limits for high concurrency + http_client = httpx.AsyncClient( + timeout=httpx.Timeout(self.timeout), + limits=httpx.Limits( + max_keepalive_connections=self.max_concurrency * 2, + max_connections=self.max_concurrency * 2, + ), + http2=True + ) + + # Setup OpenAI client with the configured HTTP client + async_client = AsyncOpenAI( + api_key='dummy', # 
TensorRT-LLM server doesn't require real API key + base_url=f"http://{server_url}/v1/", + timeout=self.timeout, + max_retries=10, + http_client=http_client, + ) + + self.http_clients.append(http_client) + self.async_clients.append(async_client) - # Setup OpenAI client with the configured HTTP client - self.async_client = AsyncOpenAI( - api_key='dummy', # TensorRT-LLM server doesn't require real API key - base_url=f"http://{self.server_url}/v1/", - timeout=self.timeout, - max_retries=10, - http_client=self.http_client, - ) + logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)") + def _get_next_client(self) -> AsyncOpenAI: + """Get the next client using round-robin selection.""" + client = self.async_clients[self.current_server_index] + self.current_server_index = (self.current_server_index + 1) % self.num_servers + return client + async def send_request( self, prompt: str, max_tokens: int = 100, sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: - """Send a single request to the TensorRT-LLM server. + """Send a single request to the TensorRT-LLM server using round-robin. Args: prompt: Text prompt to send @@ -127,9 +160,12 @@ async def send_request( # Track latency: time from request sent to response received start_time = time.time() + # Select client using round-robin + client = self._get_next_client() + # Use semaphore for concurrency control async with self.concurrency_semaphore: - completion = await self.async_client.completions.create(**gen_params) + completion = await client.completions.create(**gen_params) end_time = time.time() latency = end_time - start_time @@ -157,9 +193,10 @@ async def send_request( return sample_id, pass_num, {"error": str(e)}, None async def shutdown(self): - """Clean up resources.""" - if self.http_client: - await self.http_client.aclose() + """Clean up resources for all clients.""" + for http_client in self.http_clients: + if http_client: + await http_client.aclose() def load_tokenized_data(data_file: str) -> pd.DataFrame: @@ -198,13 +235,14 @@ def load_tokenized_data(data_file: str) -> pd.DataFrame: async def send_requests_async( - tokenized_df: pd.DataFrame, server_url: str, + tokenized_df: pd.DataFrame, server_urls: List[str], max_tokens: int = 100, max_concurrency: int = 256, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200, pass_k: int = 1): - """Send all requests to TensorRT-LLM server asynchronously. + """Send all requests to TensorRT-LLM server(s) asynchronously with round-robin load balancing. 
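# Editor's sketch (assumed values, not part of the patch): one HTTP client per
# server, mirroring how initialize() builds a client list and shutdown()
# closes every one of them. The connection limits are illustrative, not
# tuning advice.
import asyncio
import httpx

async def open_and_close(server_urls: list, limit: int = 256) -> list:
    clients = [
        httpx.AsyncClient(
            base_url=f"http://{url}/v1/",
            timeout=httpx.Timeout(1200),
            limits=httpx.Limits(max_connections=limit * 2,
                                max_keepalive_connections=limit * 2),
        )
        for url in server_urls
    ]
    try:
        return [str(c.base_url) for c in clients]  # stand-in for real requests
    finally:
        for c in clients:
            await c.aclose()

# asyncio.run(open_and_close(["localhost:8000", "localhost:8001"]))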
Args: + server_urls: List of server URLs for round-robin load balancing pass_k: Number of inference passes per sample for pass@k strategy Returns: @@ -213,11 +251,11 @@ async def send_requests_async( num_samples = len(tokenized_df) total_requests = num_samples * pass_k logger.info( - f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) to server with {max_concurrency} concurrent workers...") + f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...") - # Initialize client + # Initialize client with multiple servers for round-robin client = TRTLLMClient( - server_url=server_url, + server_urls=server_urls, temperature=temperature, top_k=top_k, top_p=top_p, @@ -427,7 +465,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], return result_df -async def process_requests_async(tokenized_df: pd.DataFrame, server_url: str, +async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[str], max_samples: int = None, max_tokens: int = 100, max_concurrency: int = 256, output_file: str = None, temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, @@ -435,6 +473,7 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_url: str, """Main processing function that handles requests and response extraction. Args: + server_urls: List of server URLs for round-robin load balancing pass_k: Number of inference passes per sample for pass@k strategy """ @@ -446,7 +485,7 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_url: str, # Step 2: Send all requests asynchronously (k passes per sample) responses_by_pass, latencies_by_pass = await send_requests_async( tokenized_df, - server_url, + server_urls, max_tokens, max_concurrency, temperature, @@ -482,7 +521,7 @@ def main(): parser.add_argument("--input-tokens", required=True, help="Path to pickle file containing data with text_input column from harmony-tokens.py") parser.add_argument("--server-url", default="localhost:8000", - help="TensorRT-LLM server URL (default: localhost:8000)") + help="TensorRT-LLM server URL(s) - comma-separated for round-robin load balancing (default: localhost:8000)") parser.add_argument("--max-samples", type=int, default=None, help="Maximum number of samples to process (default: all)") parser.add_argument("--max-tokens", type=int, default=100, @@ -504,11 +543,15 @@ def main(): args = parser.parse_args() + # Parse comma-separated server URLs + server_urls = [url.strip() for url in args.server_url.split(',')] + logger.info(f"Configured {len(server_urls)} server(s) for round-robin load balancing") + # Test connection async def test_connection(): - logger.info(f"Testing server connection to {args.server_url}...") + logger.info(f"Testing server connection(s)...") client = TRTLLMClient( - server_url=args.server_url, + server_urls=server_urls, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -522,7 +565,7 @@ async def test_connection(): prompt="Test", max_tokens=5, sample_id=0, pass_num=0) if "error" in test_response: logger.error(f"Server connection failed: {test_response['error']}") - logger.error("Make sure your TensorRT-LLM server is running with OpenAI endpoint enabled.") + logger.error("Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") return False logger.info("Server connection successful") return True @@ -538,7 +581,7 @@ async def test_connection(): # Process requests and get result DataFrame result_df = 
asyncio.run(process_requests_async( - tokenized_df, args.server_url, + tokenized_df, server_urls, max_samples=args.max_samples, max_tokens=args.max_tokens, max_concurrency=args.max_concurrency, From a2936a6cd5f7c4777be501f6c6db739869135359 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Sat, 8 Nov 2025 02:56:33 +0000 Subject: [PATCH 084/165] fix timeout issues --- language/gpt-oss/eval_accuracy.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 66b10ef919..7806dd2be5 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -32,7 +32,7 @@ from typing import Dict, Any, Optional, Tuple, Union import pandas as pd from tqdm import tqdm -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed, TimeoutError import multiprocessing from pathlib import Path @@ -402,7 +402,7 @@ def evaluate_livecodebench_detailed( scenario=Scenario.codegeneration, release_version="release_v6", subset="code_generation", language="python", not_fast=False, start_date=None, end_date=None, k=[1], num_samples=1, - timeout=60, num_workers=1, num_process_evaluate=1, + timeout=20, num_workers=1, num_process_evaluate=1, model_name="inline_handler_eval", output_dir=temp_dir, prompt_type="custom", continue_existing=False, evaluate=True, ) @@ -1037,12 +1037,17 @@ def process_livecodebench_parallel( try: question_id, is_correct, detailed_reason = future.result( - timeout=30) + timeout=25) df.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 df.at[idx, evaluation_details_col] = detailed_reason total_evaluated += 1 if is_correct: correct_count += 1 + except TimeoutError as e: + logger.warning(f"Timeout evaluating row {idx} (question_id: {df.at[idx, 'ground_truth'] if 'ground_truth' in df.columns else 'unknown'}){' ' + pass_label if pass_label else ''}: Test execution exceeded 25s timeout") + df.at[idx, prompt_accuracy_col] = 0.0 + df.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" + total_evaluated += 1 except Exception as e: logger.error( f"Error evaluating row {idx}{' ' + pass_label if pass_label else ''}: {e}") From 188411cc4a825c53087b4ba21a6cf847d813aece Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 8 Nov 2025 02:57:03 +0000 Subject: [PATCH 085/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 7806dd2be5..20afd7c879 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -1044,7 +1044,8 @@ def process_livecodebench_parallel( if is_correct: correct_count += 1 except TimeoutError as e: - logger.warning(f"Timeout evaluating row {idx} (question_id: {df.at[idx, 'ground_truth'] if 'ground_truth' in df.columns else 'unknown'}){' ' + pass_label if pass_label else ''}: Test execution exceeded 25s timeout") + logger.warning( + f"Timeout evaluating row {idx} (question_id: {df.at[idx, 'ground_truth'] if 'ground_truth' in df.columns else 'unknown'}){' ' + pass_label if pass_label else ''}: Test execution exceeded 25s timeout") df.at[idx, prompt_accuracy_col] = 0.0 df.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" total_evaluated += 1 From b324d7db04f78f1bd19c783e2274082b4b67e438 Mon Sep 17 
00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 18:47:57 -0800 Subject: [PATCH 086/165] add anthropic --- language/gpt-oss/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss/requirements.txt index f86ed9de3a..aab1573917 100644 --- a/language/gpt-oss/requirements.txt +++ b/language/gpt-oss/requirements.txt @@ -7,4 +7,5 @@ scikit-learn>=0.20.0 soxr>=0.3.2 absl-py>=2.3.1 lazy-loader>=0.1 -datasets>=2.0.0,<3.0.0 \ No newline at end of file +datasets>=2.0.0,<3.0.0 +anthropic~=0.72.0 From f8a9f438cdf2bbdc726ee68466e98fbdc1538f79 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 10 Nov 2025 18:54:16 -0800 Subject: [PATCH 087/165] add stuff --- language/gpt-oss/run_pipeline_trtllm.sh | 39 ++++++++++++++++ language/gpt-oss/run_server_trtllm.sh | 59 +++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100755 language/gpt-oss/run_pipeline_trtllm.sh create mode 100755 language/gpt-oss/run_server_trtllm.sh diff --git a/language/gpt-oss/run_pipeline_trtllm.sh b/language/gpt-oss/run_pipeline_trtllm.sh new file mode 100755 index 0000000000..b4b692914d --- /dev/null +++ b/language/gpt-oss/run_pipeline_trtllm.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --output slurm_logs/run_%j/stdout.txt + +output_dir=slurm_logs/run_$SLURM_JOBID + +srun_header="srun \ + --container-mounts=$(pwd)/../:/work,/lustre/share/coreai_mlperf_inference/mlperf_inference_storage_clone:/home/mlperf_inference_storage/ \ + --container-name=trtllm_gptoss_2 \ + --container-mount-home --container-remap-root --container-workdir /work/gpt-oss" + +set -x + +$srun_header --container-image ./sqsh_files/trtllm_with_nettools.sqsh --output slurm_logs/run_$SLURM_JOBID/server_output.log ./run_server_trtllm.sh --model_path /home/mlperf_inference_storage/models/gpt-oss/gpt-oss-120b --output_dir $output_dir & + +sleep 20 + +$srun_header --overlap /bin/bash -c ' + while ! netstat -tuln | grep -q ":30000 .*LISTEN"; do + sleep 5 + done +' + +$srun_header --overlap /bin/bash -c ' + while ! 
netstat -tuln | grep -q ":30007 .*LISTEN"; do + sleep 5 + done +' + +sleep 20 + +$srun_header --overlap python3 run_infer_trtllm.py \ + --input-tokens data/accuracy_eval_tokenized.pkl \ + --output data/accuracy_eval_inferred_trtllm_job-$SLURM_JOBID-nongreedy_temp1_top-p1.pkl \ + --max-tokens 32768 \ + --server-url localhost:30000,localhost:30001,localhost:30002,localhost:30003,localhost:30004,localhost:30005,localhost:30006,localhost:30007 \ + --max-concurrency 2048 \ + --pass-k 5 \ + --temperature 1.0 \ + --top-p 1.0 --top-k 0 --timeout 2400 diff --git a/language/gpt-oss/run_server_trtllm.sh b/language/gpt-oss/run_server_trtllm.sh new file mode 100755 index 0000000000..233363a277 --- /dev/null +++ b/language/gpt-oss/run_server_trtllm.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +for var in $(compgen -v | grep '^SLURM_'); do unset "$var"; done + +model_path=openai/gpt-oss-120b +extra_args="" +output_dir=./data + +while [[ $# -gt 0 ]]; do + case $1 in + --model_path) + model_path=$2 + shift 2 + ;; + --output_dir) + output_dir=$2 + shift 2 + ;; + *) + extra_args="$extra_args $2" + ;; + esac +done + + +cat < config.yml +enable_attention_dp: false +enable_autotuner: false +cuda_graph_config: + max_batch_size: 256 + enable_padding: true + # speculative_config: + # decoding_type: Eagle + # max_draft_len: 3 + # speculative_model_dir: + # eagle3_layers_to_capture: [-1] +kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 +moe_config: + backend: TRTLLM +print_iter_log: true +EOF + + +gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +set -x; + +for ((gpu=0; gpu $output_dir/trtllm-serve-$gpu 2>&1 +done + +# num_servers=2 +# CUDA_VISIBLE_DEVICES=0,1,2,3 TRTLLM_ENABLE_PDL=1 trtllm-serve $model_path --host 0.0.0.0 --port 30000 --backend pytorch --max_batch_size 1024 --tp_size 4 --ep_size 1 --trust_remote_code --extra_llm_api_options config.yml $extra_args & > $output_dir/trtllm-serve-0.log 2>&1 +# CUDA_VISIBLE_DEVICES=4,5,6,7 TRTLLM_ENABLE_PDL=1 trtllm-serve $model_path --host 0.0.0.0 --port 30001 --backend pytorch --max_batch_size 1024 --tp_size 4 --ep_size 1 --trust_remote_code --extra_llm_api_options config.yml $extra_args & > $output_dir/trtllm-serve-1.log 2>&1 + +wait + From 84292443f6b3f73e4bba4782599c990196866f08 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 11 Nov 2025 05:39:58 +0000 Subject: [PATCH 088/165] optimize lcb multi-pass --- language/gpt-oss/eval_accuracy.py | 253 ++++++++++++++++++++---------- 1 file changed, 171 insertions(+), 82 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 20afd7c879..0fcaaaa08b 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -1343,99 +1343,188 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: desc="Processing datasets"): evaluator = get_evaluator(dataset_name) - # Process each pass - for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' - model_output_col = f'model_output{suffix}' - extracted_answer_col = f'extracted_answer{suffix}' - prompt_accuracy_col = f'prompt_accuracy{suffix}' - evaluation_details_col = f'evaluation_details{suffix}' - + # For LiveCodeBench with multiple passes, collect all work upfront + # to maximize parallelism + is_livecodebench = 'livecodebench' in dataset_name.lower() + if is_livecodebench and pass_k > 1: + # Parse all passes first logger.info( - f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") + f"Parsing 
{len(group_indices)} rows for dataset '{dataset_name}' across {pass_k} passes") + for pass_num in range(pass_k): + suffix = f'_{pass_num}' + model_output_col = f'model_output{suffix}' + extracted_answer_col = f'extracted_answer{suffix}' + evaluation_details_col = f'evaluation_details{suffix}' - # Parse answers for all rows in this dataset for this pass - for idx in group_indices: - row = df_output.loc[idx] - raw_output = validate_text_input(row[model_output_col]) - extracted = evaluator['parse'](raw_output) - df_output.at[idx, extracted_answer_col] = extracted - - # Set initial evaluation details for rows without extracted - # answers - if extracted is None or pd.isna(extracted): - df_output.at[idx, - evaluation_details_col] = "No answer extracted from model output" - - # Evaluate answers for this pass - pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' - - if 'livecodebench' in dataset_name.lower(): - correct_count, total_evaluated = process_livecodebench_parallel( - df_output, - group_indices, - extracted_answer_col=extracted_answer_col, - prompt_accuracy_col=prompt_accuracy_col, - evaluation_details_col=evaluation_details_col, - pass_label=pass_label_str, - executor=lcb_executor # Reuse shared executor - ) - elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - batched across - # all rows - total_score = 0.0 - total_evaluated = 0 - - # Process all rows with batched grading for this pass - results = evaluate_healthbench_batch( - df_output, - group_indices, - grader_api_key=LLM_JUDGE_API_KEY, - grader_base_url=LLM_JUDGE_BASE_URL, - grader_model=LLM_JUDGE_MODEL, - max_workers=LLM_JUDGE_MAX_WORKERS, - extracted_answer_col=extracted_answer_col, - pass_label=pass_label_str - ) - - # Store results for this pass - for idx, (score, explanation) in results.items(): - # Store score as percentage (0-100) - df_output.at[idx, prompt_accuracy_col] = score * 100.0 - df_output.at[idx, evaluation_details_col] = explanation - total_evaluated += 1 - total_score += score - else: - # Sequential evaluation for other datasets - correct_count = 0 - total_evaluated = 0 + for idx in group_indices: + row = df_output.loc[idx] + raw_output = validate_text_input(row[model_output_col]) + extracted = evaluator['parse'](raw_output) + df_output.at[idx, extracted_answer_col] = extracted + if extracted is None or pd.isna(extracted): + df_output.at[idx, + evaluation_details_col] = "No answer extracted from model output" + + # Collect all work items from all passes + all_work_items = [] + work_item_metadata = [] # (idx, pass_num) + for pass_num in range(pass_k): + extracted_answer_col = f'extracted_answer_{pass_num}' for idx in group_indices: row = df_output.loc[idx] - extracted = row[extracted_answer_col] + extracted = row.get(extracted_answer_col) ground_truth = row.get('ground_truth') if extracted is not None and not pd.isna(ground_truth): - is_correct = evaluator['evaluate']( - extracted, ground_truth) + all_work_items.append((extracted, ground_truth)) + work_item_metadata.append((idx, pass_num)) + + if all_work_items: + # Submit all work at once for maximum parallelism + max_workers = min(multiprocessing.cpu_count(), len(all_work_items), 64) + logger.info( + f"Evaluating {len(all_work_items)} LiveCodeBench items across {pass_k} passes with {max_workers} workers") + + future_to_metadata = { + lcb_executor.submit(evaluate_livecodebench_worker, work_item): metadata + for work_item, metadata in zip(all_work_items, work_item_metadata) + } + + # Collect results and assign to 
appropriate pass columns + pass_results = {i: {'correct': 0, 'total': 0} for i in range(pass_k)} + + for future in tqdm(as_completed(future_to_metadata, timeout=1200), + total=len(future_to_metadata), + desc=f"Evaluating LiveCodeBench (all passes)"): + idx, pass_num = future_to_metadata[future] + prompt_accuracy_col = f'prompt_accuracy_{pass_num}' + evaluation_details_col = f'evaluation_details_{pass_num}' + + try: + question_id, is_correct, detailed_reason = future.result(timeout=25) + df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 + df_output.at[idx, evaluation_details_col] = detailed_reason + pass_results[pass_num]['total'] += 1 + if is_correct: + pass_results[pass_num]['correct'] += 1 + except TimeoutError: + logger.warning( + f"Timeout evaluating row {idx} pass {pass_num}: Test execution exceeded 25s timeout") + df_output.at[idx, prompt_accuracy_col] = 0.0 + df_output.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" + pass_results[pass_num]['total'] += 1 + except Exception as e: + logger.error( + f"Error evaluating row {idx} pass {pass_num}: {e}") + df_output.at[idx, prompt_accuracy_col] = 0.0 + df_output.at[idx, evaluation_details_col] = f"Error: {e}" + pass_results[pass_num]['total'] += 1 + + # Log results for each pass + for pass_num in range(pass_k): + if pass_results[pass_num]['total'] > 0: + correct = pass_results[pass_num]['correct'] + total = pass_results[pass_num]['total'] + accuracy = correct / total * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: {correct}/{total} correct ({accuracy:.1f}% accuracy)") + + else: + # Original sequential pass processing for non-LCB or single-pass LCB + for pass_num in range(pass_k): + suffix = f'_{pass_num}' if pass_k > 1 else '' + model_output_col = f'model_output{suffix}' + extracted_answer_col = f'extracted_answer{suffix}' + prompt_accuracy_col = f'prompt_accuracy{suffix}' + evaluation_details_col = f'evaluation_details{suffix}' + + logger.info( + f"Processing {len(group_indices)} rows for dataset '{dataset_name}', pass {pass_num}") + + # Parse answers for all rows in this dataset for this pass + for idx in group_indices: + row = df_output.loc[idx] + raw_output = validate_text_input(row[model_output_col]) + extracted = evaluator['parse'](raw_output) + df_output.at[idx, extracted_answer_col] = extracted + + # Set initial evaluation details for rows without extracted + # answers + if extracted is None or pd.isna(extracted): df_output.at[idx, - prompt_accuracy_col] = 100.0 if is_correct else 0.0 + evaluation_details_col] = "No answer extracted from model output" + + # Evaluate answers for this pass + pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' + + if is_livecodebench: + # Single-pass LCB evaluation + correct_count, total_evaluated = process_livecodebench_parallel( + df_output, + group_indices, + extracted_answer_col=extracted_answer_col, + prompt_accuracy_col=prompt_accuracy_col, + evaluation_details_col=evaluation_details_col, + pass_label=pass_label_str, + executor=lcb_executor # Reuse shared executor + ) + elif 'healthbench' in dataset_name.lower(): + # HealthBench evaluation with LLM grading - batched across + # all rows + total_score = 0.0 + total_evaluated = 0 + + # Process all rows with batched grading for this pass + results = evaluate_healthbench_batch( + df_output, + group_indices, + grader_api_key=LLM_JUDGE_API_KEY, + grader_base_url=LLM_JUDGE_BASE_URL, + grader_model=LLM_JUDGE_MODEL, + max_workers=LLM_JUDGE_MAX_WORKERS, + 
extracted_answer_col=extracted_answer_col, + pass_label=pass_label_str + ) + + # Store results for this pass + for idx, (score, explanation) in results.items(): + # Store score as percentage (0-100) + df_output.at[idx, prompt_accuracy_col] = score * 100.0 + df_output.at[idx, evaluation_details_col] = explanation total_evaluated += 1 - if is_correct: - correct_count += 1 - - # Log results for this pass - if total_evaluated > 0: - if 'healthbench' in dataset_name.lower(): - # For HealthBench, report average score - avg_score = total_score / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + total_score += score else: - # For other datasets, report accuracy - accuracy = correct_count / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + # Sequential evaluation for other datasets + correct_count = 0 + total_evaluated = 0 + + for idx in group_indices: + row = df_output.loc[idx] + extracted = row[extracted_answer_col] + ground_truth = row.get('ground_truth') + + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate']( + extracted, ground_truth) + df_output.at[idx, + prompt_accuracy_col] = 100.0 if is_correct else 0.0 + total_evaluated += 1 + if is_correct: + correct_count += 1 + + # Log results for this pass + if total_evaluated > 0: + if 'healthbench' in dataset_name.lower(): + # For HealthBench, report average score + avg_score = total_score / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") + else: + # For other datasets, report accuracy + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") # Aggregate results across all passes (take max) if pass_k > 1: From d1f2794322938aae33b4ec36c88f59b496253095 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 11 Nov 2025 05:40:27 +0000 Subject: [PATCH 089/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 37 ++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 0fcaaaa08b..f656866285 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -1382,7 +1382,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: if all_work_items: # Submit all work at once for maximum parallelism - max_workers = min(multiprocessing.cpu_count(), len(all_work_items), 64) + max_workers = min( + multiprocessing.cpu_count(), len(all_work_items), 64) logger.info( f"Evaluating {len(all_work_items)} LiveCodeBench items across {pass_k} passes with {max_workers} workers") @@ -1392,19 +1393,23 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: } # Collect results and assign to appropriate pass columns - pass_results = {i: {'correct': 0, 'total': 0} for i in range(pass_k)} - + pass_results = {i: {'correct': 0, 'total': 0} + for i in range(pass_k)} + for future in tqdm(as_completed(future_to_metadata, timeout=1200), - total=len(future_to_metadata), + total=len(future_to_metadata), desc=f"Evaluating LiveCodeBench (all passes)"): idx, pass_num = future_to_metadata[future] prompt_accuracy_col = f'prompt_accuracy_{pass_num}' 
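# Editor's sketch (illustrative, editor-named helpers): the submit/collect
# pattern used for the multi-pass LiveCodeBench evaluation above -- every
# (row, pass) work item is submitted up front and results are mapped back
# through a future->metadata dict with a per-result timeout. slow_eval() is a
# stand-in for the real worker.
from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError

def slow_eval(item):
    return item, True  # placeholder: (identifier, is_correct)

def evaluate_all(work_items, metadata, max_workers=4):
    results = {}
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        future_to_meta = {pool.submit(slow_eval, w): m
                          for w, m in zip(work_items, metadata)}
        for fut in as_completed(future_to_meta):
            idx, pass_num = future_to_meta[fut]
            try:
                _, ok = fut.result(timeout=25)
            except TimeoutError:
                ok = False  # same fallback as the patch: score 0 on timeout
            except Exception:
                ok = False
            results[(idx, pass_num)] = ok
    return results

# evaluate_all(["a", "b"], [(0, 0), (0, 1)])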
evaluation_details_col = f'evaluation_details_{pass_num}' try: - question_id, is_correct, detailed_reason = future.result(timeout=25) - df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 - df_output.at[idx, evaluation_details_col] = detailed_reason + question_id, is_correct, detailed_reason = future.result( + timeout=25) + df_output.at[idx, + prompt_accuracy_col] = 100.0 if is_correct else 0.0 + df_output.at[idx, + evaluation_details_col] = detailed_reason pass_results[pass_num]['total'] += 1 if is_correct: pass_results[pass_num]['correct'] += 1 @@ -1412,13 +1417,15 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: logger.warning( f"Timeout evaluating row {idx} pass {pass_num}: Test execution exceeded 25s timeout") df_output.at[idx, prompt_accuracy_col] = 0.0 - df_output.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" + df_output.at[idx, + evaluation_details_col] = "Timeout: Test execution exceeded time limit" pass_results[pass_num]['total'] += 1 except Exception as e: logger.error( f"Error evaluating row {idx} pass {pass_num}: {e}") df_output.at[idx, prompt_accuracy_col] = 0.0 - df_output.at[idx, evaluation_details_col] = f"Error: {e}" + df_output.at[idx, + evaluation_details_col] = f"Error: {e}" pass_results[pass_num]['total'] += 1 # Log results for each pass @@ -1431,7 +1438,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: f"{dataset_name} pass {pass_num} results: {correct}/{total} correct ({accuracy:.1f}% accuracy)") else: - # Original sequential pass processing for non-LCB or single-pass LCB + # Original sequential pass processing for non-LCB or + # single-pass LCB for pass_num in range(pass_k): suffix = f'_{pass_num}' if pass_k > 1 else '' model_output_col = f'model_output{suffix}' @@ -1490,8 +1498,10 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # Store results for this pass for idx, (score, explanation) in results.items(): # Store score as percentage (0-100) - df_output.at[idx, prompt_accuracy_col] = score * 100.0 - df_output.at[idx, evaluation_details_col] = explanation + df_output.at[idx, + prompt_accuracy_col] = score * 100.0 + df_output.at[idx, + evaluation_details_col] = explanation total_evaluated += 1 total_score += score else: @@ -1504,7 +1514,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: extracted = row[extracted_answer_col] ground_truth = row.get('ground_truth') - if extracted is not None and not pd.isna(ground_truth): + if extracted is not None and not pd.isna( + ground_truth): is_correct = evaluator['evaluate']( extracted, ground_truth) df_output.at[idx, From ee33969badd534228816487bfc71350a80f5337d Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 11 Nov 2025 07:19:36 +0000 Subject: [PATCH 090/165] rm healthbench --- language/gpt-oss/eval_accuracy.py | 905 ++---------------------------- 1 file changed, 39 insertions(+), 866 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index f656866285..beb429b67e 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -5,18 +5,14 @@ Expected input format (pickle file with DataFrame): - model_output: The model's response text - tok_model_output_len: The length of the model's response tokens -- ground_truth: The expected answer (not required for healthbench) -- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 'math500', 'livecodebench', 'aime', 'healthbench') +- ground_truth: The expected answer +- dataset: Dataset name (e.g., 'gpqa', 'mmlu_pro', 
'math500', 'livecodebench', 'aime') - question: The question text -- rubrics: List of rubric items (required for healthbench) -- prompt: Conversation history (required for healthbench) Output adds columns: - extracted_answer: Parsed answer from model output - prompt_accuracy: 100.0 if correct, 0.0 if incorrect -- evaluation_details: Detailed evaluation explanation (for healthbench) - -For HealthBench evaluation, set OPENAI_API_KEY environment variable for LLM-as-a-judge grading. +- evaluation_details: Detailed evaluation explanation """ import sys @@ -24,7 +20,6 @@ import argparse import logging import pickle -import json import re import shutil import time @@ -32,7 +27,7 @@ from typing import Dict, Any, Optional, Tuple, Union import pandas as pd from tqdm import tqdm -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed, TimeoutError +from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError import multiprocessing from pathlib import Path @@ -44,16 +39,6 @@ format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) -# Suppress verbose HTTP logs from OpenAI/httpx client -logging.getLogger("httpx").setLevel(logging.WARNING) -logging.getLogger("openai").setLevel(logging.WARNING) - -# Global configuration for HealthBench LLM judge -LLM_JUDGE_BASE_URL = None # None = default to OpenAI API -LLM_JUDGE_MODEL = None # None = auto-select based on base URL -LLM_JUDGE_API_KEY = None # None = auto-select from environment -LLM_JUDGE_MAX_WORKERS = None # None = auto-select based on rubric count - # ============================================================================= # Input Validation # ============================================================================= @@ -477,428 +462,6 @@ def evaluate_livecodebench_worker( return question_id, False, error_msg -# ============================================================================= -# HealthBench Evaluation Functions -# ============================================================================= - -HEALTHBENCH_GRADER_TEMPLATE = """Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item. - -# Conversation - - -# Rubric item - - -# Instructions -Return a json object with the following fields: "explanation" and "criteria_met". - -- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item. -- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. - -If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true if all of the criteria are met. - -- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. - -# Example 1 -For example, if the conversation is "user: Should i go to the hospital if I am unconscious? 
assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this: -```json -{ - "explanation": "The assistant did not tell the user to go to the hospital if unconscious.", - "criteria_met": false -} -``` - -# Example 2 -Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not. - -For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this: -```json -{ - "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.", - "criteria_met": false -} -``` - -As another example, if the criteria says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this: -```json -{ - "explanation": "The response is concise, so the criteria was not met.", - "criteria_met": false -} -``` - -In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true. - -# Final instruction -Return just the json object in markdown format. Do not include any other text in the response. -""".strip() - - -class RubricItem: - """Represents a single rubric criterion for HealthBench evaluation.""" - - def __init__(self, criterion: str, points: float, tags: list): - self.criterion = criterion - self.points = points - self.tags = tags - - def __str__(self): - return f"[{self.points}] {self.criterion}" - - def to_dict(self): - return { - "criterion": self.criterion, - "points": self.points, - "tags": self.tags, - } - - @classmethod - def from_dict(cls, d: dict): - return cls( - criterion=d["criterion"], - points=d["points"], - tags=d.get("tags", []), - ) - - -def parse_healthbench_json(json_string: str) -> dict: - """Parse JSON response from grader, handling markdown code blocks.""" - json_cleaned = re.sub( - r"^```json\s*|\s*```$", - "", - json_string.strip(), - flags=re.MULTILINE) - try: - return json.loads(json_cleaned) - except json.JSONDecodeError as e: - logger.warning(f"JSON decoding failed: {e}") - logger.warning( - f"Raw LLM response (first 500 chars): {json_string[:500]}") - logger.warning( - f"Cleaned response (first 500 chars): {json_cleaned[:500]}") - return {"explanation": "Failed to parse response", "criteria_met": False} - - -def calculate_healthbench_score( - rubric_items: list, grading_responses: list -) -> float: - """Calculate HealthBench score based on rubric items and grading responses. 
- - Args: - rubric_items: List of RubricItem objects - grading_responses: List of dicts with 'criteria_met' and 'explanation' - - Returns: - Score between 0 and 1, or 0 if no positive points available - """ - total_possible_points = sum( - item.points for item in rubric_items if item.points > 0 - ) - - if total_possible_points == 0: - return 0.0 - - achieved_points = sum( - item.points - for item, response in zip(rubric_items, grading_responses) - if response.get("criteria_met", False) - ) - - overall_score = achieved_points / total_possible_points - return max(0.0, min(1.0, overall_score)) # Clamp to [0, 1] - - -def grade_healthbench_with_llm( - prompt_messages: list, - model_output: str, - rubric_items: list, - grader_api_key: Optional[str] = None, - grader_model: str = "gpt-4o-mini", - grader_base_url: str = "https://api.openai.com/v1", - grader_backend: str = "openai", - max_workers: Optional[int] = None -) -> Tuple[float, str]: - """Grade a HealthBench response using LLM-as-a-judge. - - Args: - prompt_messages: List of conversation messages - model_output: The model's response to grade - rubric_items: List of RubricItem objects - grader_api_key: API key for grader (OpenAI or NVIDIA NIM) - grader_model: Model to use for grading - grader_base_url: Base URL for API - grader_backend: Backend to use - "openai" or "nvidia" (default: "openai") - max_workers: Max concurrent requests for rubric grading (default: all rubrics in parallel) - - Returns: - Tuple of (score, detailed_explanation) - """ - # Use API key from environment if not provided - if grader_api_key is None: - if grader_backend == "nvidia": - grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") - # Check if it's an official NVIDIA URL that requires a key - if not grader_api_key and "nvidia.com" in grader_base_url.lower(): - logger.warning( - "No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") - return 0.0, "Error: No NVIDIA NIM API key provided" - else: - grader_api_key = os.environ.get("OPENAI_API_KEY") - # Check if it's an official OpenAI URL that requires a key - if not grader_api_key and "api.openai.com" in grader_base_url.lower(): - logger.warning( - "No OpenAI API key found. 
Set OPENAI_API_KEY environment variable.") - return 0.0, "Error: No OpenAI API key provided" - - # For local servers, use a dummy key if none provided - if grader_api_key is None: - grader_api_key = "dummy-key-for-local-server" - logger.info( - f"Using local server at {grader_base_url}, no API key required") - - # Format conversation - conversation_text = "" - for msg in prompt_messages: - role = msg.get("role", "unknown") - content = msg.get("content", "") - conversation_text += f"{role}: {content}\n\n" - conversation_text += f"assistant: {model_output}" - - # Prepare all grading prompts - grading_tasks = [] - for rubric_item in rubric_items: - grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( - "", conversation_text - ).replace("", str(rubric_item)) - grading_tasks.append((rubric_item, grading_prompt)) - - # Submit all requests concurrently for server-side batching - grading_responses = [] - - def _grade_single_rubric(task_data): - """Helper to grade a single rubric item.""" - rubric_item, grading_prompt = task_data - try: - if grader_backend == "nvidia": - response_text = _call_nvidia_nim_api( - api_key=grader_api_key, - model=grader_model, - messages=[{"role": "user", "content": grading_prompt}], - base_url=grader_base_url, - temperature=0.0, - max_tokens=1024 - ) - else: - response_text = _call_openai_api( - api_key=grader_api_key, - model=grader_model, - messages=[{"role": "user", "content": grading_prompt}], - base_url=grader_base_url, - temperature=0.0, - max_tokens=1024 - ) - return parse_healthbench_json(response_text) - except Exception as e: - logger.warning(f"Error grading rubric item: {e}") - return { - "explanation": f"Error during grading: {e}", - "criteria_met": False - } - - # Use ThreadPoolExecutor to send all requests concurrently - # The server can batch these together for efficient processing - # Default to sending all rubric items in parallel if max_workers not - # specified - num_workers = max_workers if max_workers is not None else len( - grading_tasks) - with ThreadPoolExecutor(max_workers=num_workers) as executor: - grading_responses = list( - executor.map( - _grade_single_rubric, - grading_tasks)) - - # Calculate overall score - score = calculate_healthbench_score(rubric_items, grading_responses) - - # Create detailed explanation - explanations = [] - for rubric_item, response in zip(rubric_items, grading_responses): - met = response.get("criteria_met", False) - explanation = response.get("explanation", "No explanation") - explanations.append( - f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" - ) - - detailed_explanation = "\n\n".join(explanations) - - return score, detailed_explanation - - -def _call_openai_api( - api_key: str, - model: str, - messages: list, - base_url: str, - temperature: float = 0.0, - max_tokens: int = 1024 -) -> str: - """Call OpenAI API for grading. - - Args: - api_key: OpenAI API key - model: Model name - messages: List of messages - base_url: Base URL for API - temperature: Sampling temperature - max_tokens: Maximum tokens in response - - Returns: - Response text from the model - """ - try: - from openai import OpenAI - except ImportError: - raise ImportError( - "openai package required. 
Install with: pip install openai") - - client = OpenAI(api_key=api_key, base_url=base_url) - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - ) - return response.choices[0].message.content - - -def _call_nvidia_nim_api( - api_key: str, - model: str, - messages: list, - base_url: str = "https://integrate.api.nvidia.com/v1/chat/completions", - temperature: float = 0.0, - max_tokens: int = 1024 -) -> str: - """Call NVIDIA NIM API for grading. - - Args: - api_key: NVIDIA NIM API key - model: Model name (e.g., 'deepseek-ai/deepseek-v3.1-terminus') - messages: List of messages - base_url: Base URL for NVIDIA NIM API - temperature: Sampling temperature - max_tokens: Maximum tokens in response - - Returns: - Response text from the model - """ - try: - import requests - except ImportError: - raise ImportError( - "requests package required. Install with: pip install requests") - - headers = { - 'Authorization': f'Bearer {api_key}', - 'Accept': 'application/json', - 'Content-Type': 'application/json' - } - - payload = { - 'model': model, - 'messages': messages, - 'temperature': temperature, - 'max_tokens': max_tokens - } - - response = requests.post( - base_url, - headers=headers, - json=payload, - timeout=200) - response.raise_for_status() - - response_data = response.json() - return response_data['choices'][0]['message']['content'] - - -def parse_healthbench(text: str) -> Optional[str]: - """Parse HealthBench response - returns the full text as-is.""" - return validate_text_input(text) or None - - -def evaluate_healthbench( - parsed_output: Optional[str], - row_data: pd.Series, - grader_api_key: Optional[str] = None, - grader_base_url: Optional[str] = None, - grader_model: Optional[str] = None, - max_workers: Optional[int] = None -) -> Tuple[float, Optional[str]]: - """Evaluate HealthBench response using LLM grading. 
- - Args: - parsed_output: The model output text - row_data: Full row data containing 'rubrics' and 'prompt' - grader_api_key: Optional API key for grader - grader_base_url: Base URL for API (default: OpenAI API) - grader_model: Optional model name override - max_workers: Max concurrent requests for rubric grading - - Returns: - Tuple of (score, detailed_explanation) where score is 0.0-1.0 - """ - if not parsed_output: - return 0.0, "Empty output" - - # Extract rubrics from row - rubrics = row_data.get('rubrics', []) - if not rubrics: - logger.warning("No rubrics found in row data") - return 0.0, "No rubrics available" - - # Convert to RubricItem objects - rubric_items = [RubricItem.from_dict(r) for r in rubrics] - - # Extract prompt/conversation - prompt = row_data.get('prompt', []) - if isinstance(prompt, str): - # If prompt is a string, convert to message format - prompt = [{"role": "user", "content": prompt}] - - # Set default base URL if not provided - if grader_base_url is None: - grader_base_url = "https://api.openai.com/v1" - - # Auto-detect backend based on URL - if "nvidia.com" in grader_base_url.lower(): - grader_backend = "nvidia" - # Set default model for NVIDIA if not specified - if grader_model is None: - grader_model = "deepseek-ai/deepseek-v3.1-terminus" - else: - grader_backend = "openai" - # Set default model for OpenAI if not specified - if grader_model is None: - grader_model = "gpt-4o-mini" - - # Grade using LLM - score, explanation = grade_healthbench_with_llm( - prompt_messages=prompt, - model_output=parsed_output, - rubric_items=rubric_items, - grader_api_key=grader_api_key, - grader_model=grader_model, - grader_base_url=grader_base_url, - grader_backend=grader_backend, - max_workers=max_workers - ) - - # Return the score (0.0 to 1.0) and detailed explanation - # Note: score is returned as-is, not converted to binary pass/fail - return score, f"Score: {score:.2%}\n\n{explanation}" - - # ============================================================================= # Dataset Configuration # ============================================================================= @@ -928,11 +491,6 @@ def evaluate_healthbench( 'parse': lambda text: parse_multiple_choice(text, 'J'), 'evaluate': lambda parsed, gt: evaluate_multiple_choice(parsed, gt, 'ABCDEFGHIJ') }, - 'healthbench': { - 'parse': parse_healthbench, - 'evaluate': evaluate_healthbench, - 'requires_row_data': True # Special flag for HealthBench - }, } @@ -971,319 +529,6 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_livecodebench_parallel( - df: pd.DataFrame, - group_indices: pd.Index, - extracted_answer_col: str = 'extracted_answer', - prompt_accuracy_col: str = 'prompt_accuracy', - evaluation_details_col: str = 'evaluation_details', - pass_label: str = '', - executor: Optional[ProcessPoolExecutor] = None) -> Tuple[int, int]: - """Process LiveCodeBench items in parallel. 
- - Args: - df: DataFrame with data - group_indices: Indices to process - extracted_answer_col: Column name for extracted answers - prompt_accuracy_col: Column name for accuracy results - evaluation_details_col: Column name for evaluation details - pass_label: Label for logging (e.g., 'pass 0', 'pass 1') - executor: Optional ProcessPoolExecutor to reuse (for performance) - - Returns: - Tuple of (correct_count, total_evaluated) - """ - # Prepare work items - work_items = [] - for idx in group_indices: - row = df.loc[idx] - extracted = row.get(extracted_answer_col) - ground_truth = row.get('ground_truth') - - if extracted is not None and not pd.isna(ground_truth): - work_items.append((idx, extracted, ground_truth)) - - if not work_items: - return 0, 0 - - # Ensure evaluation_details column exists - if evaluation_details_col not in df.columns: - df[evaluation_details_col] = None - - # Process in parallel - max_workers = min(multiprocessing.cpu_count(), len(work_items), 64) - desc = f"Evaluating LiveCodeBench{' ' + pass_label if pass_label else ''}" - logger.info( - f"Evaluating {len(work_items)} LiveCodeBench items{' ' + pass_label if pass_label else ''} with {max_workers} workers") - - correct_count = 0 - total_evaluated = 0 - - # Determine whether to create new executor or reuse existing one - should_close_executor = False - if executor is None: - executor = ProcessPoolExecutor(max_workers=max_workers) - should_close_executor = True - - try: - future_to_idx = { - executor.submit(evaluate_livecodebench_worker, (code, question_id)): idx - for idx, code, question_id in work_items - } - - for future in tqdm(as_completed(future_to_idx, timeout=1200), - total=len(future_to_idx), desc=desc): - idx = future_to_idx[future] - - try: - question_id, is_correct, detailed_reason = future.result( - timeout=25) - df.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 - df.at[idx, evaluation_details_col] = detailed_reason - total_evaluated += 1 - if is_correct: - correct_count += 1 - except TimeoutError as e: - logger.warning( - f"Timeout evaluating row {idx} (question_id: {df.at[idx, 'ground_truth'] if 'ground_truth' in df.columns else 'unknown'}){' ' + pass_label if pass_label else ''}: Test execution exceeded 25s timeout") - df.at[idx, prompt_accuracy_col] = 0.0 - df.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" - total_evaluated += 1 - except Exception as e: - logger.error( - f"Error evaluating row {idx}{' ' + pass_label if pass_label else ''}: {e}") - df.at[idx, prompt_accuracy_col] = 0.0 - df.at[idx, evaluation_details_col] = f"Error: {e}" - total_evaluated += 1 - finally: - # Only close if we created it - if should_close_executor: - executor.shutdown(wait=True) - - return correct_count, total_evaluated - - -def evaluate_healthbench_batch( - df: pd.DataFrame, - group_indices: pd.Index, - grader_api_key: Optional[str] = None, - grader_base_url: Optional[str] = None, - grader_model: Optional[str] = None, - max_workers: Optional[int] = None, - extracted_answer_col: str = 'extracted_answer', - pass_label: str = '' -) -> Dict[int, Tuple[float, str]]: - """Evaluate all HealthBench rows with batched rubric grading across all rows. 
- - Args: - df: DataFrame containing the data - group_indices: Indices of rows to evaluate - grader_api_key: Optional API key for grader - grader_base_url: Base URL for API - grader_model: Model name - max_workers: Max concurrent requests - extracted_answer_col: Column name for extracted answers (e.g., 'extracted_answer_0') - pass_label: Label for logging (e.g., 'pass 0') - - Returns: - Dictionary mapping row index to (score, explanation) tuple - """ - # Set default base URL if not provided - if grader_base_url is None: - grader_base_url = "https://api.openai.com/v1" - - # Auto-detect backend based on URL - if "nvidia.com" in grader_base_url.lower(): - grader_backend = "nvidia" - if grader_model is None: - grader_model = "deepseek-ai/deepseek-v3.1-terminus" - else: - grader_backend = "openai" - if grader_model is None: - grader_model = "gpt-4o-mini" - - # Handle API key - if grader_api_key is None: - if grader_backend == "nvidia": - grader_api_key = os.environ.get("NVIDIA_NIM_API_KEY") - if not grader_api_key and "nvidia.com" in grader_base_url.lower(): - logger.warning( - "No NVIDIA NIM API key found. Set NVIDIA_NIM_API_KEY environment variable.") - return {idx: (0.0, "Error: No NVIDIA NIM API key provided") - for idx in group_indices} - else: - grader_api_key = os.environ.get("OPENAI_API_KEY") - if not grader_api_key and "api.openai.com" in grader_base_url.lower(): - logger.warning( - "No OpenAI API key found. Set OPENAI_API_KEY environment variable.") - return {idx: (0.0, "Error: No OpenAI API key provided") - for idx in group_indices} - - if grader_api_key is None: - grader_api_key = "dummy-key-for-local-server" - logger.info( - f"Using local server at {grader_base_url}, no API key required") - - # Prepare all grading tasks for all rows - all_tasks = [] - row_rubric_map = {} # Maps task_id to (row_idx, rubric_idx) - task_id = 0 - - for idx in group_indices: - row = df.loc[idx] - extracted = row.get(extracted_answer_col) - - if extracted is None or pd.isna(extracted): - row_rubric_map[f"row_{idx}_skip"] = (idx, None) - continue - - # Extract rubrics and prompt - rubrics = row.get('rubrics', []) - if not rubrics: - logger.warning(f"No rubrics found for row {idx}") - row_rubric_map[f"row_{idx}_skip"] = (idx, None) - continue - - rubric_items = [RubricItem.from_dict(r) for r in rubrics] - prompt = row.get('prompt', []) - if isinstance(prompt, str): - prompt = [{"role": "user", "content": prompt}] - - # Format conversation - conversation_text = "" - for msg in prompt: - role = msg.get("role", "unknown") - content = msg.get("content", "") - conversation_text += f"{role}: {content}\n\n" - conversation_text += f"assistant: {extracted}" - - # Create grading tasks for all rubrics in this row - for rubric_idx, rubric_item in enumerate(rubric_items): - grading_prompt = HEALTHBENCH_GRADER_TEMPLATE.replace( - "", conversation_text - ).replace("", str(rubric_item)) - - all_tasks.append({ - 'task_id': task_id, - 'prompt': grading_prompt, - 'backend': grader_backend - }) - row_rubric_map[task_id] = (idx, rubric_idx, rubric_item) - task_id += 1 - - if not all_tasks: - logger.warning( - f"No grading tasks to process{' for ' + pass_label if pass_label else ''}") - return {} - - logger.info( - f"Batching {len(all_tasks)} rubric grading requests{' for ' + pass_label if pass_label else ''} across {len(group_indices)} rows") - - # Define grading function - def _grade_single_task(task): - """Grade a single rubric item.""" - try: - if task['backend'] == "nvidia": - response_text = _call_nvidia_nim_api( - 
api_key=grader_api_key, - model=grader_model, - messages=[{"role": "user", "content": task['prompt']}], - base_url=grader_base_url, - temperature=0.0, - max_tokens=1024 - ) - else: - response_text = _call_openai_api( - api_key=grader_api_key, - model=grader_model, - messages=[{"role": "user", "content": task['prompt']}], - base_url=grader_base_url, - temperature=0.0, - max_tokens=1024 - ) - return task['task_id'], parse_healthbench_json(response_text) - except Exception as e: - logger.warning(f"Error grading task {task['task_id']}: {e}") - return task['task_id'], { - "explanation": f"Error during grading: {e}", - "criteria_met": False - } - - # Send all requests concurrently for server-side batching - num_workers = max_workers if max_workers is not None else len(all_tasks) - grading_results = {} - - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = { - executor.submit( - _grade_single_task, - task): task['task_id'] for task in all_tasks} - - desc = f"Grading HealthBench{' ' + pass_label if pass_label else ''} (batched)" - for future in tqdm(as_completed(futures), total=len( - futures), desc=desc): - try: - task_id, result = future.result(timeout=60) - grading_results[task_id] = result - except Exception as e: - task_id = futures[future] - logger.error(f"Error processing task {task_id}: {e}") - grading_results[task_id] = { - "explanation": f"Error during grading: {e}", - "criteria_met": False - } - - # Reconstruct results per row - row_results = {} - # Group results by row: {row_idx: {rubric_idx: (rubric_item, - # grading_result)}} - rows_rubrics = {} - - for task_id, grading_result in grading_results.items(): - if task_id not in row_rubric_map: - continue - - row_idx, rubric_idx, rubric_item = row_rubric_map[task_id] - - if row_idx not in rows_rubrics: - rows_rubrics[row_idx] = {} - - rows_rubrics[row_idx][rubric_idx] = (rubric_item, grading_result) - - # Calculate scores for each row - for row_idx, rubric_data in rows_rubrics.items(): - # Sort by rubric_idx to maintain correct order - sorted_rubrics = sorted(rubric_data.items(), key=lambda x: x[0]) - rubric_items = [item for _, (item, _) in sorted_rubrics] - grading_responses = [response for _, (_, response) in sorted_rubrics] - - # Calculate overall score - score = calculate_healthbench_score(rubric_items, grading_responses) - - # Create detailed explanation - explanations = [] - for rubric_item, response in zip(rubric_items, grading_responses): - met = response.get("criteria_met", False) - explanation = response.get("explanation", "No explanation") - explanations.append( - f"[{'✓' if met else '✗'}] {rubric_item}\n Explanation: {explanation}" - ) - - detailed_explanation = f"Score: {score:.2%}\n\n" + \ - "\n\n".join(explanations) - row_results[row_idx] = (score, detailed_explanation) - - # Handle skipped rows - for key, value in row_rubric_map.items(): - if isinstance(key, str) and key.startswith( - "row_") and key.endswith("_skip"): - row_idx = value[0] - if row_idx not in row_results: - row_results[row_idx] = (0.0, "Empty output or no rubrics") - - return row_results - - def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: """Process entire dataframe with optimized batch processing. 
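
The hunks above and below lean on two conventions that are easy to miss in diff form: per-pass result columns get a `_<pass>` suffix only when `pass_k > 1`, and the final `prompt_accuracy` is aggregated by taking the maximum across passes. The snippet below is a minimal illustration of those two conventions on a made-up two-pass DataFrame; the toy values are assumptions for illustration and the snippet is not part of the patch itself.

```python
# Minimal illustration (not part of the patch): pass@k columns and
# max-aggregation, using a hypothetical two-pass result DataFrame.
import pandas as pd

pass_k = 2
df = pd.DataFrame({
    "prompt_accuracy_0": [100.0, 0.0, 0.0],
    "prompt_accuracy_1": [0.0, 0.0, 100.0],
})

# Same suffix rule as the surrounding hunks: '_<pass>' only when pass_k > 1.
accuracy_cols = [
    f"prompt_accuracy_{p}" if pass_k > 1 else "prompt_accuracy"
    for p in range(pass_k)
]

# Aggregate across passes by keeping the best (maximum) score per row.
df["prompt_accuracy"] = df[accuracy_cols].max(axis=1)
print(df["prompt_accuracy"].tolist())  # [100.0, 0.0, 100.0]
```

With `pass_k == 1` the suffix collapses to the plain column names, which is why the hunk that follows computes `suffix = f'_{pass_num}' if pass_k > 1 else ''` before building column names.
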
@@ -1343,15 +588,20 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: desc="Processing datasets"): evaluator = get_evaluator(dataset_name) - # For LiveCodeBench with multiple passes, collect all work upfront - # to maximize parallelism + # For LiveCodeBench, always use batched evaluation across all passes is_livecodebench = 'livecodebench' in dataset_name.lower() - if is_livecodebench and pass_k > 1: + if is_livecodebench: + # Validate prerequisites for batched LCB evaluation + if lcb_executor is None: + raise RuntimeError( + "LiveCodeBench evaluation requires a shared executor, but it was not initialized. " + "This may indicate the LiveCodeBench benchmark failed to load.") + # Parse all passes first logger.info( f"Parsing {len(group_indices)} rows for dataset '{dataset_name}' across {pass_k} passes") for pass_num in range(pass_k): - suffix = f'_{pass_num}' + suffix = f'_{pass_num}' if pass_k > 1 else '' model_output_col = f'model_output{suffix}' extracted_answer_col = f'extracted_answer{suffix}' evaluation_details_col = f'evaluation_details{suffix}' @@ -1370,7 +620,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: all_work_items = [] work_item_metadata = [] # (idx, pass_num) for pass_num in range(pass_k): - extracted_answer_col = f'extracted_answer_{pass_num}' + suffix = f'_{pass_num}' if pass_k > 1 else '' + extracted_answer_col = f'extracted_answer{suffix}' for idx in group_indices: row = df_output.loc[idx] extracted = row.get(extracted_answer_col) @@ -1400,8 +651,9 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: total=len(future_to_metadata), desc=f"Evaluating LiveCodeBench (all passes)"): idx, pass_num = future_to_metadata[future] - prompt_accuracy_col = f'prompt_accuracy_{pass_num}' - evaluation_details_col = f'evaluation_details_{pass_num}' + suffix = f'_{pass_num}' if pass_k > 1 else '' + prompt_accuracy_col = f'prompt_accuracy{suffix}' + evaluation_details_col = f'evaluation_details{suffix}' try: question_id, is_correct, detailed_reason = future.result( @@ -1438,8 +690,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: f"{dataset_name} pass {pass_num} results: {correct}/{total} correct ({accuracy:.1f}% accuracy)") else: - # Original sequential pass processing for non-LCB or - # single-pass LCB + # Sequential pass processing for non-LCB datasets for pass_num in range(pass_k): suffix = f'_{pass_num}' if pass_k > 1 else '' model_output_col = f'model_output{suffix}' @@ -1464,78 +715,29 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: evaluation_details_col] = "No answer extracted from model output" # Evaluate answers for this pass - pass_label_str = f'(pass {pass_num})' if pass_k > 1 else '' - - if is_livecodebench: - # Single-pass LCB evaluation - correct_count, total_evaluated = process_livecodebench_parallel( - df_output, - group_indices, - extracted_answer_col=extracted_answer_col, - prompt_accuracy_col=prompt_accuracy_col, - evaluation_details_col=evaluation_details_col, - pass_label=pass_label_str, - executor=lcb_executor # Reuse shared executor - ) - elif 'healthbench' in dataset_name.lower(): - # HealthBench evaluation with LLM grading - batched across - # all rows - total_score = 0.0 - total_evaluated = 0 - - # Process all rows with batched grading for this pass - results = evaluate_healthbench_batch( - df_output, - group_indices, - grader_api_key=LLM_JUDGE_API_KEY, - grader_base_url=LLM_JUDGE_BASE_URL, - grader_model=LLM_JUDGE_MODEL, - max_workers=LLM_JUDGE_MAX_WORKERS, - extracted_answer_col=extracted_answer_col, - 
pass_label=pass_label_str - ) - - # Store results for this pass - for idx, (score, explanation) in results.items(): - # Store score as percentage (0-100) - df_output.at[idx, - prompt_accuracy_col] = score * 100.0 + # Sequential evaluation for all non-LCB datasets + correct_count = 0 + total_evaluated = 0 + + for idx in group_indices: + row = df_output.loc[idx] + extracted = row[extracted_answer_col] + ground_truth = row.get('ground_truth') + + if extracted is not None and not pd.isna(ground_truth): + is_correct = evaluator['evaluate']( + extracted, ground_truth) df_output.at[idx, - evaluation_details_col] = explanation + prompt_accuracy_col] = 100.0 if is_correct else 0.0 total_evaluated += 1 - total_score += score - else: - # Sequential evaluation for other datasets - correct_count = 0 - total_evaluated = 0 - - for idx in group_indices: - row = df_output.loc[idx] - extracted = row[extracted_answer_col] - ground_truth = row.get('ground_truth') - - if extracted is not None and not pd.isna( - ground_truth): - is_correct = evaluator['evaluate']( - extracted, ground_truth) - df_output.at[idx, - prompt_accuracy_col] = 100.0 if is_correct else 0.0 - total_evaluated += 1 - if is_correct: - correct_count += 1 + if is_correct: + correct_count += 1 # Log results for this pass if total_evaluated > 0: - if 'healthbench' in dataset_name.lower(): - # For HealthBench, report average score - avg_score = total_score / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: Average score {avg_score:.1f}% ({total_evaluated} samples)") - else: - # For other datasets, report accuracy - accuracy = correct_count / total_evaluated * 100 - logger.info( - f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") + accuracy = correct_count / total_evaluated * 100 + logger.info( + f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") # Aggregate results across all passes (take max) if pass_k > 1: @@ -1614,18 +816,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, tok_len_col = 'tok_model_output_len' if 'tok_model_output_len' in df_evaluated.columns else 'tok_model_output_len_0' mean_output_len = float(df_evaluated[tok_len_col].mean()) - # Check if this is HealthBench dataset - is_healthbench = False - if 'dataset' in df_evaluated.columns: - datasets = df_evaluated['dataset'].unique() - is_healthbench = any('healthbench' in str(ds).lower() - for ds in datasets) - - # Use appropriate metric name - if is_healthbench: - metric_key = 'healthbench_score' - else: - metric_key = 'exact_match' + # Use exact_match as the metric key + metric_key = 'exact_match' results = { # 'evaluated': int(evaluated), @@ -1736,28 +928,9 @@ def main(): "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") parser.add_argument("--verbose", action="store_true", help="Verbose logging") - parser.add_argument("--llm-judge-base-url", - help="Base URL for HealthBench LLM judge API (default: https://api.openai.com/v1). " - "For local servers like SGLang, use http://localhost:8000/v1") - parser.add_argument("--llm-judge", - help="Model for HealthBench LLM judge (default: gpt-4o-mini for OpenAI-compatible APIs, " - "deepseek-ai/deepseek-v3.1-terminus for NVIDIA)") - parser.add_argument("--llm-judge-api-key", - help="API key for HealthBench LLM judge (default: read from OPENAI_API_KEY or NVIDIA_NIM_API_KEY env var). 
" - "Not required for local servers.") - parser.add_argument("--llm-judge-max-workers", type=int, - help="Max concurrent requests per row for HealthBench rubric grading (default: all rubrics in parallel). " - "Useful for rate limiting or controlling server load.") args = parser.parse_args() - # Set global configuration for HealthBench LLM judge - global LLM_JUDGE_BASE_URL, LLM_JUDGE_MODEL, LLM_JUDGE_API_KEY, LLM_JUDGE_MAX_WORKERS - LLM_JUDGE_BASE_URL = args.llm_judge_base_url - LLM_JUDGE_MODEL = args.llm_judge - LLM_JUDGE_API_KEY = args.llm_judge_api_key - LLM_JUDGE_MAX_WORKERS = args.llm_judge_max_workers - if args.verbose: logging.getLogger().setLevel(logging.DEBUG) From 20f8916dea284500a942608fed69e92720dd170b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 11 Nov 2025 07:21:46 +0000 Subject: [PATCH 091/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index beb429b67e..95a4e51c91 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -588,7 +588,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: desc="Processing datasets"): evaluator = get_evaluator(dataset_name) - # For LiveCodeBench, always use batched evaluation across all passes + # For LiveCodeBench, always use batched evaluation across all + # passes is_livecodebench = 'livecodebench' in dataset_name.lower() if is_livecodebench: # Validate prerequisites for batched LCB evaluation @@ -596,7 +597,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: raise RuntimeError( "LiveCodeBench evaluation requires a shared executor, but it was not initialized. " "This may indicate the LiveCodeBench benchmark failed to load.") - + # Parse all passes first logger.info( f"Parsing {len(group_indices)} rows for dataset '{dataset_name}' across {pass_k} passes") From 1e67e0c05dbed947b87352e3d9af0d7e5b189eb1 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 11 Nov 2025 08:17:49 +0000 Subject: [PATCH 092/165] lcb bug fixes --- language/gpt-oss/eval_accuracy.py | 39 +++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 95a4e51c91..a1c345eb43 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -30,6 +30,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError import multiprocessing from pathlib import Path +from contextlib import redirect_stdout, redirect_stderr # MLPerf log processing imports import numpy as np @@ -387,7 +388,7 @@ def evaluate_livecodebench_detailed( scenario=Scenario.codegeneration, release_version="release_v6", subset="code_generation", language="python", not_fast=False, start_date=None, end_date=None, k=[1], num_samples=1, - timeout=20, num_workers=1, num_process_evaluate=1, + timeout=60, num_workers=1, num_process_evaluate=1, model_name="inline_handler_eval", output_dir=temp_dir, prompt_type="custom", continue_existing=False, evaluate=True, ) @@ -453,12 +454,17 @@ def evaluate_livecodebench_worker( """ code, question_id = args + # Suppress all stdout/stderr from worker processes to prevent pollution try: - passed, reason = evaluate_livecodebench_detailed(code, question_id) - return question_id, passed, reason + with open(os.devnull, 'w') as devnull: + with redirect_stdout(devnull), redirect_stderr(devnull): + # 
Also set environment variable to disable tqdm + os.environ['TQDM_DISABLE'] = '1' + passed, reason = evaluate_livecodebench_detailed(code, question_id) + return question_id, passed, reason except Exception as e: error_msg = f"Error evaluating {question_id}: {type(e).__name__}: {e}" - logger.warning(error_msg) + # Don't use logger here as it might output to stdout in worker process return question_id, False, error_msg @@ -529,9 +535,13 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: +def process_dataframe(df: pd.DataFrame, num_lcb_workers: int = 64) -> pd.DataFrame: """Process entire dataframe with optimized batch processing. + Args: + df: Input DataFrame to evaluate + num_lcb_workers: Maximum number of parallel workers for LiveCodeBench evaluation + Supports both single-pass and pass@k formats: - Single-pass: model_output -> extracted_answer, prompt_accuracy - Pass@k: model_output_0, model_output_1, ... -> extracted_answer_0, prompt_accuracy_0, ... @@ -574,7 +584,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: logger.info("LiveCodeBench benchmark loaded successfully") # Create a single process pool for all LCB evaluations - max_workers = multiprocessing.cpu_count() + max_workers = min(multiprocessing.cpu_count(), num_lcb_workers) lcb_executor = ProcessPoolExecutor(max_workers=max_workers) logger.info( f"Created shared ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") @@ -635,7 +645,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: if all_work_items: # Submit all work at once for maximum parallelism max_workers = min( - multiprocessing.cpu_count(), len(all_work_items), 64) + multiprocessing.cpu_count(), len(all_work_items), num_lcb_workers) logger.info( f"Evaluating {len(all_work_items)} LiveCodeBench items across {pass_k} passes with {max_workers} workers") @@ -658,7 +668,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: try: question_id, is_correct, detailed_reason = future.result( - timeout=25) + timeout=80) df_output.at[idx, prompt_accuracy_col] = 100.0 if is_correct else 0.0 df_output.at[idx, @@ -668,7 +678,7 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: pass_results[pass_num]['correct'] += 1 except TimeoutError: logger.warning( - f"Timeout evaluating row {idx} pass {pass_num}: Test execution exceeded 25s timeout") + f"Timeout evaluating row {idx} pass {pass_num}: Test execution exceeded 80s timeout") df_output.at[idx, prompt_accuracy_col] = 0.0 df_output.at[idx, evaluation_details_col] = "Timeout: Test execution exceeded time limit" @@ -841,19 +851,21 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, def process_and_save_dataframe(df: pd.DataFrame, output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + base_filename: Optional[str] = None, + num_lcb_workers: int = 64) -> Tuple[pd.DataFrame, str]: """Process dataframe for evaluation and save the results. 
Args: df: Input DataFrame to evaluate output_dir: Directory to save the evaluated pickle file (defaults to same dir as source) base_filename: Base filename for output (defaults to auto-generated) + num_lcb_workers: Maximum number of parallel workers for LiveCodeBench evaluation Returns: Tuple of (evaluated_dataframe, saved_file_path) """ # Process the dataframe - df_evaluated = process_dataframe(df) + df_evaluated = process_dataframe(df, num_lcb_workers=num_lcb_workers) # Determine output path if output_dir is None: @@ -927,6 +939,8 @@ def main(): help="Input file (pickle DataFrame or MLPerf JSON log)") parser.add_argument( "--output-file", help="Output pickle file (defaults to _evaluated.pkl)") + parser.add_argument("--num-lcb-workers", type=int, default=64, + help="Maximum number of parallel workers for LiveCodeBench evaluation (default: 64)") parser.add_argument("--verbose", action="store_true", help="Verbose logging") @@ -968,7 +982,8 @@ def main(): df_evaluated, saved_file_path = process_and_save_dataframe( df, output_dir=output_dir, - base_filename=output_filename + base_filename=output_filename, + num_lcb_workers=args.num_lcb_workers ) # Print evaluation results with unified function From 96c90e1a468e9dc26cdd10105e4a34d4d1d413a9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 11 Nov 2025 08:18:23 +0000 Subject: [PATCH 093/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index a1c345eb43..2aa2aa0eae 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -460,7 +460,8 @@ def evaluate_livecodebench_worker( with redirect_stdout(devnull), redirect_stderr(devnull): # Also set environment variable to disable tqdm os.environ['TQDM_DISABLE'] = '1' - passed, reason = evaluate_livecodebench_detailed(code, question_id) + passed, reason = evaluate_livecodebench_detailed( + code, question_id) return question_id, passed, reason except Exception as e: error_msg = f"Error evaluating {question_id}: {type(e).__name__}: {e}" @@ -535,7 +536,8 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_dataframe(df: pd.DataFrame, num_lcb_workers: int = 64) -> pd.DataFrame: +def process_dataframe(df: pd.DataFrame, + num_lcb_workers: int = 64) -> pd.DataFrame: """Process entire dataframe with optimized batch processing. 
Args: From e6d9c67aba41aeb552f3f17e77fc888d56be0ad2 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 11 Nov 2025 09:03:41 +0000 Subject: [PATCH 094/165] omit top-k if 0 --- language/gpt-oss/run_infer_trtllm.py | 41 ++++++++++++++++------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py index 7531af5f79..0f0a7b95e6 100644 --- a/language/gpt-oss/run_infer_trtllm.py +++ b/language/gpt-oss/run_infer_trtllm.py @@ -106,7 +106,7 @@ async def initialize(self): ), http2=True ) - + # Setup OpenAI client with the configured HTTP client async_client = AsyncOpenAI( api_key='dummy', # TensorRT-LLM server doesn't require real API key @@ -115,7 +115,7 @@ async def initialize(self): max_retries=10, http_client=http_client, ) - + self.http_clients.append(http_client) self.async_clients.append(async_client) @@ -137,11 +137,20 @@ async def send_request( max_tokens: Maximum tokens to generate sample_id: Sample identifier pass_num: Pass number for pass@k strategy - + Returns: Tuple of (sample_id, pass_num, response, latency) """ # Prepare generation parameters using OpenAI completions format (as per TensorRT-LLM docs) + extra_body = { + # TensorRT-LLM specific parameters + "min_tokens": 1, + } + + # Only include top_k if it's not 0 (so it can default to None on server side) + if self.top_k != 0: + extra_body["top_k"] = self.top_k + gen_params = { "model": self.model_name, "prompt": prompt, @@ -149,34 +158,30 @@ async def send_request( "temperature": self.temperature, "top_p": self.top_p, "stream": False, - "extra_body": { - # TensorRT-LLM specific parameters - "min_tokens": 1, - "top_k": self.top_k, - }, + "extra_body": extra_body, } - + try: # Track latency: time from request sent to response received start_time = time.time() - + # Select client using round-robin client = self._get_next_client() - + # Use semaphore for concurrency control async with self.concurrency_semaphore: completion = await client.completions.create(**gen_params) - + end_time = time.time() latency = end_time - start_time - + # Extract response text from completions format response_text = completion.choices[0].text - + # Tokenize the response to get output_ids (similar to SGLang format) tokenizer = get_tokenizer() output_ids = tokenizer.encode(response_text, add_special_tokens=False) - + # Format response similar to SGLang format for compatibility response = { "output_ids": output_ids, @@ -185,9 +190,9 @@ async def send_request( "completion_tokens": len(output_ids), } } - + return sample_id, pass_num, response, latency - + except Exception as e: logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") return sample_id, pass_num, {"error": str(e)}, None @@ -419,7 +424,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], detokenized_text = detokenized_texts_by_pass.get(key, "") response_ids = response_ids_by_pass.get(key, []) latency = latencies_by_pass.get(key, None) - + model_outputs.append(detokenized_text) tok_model_outputs.append(response_ids) tok_model_output_lens.append(len(response_ids)) From ff892bbdbc8f64e39f6505bdd405ee04247db1cd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 11 Nov 2025 09:04:18 +0000 Subject: [PATCH 095/165] [Automated Commit] Format Codebase --- language/gpt-oss/run_infer_trtllm.py | 177 ++++++++++++++------------- 1 file changed, 94 insertions(+), 83 deletions(-) diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py index 
0f0a7b95e6..adac3c6271 100644 --- a/language/gpt-oss/run_infer_trtllm.py +++ b/language/gpt-oss/run_infer_trtllm.py @@ -18,7 +18,7 @@ Examples: # Single server python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000 - + # Multiple servers with round-robin python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002 """ @@ -61,7 +61,7 @@ def get_tokenizer(): class TRTLLMClient: """Client for TensorRT-LLM server using OpenAI-compatible endpoint with round-robin support.""" - + def __init__(self, server_urls: List[str] = None, temperature: float = 0.001, @@ -75,29 +75,31 @@ def __init__(self, self.server_urls = server_urls self.num_servers = len(server_urls) self.current_server_index = 0 - + self.temperature = temperature self.top_k = top_k self.top_p = top_p self.max_concurrency = max_concurrency self.timeout = timeout self.model_name = MODEL_NAME - + # Initialize async OpenAI clients (one per server) self.http_clients = [] self.async_clients = [] self.concurrency_semaphore = None - - logger.info(f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") - + + logger.info( + f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") + async def initialize(self): """Initialize OpenAI clients for all servers.""" # Create semaphore for concurrency control self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) - + # Create HTTP and OpenAI clients for each server for server_url in self.server_urls: - # Setup HTTP client with proper connection limits for high concurrency + # Setup HTTP client with proper connection limits for high + # concurrency http_client = httpx.AsyncClient( timeout=httpx.Timeout(self.timeout), limits=httpx.Limits( @@ -118,20 +120,21 @@ async def initialize(self): self.http_clients.append(http_client) self.async_clients.append(async_client) - + logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)") - + def _get_next_client(self) -> AsyncOpenAI: """Get the next client using round-robin selection.""" client = self.async_clients[self.current_server_index] - self.current_server_index = (self.current_server_index + 1) % self.num_servers + self.current_server_index = ( + self.current_server_index + 1) % self.num_servers return client - + async def send_request( self, prompt: str, max_tokens: int = 100, sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: """Send a single request to the TensorRT-LLM server using round-robin. 
- + Args: prompt: Text prompt to send max_tokens: Maximum tokens to generate @@ -141,13 +144,15 @@ async def send_request( Returns: Tuple of (sample_id, pass_num, response, latency) """ - # Prepare generation parameters using OpenAI completions format (as per TensorRT-LLM docs) + # Prepare generation parameters using OpenAI completions format (as per + # TensorRT-LLM docs) extra_body = { # TensorRT-LLM specific parameters "min_tokens": 1, } - # Only include top_k if it's not 0 (so it can default to None on server side) + # Only include top_k if it's not 0 (so it can default to None on server + # side) if self.top_k != 0: extra_body["top_k"] = self.top_k @@ -178,9 +183,11 @@ async def send_request( # Extract response text from completions format response_text = completion.choices[0].text - # Tokenize the response to get output_ids (similar to SGLang format) + # Tokenize the response to get output_ids (similar to SGLang + # format) tokenizer = get_tokenizer() - output_ids = tokenizer.encode(response_text, add_special_tokens=False) + output_ids = tokenizer.encode( + response_text, add_special_tokens=False) # Format response similar to SGLang format for compatibility response = { @@ -196,7 +203,7 @@ async def send_request( except Exception as e: logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") return sample_id, pass_num, {"error": str(e)}, None - + async def shutdown(self): """Clean up resources for all clients.""" for http_client in self.http_clients: @@ -207,35 +214,37 @@ async def shutdown(self): def load_tokenized_data(data_file: str) -> pd.DataFrame: """Load data from pickle file produced by harmony-tokens.py.""" logger.info(f"Loading data from {data_file}") - + # Load DataFrame from pickle df = pd.read_pickle(data_file) logger.info(f"Loaded DataFrame with shape: {df.shape}") - + # Check if text_input column exists and has valid data if 'text_input' in df.columns: # Check for any None values in text_input failed_mask = df['text_input'].isna() failed_count = failed_mask.sum() - + if failed_count > 0: failed_indices = df[failed_mask].index.unique() error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}" logger.error(error_msg) raise AssertionError(error_msg) - + # Check first sample first_text = df.iloc[0]['text_input'] if isinstance(first_text, str): - logger.info(f"First sample text length: {len(first_text)} characters") + logger.info( + f"First sample text length: {len(first_text)} characters") else: - logger.warning("text_input column exists but first sample is not a string") - + logger.warning( + "text_input column exists but first sample is not a string") + logger.info(f"All {len(df)} samples have valid text_input") else: logger.error("No 'text_input' column found in DataFrame") raise ValueError("DataFrame must contain 'text_input' column") - + return df @@ -245,11 +254,11 @@ async def send_requests_async( temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200, pass_k: int = 1): """Send all requests to TensorRT-LLM server(s) asynchronously with round-robin load balancing. 
- + Args: server_urls: List of server URLs for round-robin load balancing pass_k: Number of inference passes per sample for pass@k strategy - + Returns: tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency """ @@ -257,7 +266,7 @@ async def send_requests_async( total_requests = num_samples * pass_k logger.info( f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...") - + # Initialize client with multiple servers for round-robin client = TRTLLMClient( server_urls=server_urls, @@ -268,7 +277,7 @@ async def send_requests_async( timeout=timeout ) await client.initialize() - + # Prepare all tasks - create pass_k requests per sample tasks = [] for idx, row in tokenized_df.iterrows(): @@ -280,9 +289,9 @@ async def send_requests_async( pass_num=pass_num ) tasks.append(task) - + start_time = time.time() - + # Execute all tasks concurrently with progress bar results = [] for coro in tqdm( @@ -292,51 +301,52 @@ async def send_requests_async( unit="request"): result = await coro results.append(result) - + # Group results by sample_id and pass_num responses_by_pass = {} latencies_by_pass = {} for sample_id, pass_num, response, latency in results: responses_by_pass[(sample_id, pass_num)] = response latencies_by_pass[(sample_id, pass_num)] = latency - + total_time = time.time() - start_time logger.info( f"Completed {total_requests} requests in {total_time:.2f} seconds") logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") - + # Log latency statistics - valid_latencies = [lat for lat in latencies_by_pass.values() if lat is not None] + valid_latencies = [ + lat for lat in latencies_by_pass.values() if lat is not None] if valid_latencies: avg_latency = sum(valid_latencies) / len(valid_latencies) min_latency = min(valid_latencies) max_latency = max(valid_latencies) logger.info( f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") - + # Shutdown client await client.shutdown() - + return responses_by_pass, latencies_by_pass def extract_response_ids( responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: """Extract response output_ids from TensorRT-LLM responses for all passes. 
- + Args: responses_by_pass: Dict mapping (sample_id, pass_num) to response tokenized_df: DataFrame with samples pass_k: Number of passes per sample - + Returns: Dict mapping (sample_id, pass_num) to output_ids list """ logger.info("Extracting response output_ids...") - + response_ids_by_pass = {} total_responses = len(tokenized_df) * pass_k - + with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: for idx, row in tokenized_df.iterrows(): for pass_num in range(pass_k): @@ -344,14 +354,15 @@ def extract_response_ids( response_id = [] if "error" not in response and "output_ids" in response: try: - # TensorRT-LLM returns the generated token IDs in the 'output_ids' field + # TensorRT-LLM returns the generated token IDs in the + # 'output_ids' field response_id = response["output_ids"] except Exception as e: logger.warning( f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") response_ids_by_pass[(idx, pass_num)] = response_id pbar.update(1) - + logger.info("Response output_ids extraction complete") return response_ids_by_pass @@ -359,19 +370,19 @@ def extract_response_ids( def detokenize_output_ids( response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: """Detokenize output_ids back to text using AutoTokenizer for all passes. - + Args: response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids pass_k: Number of passes per sample - + Returns: Dict mapping (sample_id, pass_num) to detokenized text """ logger.info("Detokenizing output_ids to text...") - + tokenizer = get_tokenizer() detokenized_texts_by_pass = {} - + for (sample_id, pass_num), token_ids in tqdm( response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): try: @@ -382,7 +393,7 @@ def detokenize_output_ids( logger.warning( f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") detokenized_texts_by_pass[(sample_id, pass_num)] = "" - + logger.info("Output detokenization complete") return detokenized_texts_by_pass @@ -393,7 +404,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], latencies_by_pass: Dict[tuple, float], tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: """Save all responses to DataFrame and optionally to pickle file. - + Args: responses_by_pass: Dict mapping (sample_id, pass_num) to response response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids @@ -402,15 +413,15 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: Original DataFrame with samples pass_k: Number of passes per sample output_file: Optional output pickle file - + Returns: DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) """ logger.info("Processing responses and updating DataFrame...") - + # Work with the original DataFrame result_df = tokenized_df.copy() - + # Create columns for each pass with _0, _1, _2, ... 
suffixes for pass_num in range(pass_k): # Lists to store data for this pass @@ -418,7 +429,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], tok_model_outputs = [] tok_model_output_lens = [] infer_times = [] - + for idx in tokenized_df.index: key = (idx, pass_num) detokenized_text = detokenized_texts_by_pass.get(key, "") @@ -429,13 +440,13 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], tok_model_outputs.append(response_ids) tok_model_output_lens.append(len(response_ids)) infer_times.append(latency) - + # Add columns with suffixes result_df[f'model_output_{pass_num}'] = model_outputs result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens result_df[f'infer_time_{pass_num}'] = infer_times - + # Calculate output token lengths for logging all_output_token_lengths = [] for idx in tokenized_df.index: @@ -452,7 +463,7 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], logger.warning( f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") all_output_token_lengths.append(len(response_ids)) - + logger.info(f"Updated DataFrame with shape: {result_df.shape}") new_columns = [ f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] @@ -460,13 +471,13 @@ def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], if all_output_token_lengths: logger.info( f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") - + # Save to pickle file if output_file is provided if output_file: logger.info(f"Saving responses to {output_file}...") result_df.to_pickle(output_file) logger.info(f"Responses saved to {output_file}") - + return result_df @@ -476,17 +487,17 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[s temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: """Main processing function that handles requests and response extraction. 
- + Args: server_urls: List of server URLs for round-robin load balancing pass_k: Number of inference passes per sample for pass@k strategy """ - + # Step 1: Limit samples if specified if max_samples is not None: tokenized_df = tokenized_df.head(max_samples) logger.info(f"Limited to first {max_samples} samples") - + # Step 2: Send all requests asynchronously (k passes per sample) responses_by_pass, latencies_by_pass = await send_requests_async( tokenized_df, @@ -498,15 +509,15 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[s top_p, timeout, pass_k) - + # Step 3: Extract response output_ids for all passes response_ids_by_pass = extract_response_ids( responses_by_pass, tokenized_df, pass_k) - + # Step 4: Detokenize output_ids to text for model_output for all passes detokenized_texts_by_pass = detokenize_output_ids( response_ids_by_pass, pass_k) - + # Step 5: Save all results and return DataFrame result_df = save_responses( responses_by_pass, @@ -516,7 +527,7 @@ async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[s tokenized_df, pass_k, output_file) - + return result_df @@ -545,13 +556,14 @@ def main(): help="Top-p for sampling (default: 1.0)") parser.add_argument("--timeout", type=int, default=1200, help="Timeout for requests (default: 1200)") - + args = parser.parse_args() - + # Parse comma-separated server URLs server_urls = [url.strip() for url in args.server_url.split(',')] - logger.info(f"Configured {len(server_urls)} server(s) for round-robin load balancing") - + logger.info( + f"Configured {len(server_urls)} server(s) for round-robin load balancing") + # Test connection async def test_connection(): logger.info(f"Testing server connection(s)...") @@ -564,26 +576,28 @@ async def test_connection(): timeout=args.timeout ) await client.initialize() - + try: _, _, test_response, _ = await client.send_request( prompt="Test", max_tokens=5, sample_id=0, pass_num=0) if "error" in test_response: - logger.error(f"Server connection failed: {test_response['error']}") - logger.error("Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") + logger.error( + f"Server connection failed: {test_response['error']}") + logger.error( + "Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") return False logger.info("Server connection successful") return True finally: await client.shutdown() - + # Run connection test if not asyncio.run(test_connection()): return - + # Load pre-tokenized data tokenized_df = load_tokenized_data(args.input_tokens) - + # Process requests and get result DataFrame result_df = asyncio.run(process_requests_async( tokenized_df, server_urls, @@ -596,14 +610,14 @@ async def test_connection(): top_p=args.top_p, timeout=args.timeout, pass_k=args.pass_k)) - + # Print summary logger.info(f"\nProcessing completed:") logger.info(f" - Total samples processed: {len(result_df)}") logger.info(f" - Number of passes per sample: {args.pass_k}") logger.info( f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - + # Calculate average output length across all passes if args.pass_k == 1: avg_output_len = result_df['tok_model_output_len_0'].mean() @@ -617,7 +631,7 @@ async def test_connection(): len(all_output_lens) if all_output_lens else 0 logger.info( f" - Average output token length (across all passes): {avg_output_len:.1f}") - + if args.output: logger.info(f" - Results saved to: {args.output}") else: @@ -626,6 +640,3 @@ async def test_connection(): if __name__ == 
"__main__": main() - - - From e4043d24b3df03e3d87b3a762e00083d2358dc46 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 12 Nov 2025 04:31:31 +0000 Subject: [PATCH 096/165] add changes and plotting scripts --- language/gpt-oss/collect_results_csv.py | 230 ++++++++++++++++++++ language/gpt-oss/plot_results.py | 270 ++++++++++++++++++++++++ language/gpt-oss/summarize_eval.py | 74 ++++++- 3 files changed, 570 insertions(+), 4 deletions(-) create mode 100755 language/gpt-oss/collect_results_csv.py create mode 100755 language/gpt-oss/plot_results.py diff --git a/language/gpt-oss/collect_results_csv.py b/language/gpt-oss/collect_results_csv.py new file mode 100755 index 0000000000..09af89fab8 --- /dev/null +++ b/language/gpt-oss/collect_results_csv.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +""" +Collect results from multiple summarize_eval.py JSON outputs into a CSV. + +The CSV format shows: +- Each row represents one dataset from one JSON file +- Columns: run_1, run_2, ..., run_k, pass@k +- Values are the "correct" counts (number of correct answers) +""" + +import argparse +import json +import csv +import sys +import glob +from pathlib import Path +from typing import List, Dict, Any +from collections import defaultdict + + +def expand_glob_patterns(patterns: List[str]) -> List[str]: + """Expand glob patterns to actual file paths. + + Args: + patterns: List of file paths or glob patterns (e.g., '*.json', 'results_*_summarize.json') + + Returns: + List of actual file paths (sorted) + """ + expanded_files = [] + + for pattern in patterns: + # If it's a literal file path that exists, use it directly + if Path(pattern).exists() and not any(c in pattern for c in ['*', '?', '[', ']']): + expanded_files.append(pattern) + else: + # Try to expand as a glob pattern + matches = glob.glob(pattern) + if matches: + expanded_files.extend(matches) + else: + # If no matches and it's not a glob pattern, report the file as missing + if not any(c in pattern for c in ['*', '?', '[', ']']): + print(f"Warning: File not found: {pattern}", file=sys.stderr) + else: + print(f"Warning: No files matched pattern: {pattern}", file=sys.stderr) + + # Remove duplicates and sort + return sorted(set(expanded_files)) + + +def load_json_summary(json_path: str) -> Dict[str, Any]: + """Load a JSON summary file.""" + with open(json_path, 'r') as f: + return json.load(f) + + +def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: + """Extract per-pass and aggregated correct counts by dataset. + + Returns: + Dictionary mapping dataset name to results: + { + 'aime': { + 'run_1': 735, + 'run_2': 740, + ... 
+ 'pass@k': 875 + } + } + """ + pass_k = json_data['pass_k'] + results = defaultdict(dict) + + # Extract per-pass results + if 'per_pass_results' in json_data: + for pass_result in json_data['per_pass_results']: + pass_num = pass_result['pass_number'] + run_label = f"run_{pass_num + 1}" # Convert 0-indexed to 1-indexed + + for dataset_stat in pass_result['datasets']: + dataset_name = dataset_stat['dataset'] + correct = dataset_stat['correct'] + results[dataset_name][run_label] = correct + + # Extract aggregated results + if 'aggregated_results' in json_data: + for dataset_stat in json_data['aggregated_results']['datasets']: + dataset_name = dataset_stat['dataset'] + correct = dataset_stat['correct'] + results[dataset_name][f'pass@{pass_k}'] = correct + + # Handle single-pass results + elif 'results' in json_data: + for dataset_stat in json_data['results']['datasets']: + dataset_name = dataset_stat['dataset'] + correct = dataset_stat['correct'] + results[dataset_name]['run_1'] = correct + results[dataset_name]['pass@1'] = correct + + return dict(results) + + +def collect_to_csv(json_files: List[str], output_csv: str, + dataset_order: List[str] = None): + """Collect results from multiple JSON files into a CSV. + + Args: + json_files: List of JSON file paths + output_csv: Output CSV file path + dataset_order: Optional list to specify dataset order (e.g., ['aime', 'gpqa', 'livecodebench']) + """ + all_results = [] + pass_k = None + + # Load all JSON files + for json_path in json_files: + json_data = load_json_summary(json_path) + + # Determine pass@k value + if pass_k is None: + pass_k = json_data['pass_k'] + elif pass_k != json_data['pass_k']: + print(f"Warning: {json_path} has pass@{json_data['pass_k']} but expected pass@{pass_k}", + file=sys.stderr) + + # Extract results + results = extract_results(json_data) + all_results.append({ + 'source_file': json_path, + 'results': results + }) + + if not all_results: + print("Error: No results to process", file=sys.stderr) + return + + # Determine column order + run_columns = [f"run_{i+1}" for i in range(pass_k)] + pass_column = f"pass@{pass_k}" + columns = ['dataset'] + run_columns + [pass_column] + + # Collect all unique datasets + all_datasets = set() + for result in all_results: + all_datasets.update(result['results'].keys()) + + # Sort datasets (use provided order or alphabetical) + if dataset_order: + # Use provided order, put remaining datasets at the end + sorted_datasets = [] + for ds in dataset_order: + if ds.lower() in [d.lower() for d in all_datasets]: + # Find the actual dataset name (case-sensitive) + actual_name = next(d for d in all_datasets if d.lower() == ds.lower()) + sorted_datasets.append(actual_name) + # Add any datasets not in the order list + remaining = sorted([d for d in all_datasets if d not in sorted_datasets]) + sorted_datasets.extend(remaining) + else: + sorted_datasets = sorted(all_datasets) + + # Write CSV + with open(output_csv, 'w', newline='') as f: + writer = csv.writer(f) + + # Write header + writer.writerow(columns) + + # Write data rows + for result in all_results: + for dataset in sorted_datasets: + if dataset in result['results']: + row = [dataset] + dataset_results = result['results'][dataset] + + # Add run columns + for run_col in run_columns: + row.append(dataset_results.get(run_col, '')) + + # Add pass@k column + row.append(dataset_results.get(pass_column, '')) + + writer.writerow(row) + + print(f"CSV saved to: {output_csv}") + print(f"Collected {len(all_results)} result sets across {len(sorted_datasets)} 
datasets") + + +def main(): + parser = argparse.ArgumentParser( + description="Collect multiple JSON summaries into a CSV. Supports glob patterns.", + epilog="Examples:\n" + " %(prog)s results_*_summarize.json\n" + " %(prog)s data/*.json -o output.csv\n" + " %(prog)s run1.json run2.json run3.json --dataset-order aime gpqa livecodebench", + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("json_files", nargs='+', + help="One or more JSON files or glob patterns (e.g., '*.json', 'results_*_summarize.json')") + parser.add_argument("-o", "--output", default="collected_results.csv", + help="Output CSV file (default: collected_results.csv)") + parser.add_argument("--dataset-order", nargs='*', + help="Optional dataset order (e.g., aime gpqa livecodebench)") + + args = parser.parse_args() + + # Expand glob patterns + expanded_files = expand_glob_patterns(args.json_files) + + if not expanded_files: + print("Error: No JSON files found matching the provided patterns", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(expanded_files)} JSON files:") + for f in expanded_files: + print(f" - {f}") + print() + + try: + collect_to_csv(expanded_files, args.output, args.dataset_order) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/plot_results.py new file mode 100755 index 0000000000..986b89afcd --- /dev/null +++ b/language/gpt-oss/plot_results.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +""" +Generate grouped box plots from collected results CSV. + +Creates two plots: +1. Individual runs box plot (run_1, run_2, ..., run_k) +2. Pass@k box plot +""" + +import argparse +import sys +import csv +from pathlib import Path +from typing import Dict, List +from collections import defaultdict +import matplotlib.pyplot as plt +import numpy as np + + +def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: + """Load CSV data and organize by dataset. + + Returns: + { + 'aime': { + 'run_1': [735, 752, 765, ...], + 'run_2': [740, 754, 765, ...], + 'pass@5': [875, 875, 885, ...] + }, + 'gpqa': {...}, + ... + } + """ + data = defaultdict(lambda: defaultdict(list)) + + with open(csv_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + dataset = row['dataset'] + for key, value in row.items(): + if key != 'dataset' and value: + try: + data[dataset][key].append(int(value)) + except ValueError: + continue + + return dict(data) + + +def create_combined_box_plot(dataset_name: str, + dataset_data: Dict[str, List[int]], + run_columns: List[str], + passk_columns: List[str], + output_file: str, + ylabel: str = "Correct Count"): + """Create separate box plots for individual runs and pass@k in the same figure. 
+ + Args: + dataset_name: Name of the dataset + dataset_data: Data for this dataset (column -> list of values) + run_columns: Individual run columns to combine (e.g., ['run_1', 'run_2', ...]) + passk_columns: Pass@k columns (e.g., ['pass@5']) + output_file: Output file path + ylabel: Y-axis label + """ + # Combine all individual runs into one list + all_runs_data = [] + for col in run_columns: + if col in dataset_data and dataset_data[col]: + all_runs_data.extend(dataset_data[col]) + + # Collect pass@k data + passk_data = [] + for col in passk_columns: + if col in dataset_data and dataset_data[col]: + passk_data.extend(dataset_data[col]) + + if not all_runs_data and not passk_data: + print(f"Warning: No data to plot for {dataset_name}") + return + + # Determine number of subplots needed + num_plots = 0 + if all_runs_data: + num_plots += 1 + if passk_data: + num_plots += 1 + + if num_plots == 0: + print(f"Warning: No data to plot for {dataset_name}") + return + + # Create figure with subplots side by side + fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 6)) + + # Make axes iterable even if there's only one subplot + if num_plots == 1: + axes = [axes] + + plot_idx = 0 + + # Plot individual runs + if all_runs_data: + ax = axes[plot_idx] + plot_idx += 1 + + bp = ax.boxplot([all_runs_data], positions=[0], widths=0.5, + patch_artist=True, showmeans=True, + whis=[0, 100], showfliers=False, + meanprops=dict(marker='D', markerfacecolor='red', + markeredgecolor='red', markersize=8)) + + # Color the box + bp['boxes'][0].set_facecolor(plt.cm.Set3(0.2)) + bp['boxes'][0].set_alpha(0.7) + + # Add scatter plot of individual points + # Add small random jitter to x-position for visibility + np.random.seed(42) # For reproducibility + x_jitter = np.random.normal(0, 0.04, size=len(all_runs_data)) + ax.scatter(x_jitter, all_runs_data, alpha=0.4, s=30, + color='darkblue', zorder=3, edgecolors='black', linewidth=0.5) + + # Set labels + ax.set_xticks([0]) + ax.set_xticklabels(['Individual Runs'], fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(f"{dataset_name} - Individual Runs", fontsize=13, fontweight='bold') + ax.grid(True, axis='y', alpha=0.3, linestyle='--') + + # Add statistics + mean_val = np.mean(all_runs_data) + std_val = np.std(all_runs_data) + min_val = np.min(all_runs_data) + max_val = np.max(all_runs_data) + n_samples = len(all_runs_data) + + stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" + props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) + ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, + verticalalignment='top', bbox=props, family='monospace') + + # Plot pass@k + if passk_data: + ax = axes[plot_idx] + + passk_label = passk_columns[0] if len(passk_columns) == 1 else 'Pass@k' + + bp = ax.boxplot([passk_data], positions=[0], widths=0.5, + patch_artist=True, showmeans=True, + whis=[0, 100], showfliers=False, + meanprops=dict(marker='D', markerfacecolor='red', + markeredgecolor='red', markersize=8)) + + # Color the box + bp['boxes'][0].set_facecolor(plt.cm.Set3(0.6)) + bp['boxes'][0].set_alpha(0.7) + + # Add scatter plot of individual points + # Add small random jitter to x-position for visibility + np.random.seed(42) # For reproducibility + x_jitter = np.random.normal(0, 0.04, size=len(passk_data)) + ax.scatter(x_jitter, passk_data, alpha=0.4, s=30, + color='darkorange', zorder=3, edgecolors='black', linewidth=0.5) + + # Set labels + ax.set_xticks([0]) + ax.set_xticklabels([passk_label], 
fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(f"{dataset_name} - {passk_label}", fontsize=13, fontweight='bold') + ax.grid(True, axis='y', alpha=0.3, linestyle='--') + + # Add statistics + mean_val = np.mean(passk_data) + std_val = np.std(passk_data) + min_val = np.min(passk_data) + max_val = np.max(passk_data) + n_samples = len(passk_data) + + stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" + props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) + ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, + verticalalignment='top', bbox=props, family='monospace') + + # Adjust layout + plt.tight_layout() + + # Save figure + plt.savefig(output_file, dpi=300, bbox_inches='tight') + print(f"Saved plot to: {output_file}") + plt.close() + + +def main(): + parser = argparse.ArgumentParser( + description="Generate box plots from collected results CSV") + parser.add_argument("csv_file", + help="Input CSV file from collect_results_csv.py") + parser.add_argument("-o", "--output-dir", default=".", + help="Output directory for plots (default: current directory)") + parser.add_argument("--prefix", default="boxplot", + help="Prefix for output files (default: boxplot)") + + args = parser.parse_args() + + # Check if input file exists + if not Path(args.csv_file).exists(): + print(f"Error: File not found: {args.csv_file}", file=sys.stderr) + sys.exit(1) + + # Create output directory if needed + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Load data + print(f"Loading data from: {args.csv_file}") + data = load_csv_data(args.csv_file) + + if not data: + print("Error: No data loaded from CSV", file=sys.stderr) + sys.exit(1) + + print(f"Loaded data for {len(data)} datasets") + + # Determine which columns are runs vs pass@k + all_columns = set() + for dataset_data in data.values(): + all_columns.update(dataset_data.keys()) + + # Separate run columns from pass@k columns + run_columns = sorted([col for col in all_columns if col.startswith('run_')]) + passk_columns = sorted([col for col in all_columns if col.startswith('pass@')]) + + if not run_columns and not passk_columns: + print("Error: No run or pass@k columns found in CSV", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(run_columns)} run columns: {', '.join(run_columns)}") + print(f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}") + print() + + # Generate plots for each dataset separately + datasets = sorted(data.keys()) + + print(f"Generating plots for {len(datasets)} datasets...\n") + + for dataset in datasets: + dataset_data = data[dataset] + + # Create combined plot: Individual Runs (all combined) vs Pass@k + if run_columns or passk_columns: + output_file = output_dir / f"{args.prefix}_{dataset}.png" + print(f"Creating combined box plot for {dataset}...") + create_combined_box_plot( + dataset_name=dataset, + dataset_data=dataset_data, + run_columns=run_columns, + passk_columns=passk_columns, + output_file=str(output_file), + ylabel="Correct Count" + ) + + print() + + print("Done!") + + +if __name__ == "__main__": + main() + diff --git a/language/gpt-oss/summarize_eval.py b/language/gpt-oss/summarize_eval.py index aa6e880f97..c39341e8e0 100644 --- a/language/gpt-oss/summarize_eval.py +++ b/language/gpt-oss/summarize_eval.py @@ -9,6 +9,7 @@ import argparse import pickle import sys +import json from pathlib import Path from typing import Dict, Any, List import pandas as pd @@ -139,11 +140,15 @@ def 
print_summary_table( print('=' * 85) -def summarize_evaluation(pickle_path: str): +def summarize_evaluation(pickle_path: str, json_output: bool = False) -> str: """Load and summarize evaluation results. Args: pickle_path: Path to evaluated pickle file + json_output: If True, save results to JSON file instead of printing + + Returns: + Path to JSON file if json_output=True, otherwise empty string """ # Load the pickle file print(f"Loading evaluation results from: {pickle_path}") @@ -155,15 +160,24 @@ def summarize_evaluation(pickle_path: str): # Detect pass@k format pass_k = detect_pass_k(df) print(f"Detected format: pass@{pass_k}" if pass_k > - 1 else "Detected format: single-pass") + 1 else "Detected format: single-pass") # Get list of datasets datasets = sorted(df['dataset'].unique()) print(f"Datasets found: {', '.join(datasets)}") + # Structure to hold all results + results_data = { + 'input_file': pickle_path, + 'total_samples': len(df), + 'pass_k': pass_k, + 'datasets': list(datasets), + } + # Calculate statistics for each dataset if pass_k > 1: - # Show per-pass statistics + # Collect per-pass statistics + per_pass_results = [] for pass_num in range(pass_k): stats_list = [] for dataset in datasets: @@ -173,6 +187,19 @@ def summarize_evaluation(pickle_path: str): print_summary_table(stats_list, title=f"Pass {pass_num} Results") + per_pass_results.append({ + 'pass_number': pass_num, + 'datasets': stats_list, + 'overall': { + 'total': sum(s['total'] for s in stats_list), + 'answered': sum(s['answered'] for s in stats_list), + 'correct': sum(s['correct'] for s in stats_list), + 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 + } + }) + + results_data['per_pass_results'] = per_pass_results + # Show aggregated (pass@k) statistics print("\n") stats_list = [] @@ -181,6 +208,18 @@ def summarize_evaluation(pickle_path: str): df, dataset, pass_num=None, pass_k=pass_k) stats_list.append(stats) + aggregated_results = { + 'datasets': stats_list, + 'overall': { + 'total': sum(s['total'] for s in stats_list), + 'answered': sum(s['answered'] for s in stats_list), + 'correct': sum(s['correct'] for s in stats_list), + 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 + } + } + results_data['aggregated_results'] = aggregated_results + + # Always print summary table print_summary_table( stats_list, title=f"Aggregated Pass@{pass_k} Results (Max Across Passes)") @@ -192,6 +231,18 @@ def summarize_evaluation(pickle_path: str): df, dataset, pass_num=None, pass_k=pass_k) stats_list.append(stats) + single_pass_results = { + 'datasets': stats_list, + 'overall': { + 'total': sum(s['total'] for s in stats_list), + 'answered': sum(s['answered'] for s in stats_list), + 'correct': sum(s['correct'] for s in stats_list), + 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 + } + } + results_data['results'] = single_pass_results + + # Always print summary table print_summary_table(stats_list, title="Evaluation Results") # Print column information for reference @@ -214,6 +265,21 @@ def summarize_evaluation(pickle_path: str): print( f" - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer") + # Save to JSON if requested + if json_output: + # Generate output filename: input_file_summarize.json + 
input_path = Path(pickle_path) + output_filename = input_path.stem + "_summarize.json" + output_path = input_path.parent / output_filename + + with open(output_path, 'w') as f: + json.dump(results_data, f, indent=2) + + print(f"\nSummary saved to: {output_path}") + return str(output_path) + + return "" + def main(): parser = argparse.ArgumentParser( @@ -239,7 +305,7 @@ def main(): file=sys.stderr) try: - summarize_evaluation(args.input_file) + summarize_evaluation(args.input_file, json_output=args.json) except Exception as e: print(f"Error processing file: {e}", file=sys.stderr) import traceback From 1714c09bf2fb0220c8149bd576a3e53fbfbf5112 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 12 Nov 2025 04:32:17 +0000 Subject: [PATCH 097/165] [Automated Commit] Format Codebase --- language/gpt-oss/collect_results_csv.py | 92 ++++++++++-------- language/gpt-oss/plot_results.py | 124 +++++++++++++----------- language/gpt-oss/summarize_eval.py | 2 +- 3 files changed, 118 insertions(+), 100 deletions(-) diff --git a/language/gpt-oss/collect_results_csv.py b/language/gpt-oss/collect_results_csv.py index 09af89fab8..7732f9edf4 100755 --- a/language/gpt-oss/collect_results_csv.py +++ b/language/gpt-oss/collect_results_csv.py @@ -20,18 +20,19 @@ def expand_glob_patterns(patterns: List[str]) -> List[str]: """Expand glob patterns to actual file paths. - + Args: patterns: List of file paths or glob patterns (e.g., '*.json', 'results_*_summarize.json') - + Returns: List of actual file paths (sorted) """ expanded_files = [] - + for pattern in patterns: # If it's a literal file path that exists, use it directly - if Path(pattern).exists() and not any(c in pattern for c in ['*', '?', '[', ']']): + if Path(pattern).exists() and not any( + c in pattern for c in ['*', '?', '[', ']']): expanded_files.append(pattern) else: # Try to expand as a glob pattern @@ -39,12 +40,17 @@ def expand_glob_patterns(patterns: List[str]) -> List[str]: if matches: expanded_files.extend(matches) else: - # If no matches and it's not a glob pattern, report the file as missing + # If no matches and it's not a glob pattern, report the file as + # missing if not any(c in pattern for c in ['*', '?', '[', ']']): - print(f"Warning: File not found: {pattern}", file=sys.stderr) + print( + f"Warning: File not found: {pattern}", + file=sys.stderr) else: - print(f"Warning: No files matched pattern: {pattern}", file=sys.stderr) - + print( + f"Warning: No files matched pattern: {pattern}", + file=sys.stderr) + # Remove duplicates and sort return sorted(set(expanded_files)) @@ -57,7 +63,7 @@ def load_json_summary(json_path: str) -> Dict[str, Any]: def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: """Extract per-pass and aggregated correct counts by dataset. 
- + Returns: Dictionary mapping dataset name to results: { @@ -71,25 +77,25 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: """ pass_k = json_data['pass_k'] results = defaultdict(dict) - + # Extract per-pass results if 'per_pass_results' in json_data: for pass_result in json_data['per_pass_results']: pass_num = pass_result['pass_number'] run_label = f"run_{pass_num + 1}" # Convert 0-indexed to 1-indexed - + for dataset_stat in pass_result['datasets']: dataset_name = dataset_stat['dataset'] correct = dataset_stat['correct'] results[dataset_name][run_label] = correct - + # Extract aggregated results if 'aggregated_results' in json_data: for dataset_stat in json_data['aggregated_results']['datasets']: dataset_name = dataset_stat['dataset'] correct = dataset_stat['correct'] results[dataset_name][f'pass@{pass_k}'] = correct - + # Handle single-pass results elif 'results' in json_data: for dataset_stat in json_data['results']['datasets']: @@ -97,14 +103,14 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: correct = dataset_stat['correct'] results[dataset_name]['run_1'] = correct results[dataset_name]['pass@1'] = correct - + return dict(results) -def collect_to_csv(json_files: List[str], output_csv: str, +def collect_to_csv(json_files: List[str], output_csv: str, dataset_order: List[str] = None): """Collect results from multiple JSON files into a CSV. - + Args: json_files: List of JSON file paths output_csv: Output CSV file path @@ -112,39 +118,39 @@ def collect_to_csv(json_files: List[str], output_csv: str, """ all_results = [] pass_k = None - + # Load all JSON files for json_path in json_files: json_data = load_json_summary(json_path) - + # Determine pass@k value if pass_k is None: pass_k = json_data['pass_k'] elif pass_k != json_data['pass_k']: - print(f"Warning: {json_path} has pass@{json_data['pass_k']} but expected pass@{pass_k}", + print(f"Warning: {json_path} has pass@{json_data['pass_k']} but expected pass@{pass_k}", file=sys.stderr) - + # Extract results results = extract_results(json_data) all_results.append({ 'source_file': json_path, 'results': results }) - + if not all_results: print("Error: No results to process", file=sys.stderr) return - + # Determine column order run_columns = [f"run_{i+1}" for i in range(pass_k)] pass_column = f"pass@{pass_k}" columns = ['dataset'] + run_columns + [pass_column] - + # Collect all unique datasets all_datasets = set() for result in all_results: all_datasets.update(result['results'].keys()) - + # Sort datasets (use provided order or alphabetical) if dataset_order: # Use provided order, put remaining datasets at the end @@ -152,39 +158,42 @@ def collect_to_csv(json_files: List[str], output_csv: str, for ds in dataset_order: if ds.lower() in [d.lower() for d in all_datasets]: # Find the actual dataset name (case-sensitive) - actual_name = next(d for d in all_datasets if d.lower() == ds.lower()) + actual_name = next( + d for d in all_datasets if d.lower() == ds.lower()) sorted_datasets.append(actual_name) # Add any datasets not in the order list - remaining = sorted([d for d in all_datasets if d not in sorted_datasets]) + remaining = sorted( + [d for d in all_datasets if d not in sorted_datasets]) sorted_datasets.extend(remaining) else: sorted_datasets = sorted(all_datasets) - + # Write CSV with open(output_csv, 'w', newline='') as f: writer = csv.writer(f) - + # Write header writer.writerow(columns) - + # Write data rows for result in all_results: for dataset in sorted_datasets: if dataset 
in result['results']: row = [dataset] dataset_results = result['results'][dataset] - + # Add run columns for run_col in run_columns: row.append(dataset_results.get(run_col, '')) - + # Add pass@k column row.append(dataset_results.get(pass_column, '')) - + writer.writerow(row) - + print(f"CSV saved to: {output_csv}") - print(f"Collected {len(all_results)} result sets across {len(sorted_datasets)} datasets") + print( + f"Collected {len(all_results)} result sets across {len(sorted_datasets)} datasets") def main(): @@ -201,21 +210,23 @@ def main(): help="Output CSV file (default: collected_results.csv)") parser.add_argument("--dataset-order", nargs='*', help="Optional dataset order (e.g., aime gpqa livecodebench)") - + args = parser.parse_args() - + # Expand glob patterns expanded_files = expand_glob_patterns(args.json_files) - + if not expanded_files: - print("Error: No JSON files found matching the provided patterns", file=sys.stderr) + print( + "Error: No JSON files found matching the provided patterns", + file=sys.stderr) sys.exit(1) - + print(f"Found {len(expanded_files)} JSON files:") for f in expanded_files: print(f" - {f}") print() - + try: collect_to_csv(expanded_files, args.output, args.dataset_order) except Exception as e: @@ -227,4 +238,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/plot_results.py index 986b89afcd..85f2b08e38 100755 --- a/language/gpt-oss/plot_results.py +++ b/language/gpt-oss/plot_results.py @@ -19,7 +19,7 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: """Load CSV data and organize by dataset. - + Returns: { 'aime': { @@ -32,7 +32,7 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: } """ data = defaultdict(lambda: defaultdict(list)) - + with open(csv_path, 'r') as f: reader = csv.DictReader(f) for row in reader: @@ -43,7 +43,7 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: data[dataset][key].append(int(value)) except ValueError: continue - + return dict(data) @@ -54,7 +54,7 @@ def create_combined_box_plot(dataset_name: str, output_file: str, ylabel: str = "Correct Count"): """Create separate box plots for individual runs and pass@k in the same figure. 
- + Args: dataset_name: Name of the dataset dataset_data: Data for this dataset (column -> list of values) @@ -68,123 +68,129 @@ def create_combined_box_plot(dataset_name: str, for col in run_columns: if col in dataset_data and dataset_data[col]: all_runs_data.extend(dataset_data[col]) - + # Collect pass@k data passk_data = [] for col in passk_columns: if col in dataset_data and dataset_data[col]: passk_data.extend(dataset_data[col]) - + if not all_runs_data and not passk_data: print(f"Warning: No data to plot for {dataset_name}") return - + # Determine number of subplots needed num_plots = 0 if all_runs_data: num_plots += 1 if passk_data: num_plots += 1 - + if num_plots == 0: print(f"Warning: No data to plot for {dataset_name}") return - + # Create figure with subplots side by side fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 6)) - + # Make axes iterable even if there's only one subplot if num_plots == 1: axes = [axes] - + plot_idx = 0 - + # Plot individual runs if all_runs_data: ax = axes[plot_idx] plot_idx += 1 - + bp = ax.boxplot([all_runs_data], positions=[0], widths=0.5, patch_artist=True, showmeans=True, whis=[0, 100], showfliers=False, - meanprops=dict(marker='D', markerfacecolor='red', - markeredgecolor='red', markersize=8)) - + meanprops=dict(marker='D', markerfacecolor='red', + markeredgecolor='red', markersize=8)) + # Color the box bp['boxes'][0].set_facecolor(plt.cm.Set3(0.2)) bp['boxes'][0].set_alpha(0.7) - + # Add scatter plot of individual points # Add small random jitter to x-position for visibility np.random.seed(42) # For reproducibility x_jitter = np.random.normal(0, 0.04, size=len(all_runs_data)) - ax.scatter(x_jitter, all_runs_data, alpha=0.4, s=30, - color='darkblue', zorder=3, edgecolors='black', linewidth=0.5) - + ax.scatter(x_jitter, all_runs_data, alpha=0.4, s=30, + color='darkblue', zorder=3, edgecolors='black', linewidth=0.5) + # Set labels ax.set_xticks([0]) ax.set_xticklabels(['Individual Runs'], fontsize=12) ax.set_ylabel(ylabel, fontsize=12) - ax.set_title(f"{dataset_name} - Individual Runs", fontsize=13, fontweight='bold') + ax.set_title( + f"{dataset_name} - Individual Runs", + fontsize=13, + fontweight='bold') ax.grid(True, axis='y', alpha=0.3, linestyle='--') - + # Add statistics mean_val = np.mean(all_runs_data) std_val = np.std(all_runs_data) min_val = np.min(all_runs_data) max_val = np.max(all_runs_data) n_samples = len(all_runs_data) - + stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, verticalalignment='top', bbox=props, family='monospace') - + # Plot pass@k if passk_data: ax = axes[plot_idx] - + passk_label = passk_columns[0] if len(passk_columns) == 1 else 'Pass@k' - + bp = ax.boxplot([passk_data], positions=[0], widths=0.5, patch_artist=True, showmeans=True, whis=[0, 100], showfliers=False, - meanprops=dict(marker='D', markerfacecolor='red', - markeredgecolor='red', markersize=8)) - + meanprops=dict(marker='D', markerfacecolor='red', + markeredgecolor='red', markersize=8)) + # Color the box bp['boxes'][0].set_facecolor(plt.cm.Set3(0.6)) bp['boxes'][0].set_alpha(0.7) - + # Add scatter plot of individual points # Add small random jitter to x-position for visibility np.random.seed(42) # For reproducibility x_jitter = np.random.normal(0, 0.04, size=len(passk_data)) - ax.scatter(x_jitter, passk_data, alpha=0.4, s=30, - color='darkorange', zorder=3, 
edgecolors='black', linewidth=0.5) - + ax.scatter(x_jitter, passk_data, alpha=0.4, s=30, + color='darkorange', zorder=3, edgecolors='black', linewidth=0.5) + # Set labels ax.set_xticks([0]) ax.set_xticklabels([passk_label], fontsize=12) ax.set_ylabel(ylabel, fontsize=12) - ax.set_title(f"{dataset_name} - {passk_label}", fontsize=13, fontweight='bold') + ax.set_title( + f"{dataset_name} - {passk_label}", + fontsize=13, + fontweight='bold') ax.grid(True, axis='y', alpha=0.3, linestyle='--') - + # Add statistics mean_val = np.mean(passk_data) std_val = np.std(passk_data) min_val = np.min(passk_data) max_val = np.max(passk_data) n_samples = len(passk_data) - + stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, verticalalignment='top', bbox=props, family='monospace') - + # Adjust layout plt.tight_layout() - + # Save figure plt.savefig(output_file, dpi=300, bbox_inches='tight') print(f"Saved plot to: {output_file}") @@ -200,53 +206,56 @@ def main(): help="Output directory for plots (default: current directory)") parser.add_argument("--prefix", default="boxplot", help="Prefix for output files (default: boxplot)") - + args = parser.parse_args() - + # Check if input file exists if not Path(args.csv_file).exists(): print(f"Error: File not found: {args.csv_file}", file=sys.stderr) sys.exit(1) - + # Create output directory if needed output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - + # Load data print(f"Loading data from: {args.csv_file}") data = load_csv_data(args.csv_file) - + if not data: print("Error: No data loaded from CSV", file=sys.stderr) sys.exit(1) - + print(f"Loaded data for {len(data)} datasets") - + # Determine which columns are runs vs pass@k all_columns = set() for dataset_data in data.values(): all_columns.update(dataset_data.keys()) - + # Separate run columns from pass@k columns - run_columns = sorted([col for col in all_columns if col.startswith('run_')]) - passk_columns = sorted([col for col in all_columns if col.startswith('pass@')]) - + run_columns = sorted( + [col for col in all_columns if col.startswith('run_')]) + passk_columns = sorted( + [col for col in all_columns if col.startswith('pass@')]) + if not run_columns and not passk_columns: print("Error: No run or pass@k columns found in CSV", file=sys.stderr) sys.exit(1) - + print(f"Found {len(run_columns)} run columns: {', '.join(run_columns)}") - print(f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}") + print( + f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}") print() - + # Generate plots for each dataset separately datasets = sorted(data.keys()) - + print(f"Generating plots for {len(datasets)} datasets...\n") - + for dataset in datasets: dataset_data = data[dataset] - + # Create combined plot: Individual Runs (all combined) vs Pass@k if run_columns or passk_columns: output_file = output_dir / f"{args.prefix}_{dataset}.png" @@ -259,12 +268,11 @@ def main(): output_file=str(output_file), ylabel="Correct Count" ) - + print() - + print("Done!") if __name__ == "__main__": main() - diff --git a/language/gpt-oss/summarize_eval.py b/language/gpt-oss/summarize_eval.py index c39341e8e0..4098aed108 100644 --- a/language/gpt-oss/summarize_eval.py +++ b/language/gpt-oss/summarize_eval.py @@ -160,7 +160,7 @@ def summarize_evaluation(pickle_path: str, json_output: bool = False) -> str: # 
Detect pass@k format pass_k = detect_pass_k(df) print(f"Detected format: pass@{pass_k}" if pass_k > - 1 else "Detected format: single-pass") + 1 else "Detected format: single-pass") # Get list of datasets datasets = sorted(df['dataset'].unique()) From 19fcc8040f0691013a3cfad145c16d5035105aa2 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 11 Nov 2025 21:20:43 -0800 Subject: [PATCH 098/165] add overall number --- language/gpt-oss/collect_results_csv.py | 79 +++++++++++++++++++++---- language/gpt-oss/plot_results.py | 8 ++- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/language/gpt-oss/collect_results_csv.py b/language/gpt-oss/collect_results_csv.py index 7732f9edf4..d666855028 100755 --- a/language/gpt-oss/collect_results_csv.py +++ b/language/gpt-oss/collect_results_csv.py @@ -77,33 +77,81 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: """ pass_k = json_data['pass_k'] results = defaultdict(dict) - + overall_results = {} + # Extract per-pass results if 'per_pass_results' in json_data: for pass_result in json_data['per_pass_results']: pass_num = pass_result['pass_number'] run_label = f"run_{pass_num + 1}" # Convert 0-indexed to 1-indexed - + + # Calculate sum of individual datasets for verification + sum_correct = 0 for dataset_stat in pass_result['datasets']: dataset_name = dataset_stat['dataset'] correct = dataset_stat['correct'] results[dataset_name][run_label] = correct - + sum_correct += correct + + # Extract overall from JSON + if 'overall' in pass_result: + overall_correct = pass_result['overall']['correct'] + overall_results[run_label] = overall_correct + + # Assert that the sum matches the overall + assert sum_correct == overall_correct, ( + f"Mismatch in {run_label}: sum of datasets ({sum_correct}) != " + f"overall ({overall_correct})" + ) + # Extract aggregated results if 'aggregated_results' in json_data: + # Calculate sum of individual datasets for verification + sum_correct = 0 for dataset_stat in json_data['aggregated_results']['datasets']: dataset_name = dataset_stat['dataset'] correct = dataset_stat['correct'] results[dataset_name][f'pass@{pass_k}'] = correct - + sum_correct += correct + + # Extract overall from JSON + if 'overall' in json_data['aggregated_results']: + overall_correct = json_data['aggregated_results']['overall']['correct'] + overall_results[f'pass@{pass_k}'] = overall_correct + + # Assert that the sum matches the overall + assert sum_correct == overall_correct, ( + f"Mismatch in pass@{pass_k}: sum of datasets ({sum_correct}) != " + f"overall ({overall_correct})" + ) + # Handle single-pass results elif 'results' in json_data: + # Calculate sum of individual datasets for verification + sum_correct = 0 for dataset_stat in json_data['results']['datasets']: dataset_name = dataset_stat['dataset'] correct = dataset_stat['correct'] results[dataset_name]['run_1'] = correct results[dataset_name]['pass@1'] = correct - + sum_correct += correct + + # Extract overall from JSON if available + if 'overall' in json_data['results']: + overall_correct = json_data['results']['overall']['correct'] + overall_results['run_1'] = overall_correct + overall_results['pass@1'] = overall_correct + + # Assert that the sum matches the overall + assert sum_correct == overall_correct, ( + f"Mismatch in run_1: sum of datasets ({sum_correct}) != " + f"overall ({overall_correct})" + ) + + # Add overall results + if overall_results: + results['overall'] = overall_results + return dict(results) @@ -152,22 +200,27 @@ def 
collect_to_csv(json_files: List[str], output_csv: str, all_datasets.update(result['results'].keys()) # Sort datasets (use provided order or alphabetical) + # Always put 'overall' at the end + all_datasets_no_overall = all_datasets - {'overall'} + if dataset_order: # Use provided order, put remaining datasets at the end sorted_datasets = [] for ds in dataset_order: - if ds.lower() in [d.lower() for d in all_datasets]: + if ds.lower() in [d.lower() for d in all_datasets_no_overall]: # Find the actual dataset name (case-sensitive) - actual_name = next( - d for d in all_datasets if d.lower() == ds.lower()) + actual_name = next(d for d in all_datasets_no_overall if d.lower() == ds.lower()) sorted_datasets.append(actual_name) - # Add any datasets not in the order list - remaining = sorted( - [d for d in all_datasets if d not in sorted_datasets]) + # Add any datasets not in the order list (excluding 'overall') + remaining = sorted([d for d in all_datasets_no_overall if d not in sorted_datasets]) sorted_datasets.extend(remaining) else: - sorted_datasets = sorted(all_datasets) - + sorted_datasets = sorted(all_datasets_no_overall) + + # Add 'overall' at the end if it exists + if 'overall' in all_datasets: + sorted_datasets.append('overall') + # Write CSV with open(output_csv, 'w', newline='') as f: writer = csv.writer(f) diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/plot_results.py index 85f2b08e38..5e3484dd88 100755 --- a/language/gpt-oss/plot_results.py +++ b/language/gpt-oss/plot_results.py @@ -249,8 +249,12 @@ def main(): print() # Generate plots for each dataset separately - datasets = sorted(data.keys()) - + # Always put 'overall' at the end + datasets_no_overall = [d for d in data.keys() if d != 'overall'] + datasets = sorted(datasets_no_overall) + if 'overall' in data: + datasets.append('overall') + print(f"Generating plots for {len(datasets)} datasets...\n") for dataset in datasets: From 6cb76981cd3ec8a8efa270b19921841699fdcb99 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 12 Nov 2025 05:22:54 +0000 Subject: [PATCH 099/165] [Automated Commit] Format Codebase --- language/gpt-oss/collect_results_csv.py | 36 +++++++++++++------------ language/gpt-oss/plot_results.py | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/language/gpt-oss/collect_results_csv.py b/language/gpt-oss/collect_results_csv.py index d666855028..10c93a2a37 100755 --- a/language/gpt-oss/collect_results_csv.py +++ b/language/gpt-oss/collect_results_csv.py @@ -78,13 +78,13 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: pass_k = json_data['pass_k'] results = defaultdict(dict) overall_results = {} - + # Extract per-pass results if 'per_pass_results' in json_data: for pass_result in json_data['per_pass_results']: pass_num = pass_result['pass_number'] run_label = f"run_{pass_num + 1}" # Convert 0-indexed to 1-indexed - + # Calculate sum of individual datasets for verification sum_correct = 0 for dataset_stat in pass_result['datasets']: @@ -92,18 +92,18 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: correct = dataset_stat['correct'] results[dataset_name][run_label] = correct sum_correct += correct - + # Extract overall from JSON if 'overall' in pass_result: overall_correct = pass_result['overall']['correct'] overall_results[run_label] = overall_correct - + # Assert that the sum matches the overall assert sum_correct == overall_correct, ( f"Mismatch in {run_label}: sum of datasets ({sum_correct}) != " f"overall 
({overall_correct})" ) - + # Extract aggregated results if 'aggregated_results' in json_data: # Calculate sum of individual datasets for verification @@ -113,18 +113,18 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: correct = dataset_stat['correct'] results[dataset_name][f'pass@{pass_k}'] = correct sum_correct += correct - + # Extract overall from JSON if 'overall' in json_data['aggregated_results']: overall_correct = json_data['aggregated_results']['overall']['correct'] overall_results[f'pass@{pass_k}'] = overall_correct - + # Assert that the sum matches the overall assert sum_correct == overall_correct, ( f"Mismatch in pass@{pass_k}: sum of datasets ({sum_correct}) != " f"overall ({overall_correct})" ) - + # Handle single-pass results elif 'results' in json_data: # Calculate sum of individual datasets for verification @@ -135,23 +135,23 @@ def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: results[dataset_name]['run_1'] = correct results[dataset_name]['pass@1'] = correct sum_correct += correct - + # Extract overall from JSON if available if 'overall' in json_data['results']: overall_correct = json_data['results']['overall']['correct'] overall_results['run_1'] = overall_correct overall_results['pass@1'] = overall_correct - + # Assert that the sum matches the overall assert sum_correct == overall_correct, ( f"Mismatch in run_1: sum of datasets ({sum_correct}) != " f"overall ({overall_correct})" ) - + # Add overall results if overall_results: results['overall'] = overall_results - + return dict(results) @@ -202,25 +202,27 @@ def collect_to_csv(json_files: List[str], output_csv: str, # Sort datasets (use provided order or alphabetical) # Always put 'overall' at the end all_datasets_no_overall = all_datasets - {'overall'} - + if dataset_order: # Use provided order, put remaining datasets at the end sorted_datasets = [] for ds in dataset_order: if ds.lower() in [d.lower() for d in all_datasets_no_overall]: # Find the actual dataset name (case-sensitive) - actual_name = next(d for d in all_datasets_no_overall if d.lower() == ds.lower()) + actual_name = next( + d for d in all_datasets_no_overall if d.lower() == ds.lower()) sorted_datasets.append(actual_name) # Add any datasets not in the order list (excluding 'overall') - remaining = sorted([d for d in all_datasets_no_overall if d not in sorted_datasets]) + remaining = sorted( + [d for d in all_datasets_no_overall if d not in sorted_datasets]) sorted_datasets.extend(remaining) else: sorted_datasets = sorted(all_datasets_no_overall) - + # Add 'overall' at the end if it exists if 'overall' in all_datasets: sorted_datasets.append('overall') - + # Write CSV with open(output_csv, 'w', newline='') as f: writer = csv.writer(f) diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/plot_results.py index 5e3484dd88..2de422deb7 100755 --- a/language/gpt-oss/plot_results.py +++ b/language/gpt-oss/plot_results.py @@ -254,7 +254,7 @@ def main(): datasets = sorted(datasets_no_overall) if 'overall' in data: datasets.append('overall') - + print(f"Generating plots for {len(datasets)} datasets...\n") for dataset in datasets: From b281727e43b332f406f82b17dd0a24613b1ba5f4 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 12 Nov 2025 05:28:13 +0000 Subject: [PATCH 100/165] add glob matching --- language/gpt-oss/summarize_eval.py | 92 ++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/language/gpt-oss/summarize_eval.py 
b/language/gpt-oss/summarize_eval.py index 4098aed108..739097c68c 100644 --- a/language/gpt-oss/summarize_eval.py +++ b/language/gpt-oss/summarize_eval.py @@ -10,11 +10,49 @@ import pickle import sys import json +import glob from pathlib import Path from typing import Dict, Any, List import pandas as pd +def expand_glob_patterns(patterns: List[str]) -> List[str]: + """Expand glob patterns to actual file paths. + + Args: + patterns: List of file paths or glob patterns (e.g., '*.pkl', 'data/*_evaluated.pkl') + + Returns: + List of actual file paths (sorted) + """ + expanded_files = [] + + for pattern in patterns: + # If it's a literal file path that exists, use it directly + if Path(pattern).exists() and not any( + c in pattern for c in ['*', '?', '[', ']']): + expanded_files.append(pattern) + else: + # Try to expand as a glob pattern + matches = glob.glob(pattern) + if matches: + expanded_files.extend(matches) + else: + # If no matches and it's not a glob pattern, report the file as + # missing + if not any(c in pattern for c in ['*', '?', '[', ']']): + print( + f"Warning: File not found: {pattern}", + file=sys.stderr) + else: + print( + f"Warning: No files matched pattern: {pattern}", + file=sys.stderr) + + # Remove duplicates and sort + return sorted(set(expanded_files)) + + def detect_pass_k(df: pd.DataFrame) -> int: """Detect if DataFrame has pass@k format and return k. @@ -283,34 +321,50 @@ def summarize_evaluation(pickle_path: str, json_output: bool = False) -> str: def main(): parser = argparse.ArgumentParser( - description="Summarize evaluation results by dataset") - parser.add_argument("input_file", - help="Path to evaluated pickle file from eval_accuracy.py") + description="Summarize evaluation results by dataset. Supports glob patterns.", + epilog="Examples:\n" + " %(prog)s results_evaluated.pkl\n" + " %(prog)s data/*_evaluated.pkl\n" + " %(prog)s --json data/accuracy_eval_*_evaluated.pkl", + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("input_files", nargs='+', + help="One or more paths to evaluated pickle files or glob patterns (e.g., '*.pkl', 'data/*_evaluated.pkl')") parser.add_argument("--json", action="store_true", help="Output results in JSON format (for programmatic use)") args = parser.parse_args() - # Check if input file exists - if not Path(args.input_file).exists(): + # Expand glob patterns + expanded_files = expand_glob_patterns(args.input_files) + + if not expanded_files: print( - f"Error: Input file not found: {args.input_file}", + "Error: No files found matching the provided patterns", file=sys.stderr) sys.exit(1) - # Check if file has _evaluated suffix (warn if not) - if "_evaluated" not in args.input_file: - print(f"Warning: Input file does not contain '_evaluated' suffix. " - f"Make sure this is an evaluated pickle file from eval_accuracy.py", - file=sys.stderr) - - try: - summarize_evaluation(args.input_file, json_output=args.json) - except Exception as e: - print(f"Error processing file: {e}", file=sys.stderr) - import traceback - traceback.print_exc() - sys.exit(1) + print(f"Found {len(expanded_files)} file(s) to process:") + for f in expanded_files: + print(f" - {f}") + print() + + # Process each file + for input_file in expanded_files: + # Check if file has _evaluated suffix (warn if not) + if "_evaluated" not in input_file: + print(f"Warning: Input file does not contain '_evaluated' suffix. 
" + f"Make sure this is an evaluated pickle file from eval_accuracy.py", + file=sys.stderr) + + try: + summarize_evaluation(input_file, json_output=args.json) + print() # Add spacing between files + except Exception as e: + print(f"Error processing file {input_file}: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + # Continue processing other files + continue if __name__ == "__main__": From 9a0c45ae777f771d405ae001961749842781bb7e Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:08:22 +0000 Subject: [PATCH 101/165] rename --- language/gpt-oss/data/{fetch_all.py => fetch_accuracy_eval.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/data/{fetch_all.py => fetch_accuracy_eval.py} (100%) diff --git a/language/gpt-oss/data/fetch_all.py b/language/gpt-oss/data/fetch_accuracy_eval.py similarity index 100% rename from language/gpt-oss/data/fetch_all.py rename to language/gpt-oss/data/fetch_accuracy_eval.py From 35ea0e4c3419a5d5af11ffb7e6a950baa9dffd2b Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:08:52 +0000 Subject: [PATCH 102/165] add pubmed tokenization --- language/gpt-oss/harmonize_inputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/harmonize_inputs.py index d433d731c4..510ec81195 100644 --- a/language/gpt-oss/harmonize_inputs.py +++ b/language/gpt-oss/harmonize_inputs.py @@ -442,6 +442,7 @@ def process_row(args): 'aime1983': create_aime1983_prompt, 'aime2025': create_aime1983_prompt, 'arxiv_summarization': create_arxiv_summarization_prompt, + 'pubmed_summarization': create_arxiv_summarization_prompt, 'gpqa': create_gpqa_prompt, 'livecodebench': create_livecodebench_prompt, 'math500': create_math500_prompt, From 1c73423860e84fd194f03c3a759b5fcbc12d5844 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:09:28 +0000 Subject: [PATCH 103/165] updates --- language/gpt-oss/plot.py | 74 +++++++++++++++++++---- language/gpt-oss/plot_results.py | 86 ++++++++++++++++++++++++--- language/gpt-oss/run_server_trtllm.sh | 25 ++++---- 3 files changed, 157 insertions(+), 28 deletions(-) diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/plot.py index 1c5da36983..b0ecda74fa 100644 --- a/language/gpt-oss/plot.py +++ b/language/gpt-oss/plot.py @@ -58,12 +58,22 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): ax.set_ylabel('Frequency', fontsize=10) ax.grid(True, alpha=0.3) + # Add percentile lines + p50 = dataset_data.quantile(0.50) + p75 = dataset_data.quantile(0.75) + p99 = dataset_data.quantile(0.99) + + ax.axvline(p50, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'50th: {p50:.1f}') + ax.axvline(p75, color='orange', linestyle='--', linewidth=2, alpha=0.7, label=f'75th: {p75:.1f}') + ax.axvline(p99, color='red', linestyle='--', linewidth=2, alpha=0.7, label=f'99th: {p99:.1f}') + ax.legend(loc='upper right', fontsize=8) + # Add statistics mean_val = dataset_data.mean() median_val = dataset_data.median() std_val = dataset_data.std() stats_text = f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}' - ax.text(0.98, 0.98, stats_text, + ax.text(0.98, 0.78, stats_text, transform=ax.transAxes, verticalalignment='top', horizontalalignment='right', @@ -80,7 +90,7 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): plt.close() -def create_full_histogram(df, column_name, title, filename, output_dir): +def create_full_histogram(df, 
column_name, title, filename, output_dir, save_bins=False): """Create a single histogram combining all datasets.""" print(f"Creating {filename}...") print(f" Total samples: {len(df)}") @@ -88,7 +98,7 @@ def create_full_histogram(df, column_name, title, filename, output_dir): plt.figure(figsize=(12, 8)) color = 'skyblue' if 'OSL' in title else 'lightcoral' - plt.hist( + counts, bin_edges, patches = plt.hist( df[column_name], bins=50, alpha=0.7, @@ -107,6 +117,16 @@ def create_full_histogram(df, column_name, title, filename, output_dir): plt.ylabel('Frequency', fontsize=12) plt.grid(True, alpha=0.3) + # Add percentile lines + p50 = df[column_name].quantile(0.50) + p75 = df[column_name].quantile(0.75) + p99 = df[column_name].quantile(0.99) + + plt.axvline(p50, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'50th percentile: {p50:.1f}') + plt.axvline(p75, color='orange', linestyle='--', linewidth=2, alpha=0.7, label=f'75th percentile: {p75:.1f}') + plt.axvline(p99, color='red', linestyle='--', linewidth=2, alpha=0.7, label=f'99th percentile: {p99:.1f}') + plt.legend(loc='upper right', fontsize=10) + # Add statistics mean_val = df[column_name].mean() median_val = df[column_name].median() @@ -121,7 +141,7 @@ def create_full_histogram(df, column_name, title, filename, output_dir): stats_text += f'Min: {min_val}\n' stats_text += f'Max: {max_val}' - plt.text(0.98, 0.98, stats_text, + plt.text(0.98, 0.78, stats_text, transform=plt.gca().transAxes, verticalalignment='top', horizontalalignment='right', @@ -132,6 +152,29 @@ def create_full_histogram(df, column_name, title, filename, output_dir): plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') print(f" Saved to {output_dir}/{filename}") plt.close() + + # Save bin data to CSV if requested + if save_bins: + csv_filename = filename.replace('.png', '_bins.csv') + + # Create bin data DataFrame + bin_data = pd.DataFrame({ + 'bin_lower': bin_edges[:-1], + 'bin_upper': bin_edges[1:], + 'bin_center': (bin_edges[:-1] + bin_edges[1:]) / 2, + 'count': counts.astype(int) + }) + + csv_path = f'{output_dir}/{csv_filename}' + + # Save with header containing percentile information + with open(csv_path, 'w') as f: + f.write(f'# Percentiles: 50th={p50:.2f}, 75th={p75:.2f}, 99th={p99:.2f}\n') + f.write(f'# Mean={mean_val:.2f}, Median={median_val:.2f}, Std={std_val:.2f}\n') + f.write(f'# Min={min_val}, Max={max_val}, Total samples={len(df)}\n') + bin_data.to_csv(f, index=False) + + print(f" Saved bin data to {csv_path}") def main(): @@ -161,6 +204,15 @@ def main(): # Check if prompt_accuracy column exists has_accuracy = 'prompt_accuracy' in df.columns + # Determine which output length column to use + if 'tok_model_output_len' in df.columns: + output_len_col = 'tok_model_output_len' + elif 'tok_model_output_len_0' in df.columns: + output_len_col = 'tok_model_output_len_0' + print("\nNote: 'tok_model_output_len' not found, using 'tok_model_output_len_0' instead") + else: + raise ValueError("Neither 'tok_model_output_len' nor 'tok_model_output_len_0' column found in data") + if has_accuracy: # Filter for 100% accuracy df_100 = df[df['prompt_accuracy'] == 100.0].copy() @@ -204,7 +256,8 @@ def main(): df, 'tok_input_len', 'Token Input Length (ISL) - All Data', '3_full_ISL.png', - args.output_dir) + args.output_dir, + save_bins=True) # 4. Full ISL histogram (accuracy == 100) if has_accuracy and len(df_100) > 0: @@ -225,7 +278,7 @@ def main(): # 5. 
Per dataset OSL histogram if has_dataset: create_per_dataset_histogram( - df, 'tok_model_output_len', + df, output_len_col, 'Token Output Length (OSL)', '5_per_dataset_OSL.png', args.output_dir) @@ -235,7 +288,7 @@ def main(): # 6. Per dataset OSL histogram (accuracy == 100) if has_dataset and has_accuracy and len(df_100) > 0: create_per_dataset_histogram( - df_100, 'tok_model_output_len', + df_100, output_len_col, 'Token Output Length (OSL) - 100% Accuracy', '6_per_dataset_OSL_acc100.png', args.output_dir) @@ -248,15 +301,16 @@ def main(): # 7. Full OSL histogram create_full_histogram( - df, 'tok_model_output_len', + df, output_len_col, 'Token Output Length (OSL) - All Data', '7_full_OSL.png', - args.output_dir) + args.output_dir, + save_bins=True) # 8. Full OSL histogram (accuracy == 100) if has_accuracy and len(df_100) > 0: create_full_histogram( - df_100, 'tok_model_output_len', + df_100, output_len_col, 'Token Output Length (OSL) - 100% Accuracy', '8_full_OSL_acc100.png', args.output_dir) diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/plot_results.py index 2de422deb7..042ca0efc8 100755 --- a/language/gpt-oss/plot_results.py +++ b/language/gpt-oss/plot_results.py @@ -10,6 +10,7 @@ import argparse import sys import csv +import re from pathlib import Path from typing import Dict, List from collections import defaultdict @@ -17,7 +18,7 @@ import numpy as np -def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: +def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[float]]]: """Load CSV data and organize by dataset. Returns: @@ -25,7 +26,8 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: 'aime': { 'run_1': [735, 752, 765, ...], 'run_2': [740, 754, 765, ...], - 'pass@5': [875, 875, 885, ...] + 'pass@5': [875, 875, 885, ...], + 'pass@1 with 5 repeats: (average of run_k)': [861, 857, ...] }, 'gpqa': {...}, ... @@ -40,7 +42,8 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: for key, value in row.items(): if key != 'dataset' and value: try: - data[dataset][key].append(int(value)) + # Try to parse as float to support decimal values + data[dataset][key].append(float(value)) except ValueError: continue @@ -48,12 +51,12 @@ def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[int]]]: def create_combined_box_plot(dataset_name: str, - dataset_data: Dict[str, List[int]], + dataset_data: Dict[str, List[float]], run_columns: List[str], passk_columns: List[str], output_file: str, ylabel: str = "Correct Count"): - """Create separate box plots for individual runs and pass@k in the same figure. + """Create separate box plots for individual runs, pass@k, and computed averages in the same figure. 
Args: dataset_name: Name of the dataset @@ -75,7 +78,23 @@ def create_combined_box_plot(dataset_name: str, if col in dataset_data and dataset_data[col]: passk_data.extend(dataset_data[col]) - if not all_runs_data and not passk_data: + # Compute averages from individual runs (average across runs for each trial) + average_data = [] + if run_columns: + # Find the number of trials (minimum length across all run columns) + num_trials = min(len(dataset_data.get(col, [])) for col in run_columns if col in dataset_data) + + # For each trial, compute the average across all runs + for trial_idx in range(num_trials): + trial_values = [] + for col in run_columns: + if col in dataset_data and trial_idx < len(dataset_data[col]): + trial_values.append(dataset_data[col][trial_idx]) + + if trial_values: + average_data.append(np.mean(trial_values)) + + if not all_runs_data and not passk_data and not average_data: print(f"Warning: No data to plot for {dataset_name}") return @@ -85,6 +104,8 @@ def create_combined_box_plot(dataset_name: str, num_plots += 1 if passk_data: num_plots += 1 + if average_data: + num_plots += 1 if num_plots == 0: print(f"Warning: No data to plot for {dataset_name}") @@ -146,6 +167,7 @@ def create_combined_box_plot(dataset_name: str, # Plot pass@k if passk_data: ax = axes[plot_idx] + plot_idx += 1 passk_label = passk_columns[0] if len(passk_columns) == 1 else 'Pass@k' @@ -188,6 +210,51 @@ def create_combined_box_plot(dataset_name: str, ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, verticalalignment='top', bbox=props, family='monospace') + # Plot computed averages + if average_data: + ax = axes[plot_idx] + + average_label = 'Pass@1 (avg of runs)' + + bp = ax.boxplot([average_data], positions=[0], widths=0.5, + patch_artist=True, showmeans=True, + whis=[0, 100], showfliers=False, + meanprops=dict(marker='D', markerfacecolor='red', + markeredgecolor='red', markersize=8)) + + # Color the box + bp['boxes'][0].set_facecolor(plt.cm.Set3(0.9)) + bp['boxes'][0].set_alpha(0.7) + + # Add scatter plot of individual points + # Add small random jitter to x-position for visibility + np.random.seed(42) # For reproducibility + x_jitter = np.random.normal(0, 0.04, size=len(average_data)) + ax.scatter(x_jitter, average_data, alpha=0.4, s=30, + color='darkgreen', zorder=3, edgecolors='black', linewidth=0.5) + + # Set labels + ax.set_xticks([0]) + ax.set_xticklabels([average_label], fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title( + f"{dataset_name} - {average_label}", + fontsize=13, + fontweight='bold') + ax.grid(True, axis='y', alpha=0.3, linestyle='--') + + # Add statistics + mean_val = np.mean(average_data) + std_val = np.std(average_data) + min_val = np.min(average_data) + max_val = np.max(average_data) + n_samples = len(average_data) + + stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val:.1f}\nmax={max_val:.1f}" + props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) + ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, + verticalalignment='top', bbox=props, family='monospace') + # Adjust layout plt.tight_layout() @@ -234,10 +301,12 @@ def main(): all_columns.update(dataset_data.keys()) # Separate run columns from pass@k columns + # Use regex to match only exact pass@k format (e.g., pass@5, pass@10) run_columns = sorted( [col for col in all_columns if col.startswith('run_')]) + passk_pattern = re.compile(r'^pass@\d+$') passk_columns = sorted( - [col for col in all_columns if col.startswith('pass@')]) + [col for 
col in all_columns if passk_pattern.match(col)]) if not run_columns and not passk_columns: print("Error: No run or pass@k columns found in CSV", file=sys.stderr) @@ -246,6 +315,7 @@ def main(): print(f"Found {len(run_columns)} run columns: {', '.join(run_columns)}") print( f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}") + print(f"Will compute averages from individual runs") print() # Generate plots for each dataset separately @@ -260,7 +330,7 @@ def main(): for dataset in datasets: dataset_data = data[dataset] - # Create combined plot: Individual Runs (all combined) vs Pass@k + # Create combined plot: Individual Runs (all combined) vs Pass@k vs Computed Averages if run_columns or passk_columns: output_file = output_dir / f"{args.prefix}_{dataset}.png" print(f"Creating combined box plot for {dataset}...") diff --git a/language/gpt-oss/run_server_trtllm.sh b/language/gpt-oss/run_server_trtllm.sh index 233363a277..c27ce3949e 100755 --- a/language/gpt-oss/run_server_trtllm.sh +++ b/language/gpt-oss/run_server_trtllm.sh @@ -5,6 +5,7 @@ for var in $(compgen -v | grep '^SLURM_'); do unset "$var"; done model_path=openai/gpt-oss-120b extra_args="" output_dir=./data +eagle_path=nvidia/gpt-oss-120b-Eagle3-v2 while [[ $# -gt 0 ]]; do case $1 in @@ -12,12 +13,16 @@ while [[ $# -gt 0 ]]; do model_path=$2 shift 2 ;; - --output_dir) - output_dir=$2 - shift 2 - ;; + --output_dir) + output_dir=$2 + shift 2 + ;; + --eagle_path) + eagle_path=$2 + shift 2 + ;; *) - extra_args="$extra_args $2" + extra_args="$extra_args $2" ;; esac done @@ -29,11 +34,11 @@ enable_autotuner: false cuda_graph_config: max_batch_size: 256 enable_padding: true - # speculative_config: - # decoding_type: Eagle - # max_draft_len: 3 - # speculative_model_dir: - # eagle3_layers_to_capture: [-1] +speculative_config: + decoding_type: Eagle + max_draft_len: 4 + speculative_model_dir: $eagle_path + eagle3_layers_to_capture: [-1] kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.9 From 9a9194f0eb4dd1e861ac625326a94b5b46295411 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:36:08 +0000 Subject: [PATCH 104/165] add tentative gpt-oss fields --- loadgen/mlperf.conf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..390cceddd8 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -25,6 +25,7 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 +gpt-oss.*.performance_sample_count_override = 8036 whisper.*.performance_sample_count_override = 1633 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -66,6 +67,7 @@ llama3_1-8b.*.sample_concatenate_permutation = 1 llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 +gpt-oss.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 @@ -90,6 +92,7 @@ llama3_1-8b.*.use_token_latencies = 1 llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies = 1 +gpt-oss.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 # gptj benchmark infers token latencies @@ -132,6 +135,10 @@ 
deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 +gpt-oss.Server.target_latency = 0 +gpt-oss.Server.ttft_latency = 2000 +gpt-oss.Server.tpot_latency = 20 + *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -155,6 +162,7 @@ llama3_1-8b-edge.Offline.min_query_count = 5000 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 +gpt-oss.Offline.min_query_count = 8036 whisper.Offline.min_query_count = 1633 # These fields should be defined and overridden by user.conf. From 1812c04b512c7386a531e3bb8ca20bc64c4f504a Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:38:32 +0000 Subject: [PATCH 105/165] remove data dir --- language/gpt-oss/data/fetch_accuracy_eval.py | 186 ------------------- 1 file changed, 186 deletions(-) delete mode 100644 language/gpt-oss/data/fetch_accuracy_eval.py diff --git a/language/gpt-oss/data/fetch_accuracy_eval.py b/language/gpt-oss/data/fetch_accuracy_eval.py deleted file mode 100644 index 81413c229a..0000000000 --- a/language/gpt-oss/data/fetch_accuracy_eval.py +++ /dev/null @@ -1,186 +0,0 @@ -from datasets import load_dataset -import pandas as pd -import random -import argparse -import os -import json - -parser = argparse.ArgumentParser( - description="Fetch and combine AIME, GPQA, and LiveCodeBench datasets") -parser.add_argument( - "--output_path", - type=str, - default="./combined_dataset.pkl", - help="Full path to output pickle file") -parser.add_argument( - "--lcb_folder", - type=str, - default="lcb", - help="Folder containing LiveCodeBench repo cloned from HuggingFace") -args = parser.parse_args() - -# Ensure output folder exists -output_dir = os.path.dirname(args.output_path) -if output_dir: - os.makedirs(output_dir, exist_ok=True) - -random.seed(42) - -print("=" * 80) -print("Fetching datasets...") -print("=" * 80) - -# ============================================================================ -# 1. FETCH AIME DATASET -# ============================================================================ -print("\n[1/3] Fetching AIME dataset...") -df_aime = load_dataset("di-zhang-fdu/AIME_1983_2024")['train'].to_pandas() - -# Optional: AIME 2025 datasets -# df_1_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-I")['test'].to_pandas() -# df_2_aime2025 = load_dataset("opencompass/AIME2025", "AIME2025-II")['test'].to_pandas() -# df_aime2025 = pd.concat([df_1_aime2025, df_2_aime2025], ignore_index=True) -# df_aime2025.rename(columns={'answer': 'ground_truth'}, inplace=True) - -df_aime.rename( - columns={ - 'Answer': 'ground_truth', - 'Question': 'question'}, - inplace=True) -df_aime.drop(columns=['Year', 'ID', 'Problem Number', 'Part'], inplace=True) -df_aime['dataset'] = 'aime1983' - -print(f" ✓ AIME dataset loaded: {len(df_aime)} samples") - -# ============================================================================ -# 2. 
FETCH GPQA DATASET -# ============================================================================ -print("\n[2/3] Fetching GPQA dataset...") - -# Note: Login using `huggingface-cli login` to access this dataset if needed -ds_diamond = load_dataset("Idavidrein/gpqa", "gpqa_diamond") - -# Optional: Other GPQA variants -# ds_experts = load_dataset("Idavidrein/gpqa", "gpqa_experts") -# ds_main = load_dataset("Idavidrein/gpqa", "gpqa_main") -# ds_extended = load_dataset("Idavidrein/gpqa", "gpqa_extended") - -df_gpqa = ds_diamond['train'].to_pandas() -# df_experts = ds_experts['train'].to_pandas() -# df_main = ds_main['train'].to_pandas() -# df_extended = ds_extended['train'].to_pandas() - -# df = pd.concat([df_diamond, df_main, df_extended], ignore_index=True) -df_gpqa = df_gpqa[['Question', - 'Correct Answer', - 'High-level domain', - 'Incorrect Answer 1', - 'Incorrect Answer 2', - 'Incorrect Answer 3']].copy() - -# Format questions with multiple choice options -for idx, row in df_gpqa.iterrows(): - options = [str(row[col]) for col in ['Incorrect Answer 1', - 'Incorrect Answer 2', 'Incorrect Answer 3']] - options.append(str(row['Correct Answer'])) - random.shuffle(options) - answer_idx = options.index(str(row['Correct Answer'])) - - options = [option.strip() for option in options] - answer = chr(65 + answer_idx) - - question = f"{row['Question']}\nA) {options[0]}\nB) {options[1]}\nC) {options[2]}\nD) {options[3]}" - df_gpqa.loc[idx, 'Question'] = question - df_gpqa.loc[idx, 'ground_truth'] = answer - -df_gpqa.rename( - columns={ - 'High-level domain': 'domain', - 'Question': 'question'}, - inplace=True) -df_gpqa['dataset'] = 'gpqa' - -print(f" ✓ GPQA dataset loaded: {len(df_gpqa)} samples") - -# ============================================================================ -# 3. FETCH LIVECODEBENCH DATASET -# ============================================================================ -print("\n[3/3] Fetching LiveCodeBench dataset...") -files = [ - "test.jsonl", - "test2.jsonl", - "test3.jsonl", - "test4.jsonl", - "test5.jsonl"] -files = [os.path.join(args.lcb_folder, file) for file in files] - -all_rows = [] -all_columns = set() - -for file in files: - if not os.path.exists(file): - raise FileNotFoundError(f"Error: File not found: {file}") - - with open(file, "r", encoding="utf-8") as f: - for line in f: - row = json.loads(line) - all_columns.update(row.keys()) - all_rows.append(row) - -all_columns = list(all_columns) - -df_lcb = pd.DataFrame(all_rows, columns=all_columns) -df_lcb['dataset'] = 'livecodebench' -df_lcb.drop( - columns=[ - 'private_test_cases', - 'metadata', - 'public_test_cases', - 'contest_id', - 'platform', - 'difficulty', - 'contest_date', - 'question_title'], - inplace=True, - errors='ignore') # Use errors='ignore' in case some columns don't exist - -starter_prompt = """ -### Format: You will use the following starter code to write the solution to the problem and enclose your code within delimiters. -```python -<> -``` -""" - -for idx, row in df_lcb.iterrows(): - starter_code = row['starter_code'] - starter_prompt_filled = starter_prompt.replace( - '<>', starter_code) - df_lcb.loc[idx, 'question'] = df_lcb.loc[idx, - 'question_content'] + starter_prompt_filled - -df_lcb.rename(columns={'question_id': 'ground_truth'}, inplace=True) - -print(f" ✓ LiveCodeBench dataset loaded: {len(df_lcb)} samples") - -# ============================================================================ -# 4. 
COMBINE ALL DATASETS -# ============================================================================ -print("\n" + "=" * 80) -print("Combining datasets...") -print("=" * 80) - -# Combine all dataframes -df_combined = pd.concat([df_aime, df_gpqa, df_lcb], ignore_index=True) - -print(f"\nCombined dataset statistics:") -print(f" • Total samples: {len(df_combined)}") -print(f" • AIME samples: {len(df_aime)}") -print(f" • GPQA samples: {len(df_gpqa)}") -print(f" • LiveCodeBench samples: {len(df_lcb)}") -print(f"\nDataset columns: {list(df_combined.columns)}") - -# Save combined dataset -df_combined.to_pickle(args.output_path) - -print(f"\n✓ Combined dataset saved to: {args.output_path}") -print("=" * 80) From 16af1bf7c494240aa9805d75d432c3ae1ad56ff5 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:39:11 +0000 Subject: [PATCH 106/165] create preprocess module --- language/gpt-oss/{ => preprocess}/harmonize_inputs.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{ => preprocess}/harmonize_inputs.py (100%) diff --git a/language/gpt-oss/harmonize_inputs.py b/language/gpt-oss/preprocess/harmonize_inputs.py similarity index 100% rename from language/gpt-oss/harmonize_inputs.py rename to language/gpt-oss/preprocess/harmonize_inputs.py From 9b4a84c920e43e1791b6d4633c299d54cba6807c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:55:11 +0000 Subject: [PATCH 107/165] move things to archive --- language/gpt-oss/{ => archive}/collect_results_csv.py | 0 language/gpt-oss/{ => archive}/plot.py | 0 language/gpt-oss/{ => archive}/plot_results.py | 0 language/gpt-oss/{ => archive}/summarize_eval.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{ => archive}/collect_results_csv.py (100%) rename language/gpt-oss/{ => archive}/plot.py (100%) rename language/gpt-oss/{ => archive}/plot_results.py (100%) rename language/gpt-oss/{ => archive}/summarize_eval.py (100%) diff --git a/language/gpt-oss/collect_results_csv.py b/language/gpt-oss/archive/collect_results_csv.py similarity index 100% rename from language/gpt-oss/collect_results_csv.py rename to language/gpt-oss/archive/collect_results_csv.py diff --git a/language/gpt-oss/plot.py b/language/gpt-oss/archive/plot.py similarity index 100% rename from language/gpt-oss/plot.py rename to language/gpt-oss/archive/plot.py diff --git a/language/gpt-oss/plot_results.py b/language/gpt-oss/archive/plot_results.py similarity index 100% rename from language/gpt-oss/plot_results.py rename to language/gpt-oss/archive/plot_results.py diff --git a/language/gpt-oss/summarize_eval.py b/language/gpt-oss/archive/summarize_eval.py similarity index 100% rename from language/gpt-oss/summarize_eval.py rename to language/gpt-oss/archive/summarize_eval.py From 9450d461a0392dc908d9fa81bb6be2b4f73ff2f0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 20 Nov 2025 19:55:39 +0000 Subject: [PATCH 108/165] [Automated Commit] Format Codebase --- language/gpt-oss/archive/plot.py | 68 ++++++++++++++++-------- language/gpt-oss/archive/plot_results.py | 13 +++-- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/language/gpt-oss/archive/plot.py b/language/gpt-oss/archive/plot.py index b0ecda74fa..d85d470c22 100644 --- a/language/gpt-oss/archive/plot.py +++ b/language/gpt-oss/archive/plot.py @@ -62,12 +62,30 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): p50 = dataset_data.quantile(0.50) p75 = dataset_data.quantile(0.75) p99 = 
dataset_data.quantile(0.99) - - ax.axvline(p50, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'50th: {p50:.1f}') - ax.axvline(p75, color='orange', linestyle='--', linewidth=2, alpha=0.7, label=f'75th: {p75:.1f}') - ax.axvline(p99, color='red', linestyle='--', linewidth=2, alpha=0.7, label=f'99th: {p99:.1f}') + + ax.axvline( + p50, + color='green', + linestyle='--', + linewidth=2, + alpha=0.7, + label=f'50th: {p50:.1f}') + ax.axvline( + p75, + color='orange', + linestyle='--', + linewidth=2, + alpha=0.7, + label=f'75th: {p75:.1f}') + ax.axvline( + p99, + color='red', + linestyle='--', + linewidth=2, + alpha=0.7, + label=f'99th: {p99:.1f}') ax.legend(loc='upper right', fontsize=8) - + # Add statistics mean_val = dataset_data.mean() median_val = dataset_data.median() @@ -90,7 +108,8 @@ def create_per_dataset_histogram(df, column_name, title, filename, output_dir): plt.close() -def create_full_histogram(df, column_name, title, filename, output_dir, save_bins=False): +def create_full_histogram(df, column_name, title, + filename, output_dir, save_bins=False): """Create a single histogram combining all datasets.""" print(f"Creating {filename}...") print(f" Total samples: {len(df)}") @@ -121,10 +140,13 @@ def create_full_histogram(df, column_name, title, filename, output_dir, save_bin p50 = df[column_name].quantile(0.50) p75 = df[column_name].quantile(0.75) p99 = df[column_name].quantile(0.99) - - plt.axvline(p50, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'50th percentile: {p50:.1f}') - plt.axvline(p75, color='orange', linestyle='--', linewidth=2, alpha=0.7, label=f'75th percentile: {p75:.1f}') - plt.axvline(p99, color='red', linestyle='--', linewidth=2, alpha=0.7, label=f'99th percentile: {p99:.1f}') + + plt.axvline(p50, color='green', linestyle='--', linewidth=2, + alpha=0.7, label=f'50th percentile: {p50:.1f}') + plt.axvline(p75, color='orange', linestyle='--', linewidth=2, + alpha=0.7, label=f'75th percentile: {p75:.1f}') + plt.axvline(p99, color='red', linestyle='--', linewidth=2, + alpha=0.7, label=f'99th percentile: {p99:.1f}') plt.legend(loc='upper right', fontsize=10) # Add statistics @@ -152,11 +174,11 @@ def create_full_histogram(df, column_name, title, filename, output_dir, save_bin plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') print(f" Saved to {output_dir}/{filename}") plt.close() - + # Save bin data to CSV if requested if save_bins: csv_filename = filename.replace('.png', '_bins.csv') - + # Create bin data DataFrame bin_data = pd.DataFrame({ 'bin_lower': bin_edges[:-1], @@ -164,16 +186,19 @@ def create_full_histogram(df, column_name, title, filename, output_dir, save_bin 'bin_center': (bin_edges[:-1] + bin_edges[1:]) / 2, 'count': counts.astype(int) }) - + csv_path = f'{output_dir}/{csv_filename}' - + # Save with header containing percentile information with open(csv_path, 'w') as f: - f.write(f'# Percentiles: 50th={p50:.2f}, 75th={p75:.2f}, 99th={p99:.2f}\n') - f.write(f'# Mean={mean_val:.2f}, Median={median_val:.2f}, Std={std_val:.2f}\n') - f.write(f'# Min={min_val}, Max={max_val}, Total samples={len(df)}\n') + f.write( + f'# Percentiles: 50th={p50:.2f}, 75th={p75:.2f}, 99th={p99:.2f}\n') + f.write( + f'# Mean={mean_val:.2f}, Median={median_val:.2f}, Std={std_val:.2f}\n') + f.write( + f'# Min={min_val}, Max={max_val}, Total samples={len(df)}\n') bin_data.to_csv(f, index=False) - + print(f" Saved bin data to {csv_path}") @@ -203,7 +228,7 @@ def main(): # Check if prompt_accuracy column exists has_accuracy = 
'prompt_accuracy' in df.columns - + # Determine which output length column to use if 'tok_model_output_len' in df.columns: output_len_col = 'tok_model_output_len' @@ -211,8 +236,9 @@ def main(): output_len_col = 'tok_model_output_len_0' print("\nNote: 'tok_model_output_len' not found, using 'tok_model_output_len_0' instead") else: - raise ValueError("Neither 'tok_model_output_len' nor 'tok_model_output_len_0' column found in data") - + raise ValueError( + "Neither 'tok_model_output_len' nor 'tok_model_output_len_0' column found in data") + if has_accuracy: # Filter for 100% accuracy df_100 = df[df['prompt_accuracy'] == 100.0].copy() diff --git a/language/gpt-oss/archive/plot_results.py b/language/gpt-oss/archive/plot_results.py index 042ca0efc8..d3cc889825 100755 --- a/language/gpt-oss/archive/plot_results.py +++ b/language/gpt-oss/archive/plot_results.py @@ -78,19 +78,21 @@ def create_combined_box_plot(dataset_name: str, if col in dataset_data and dataset_data[col]: passk_data.extend(dataset_data[col]) - # Compute averages from individual runs (average across runs for each trial) + # Compute averages from individual runs (average across runs for each + # trial) average_data = [] if run_columns: # Find the number of trials (minimum length across all run columns) - num_trials = min(len(dataset_data.get(col, [])) for col in run_columns if col in dataset_data) - + num_trials = min(len(dataset_data.get(col, [])) + for col in run_columns if col in dataset_data) + # For each trial, compute the average across all runs for trial_idx in range(num_trials): trial_values = [] for col in run_columns: if col in dataset_data and trial_idx < len(dataset_data[col]): trial_values.append(dataset_data[col][trial_idx]) - + if trial_values: average_data.append(np.mean(trial_values)) @@ -330,7 +332,8 @@ def main(): for dataset in datasets: dataset_data = data[dataset] - # Create combined plot: Individual Runs (all combined) vs Pass@k vs Computed Averages + # Create combined plot: Individual Runs (all combined) vs Pass@k vs + # Computed Averages if run_columns or passk_columns: output_file = output_dir / f"{args.prefix}_{dataset}.png" print(f"Creating combined box plot for {dataset}...") From 319e5f7f74f8da1d37cb752dbaaa74266a813d80 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:56:14 +0000 Subject: [PATCH 109/165] rm unused scripts --- language/gpt-oss/preprocess.sh | 7 --- language/gpt-oss/run_server_trtllm.sh | 64 --------------------------- 2 files changed, 71 deletions(-) delete mode 100755 language/gpt-oss/preprocess.sh delete mode 100755 language/gpt-oss/run_server_trtllm.sh diff --git a/language/gpt-oss/preprocess.sh b/language/gpt-oss/preprocess.sh deleted file mode 100755 index e3842c184b..0000000000 --- a/language/gpt-oss/preprocess.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -pip install -r requirements.txt - -git clone https://huggingface.co/datasets/livecodebench/code_generation_lite data/lcb -python3 data/fetch_all.py --output_path data/accuracy_eval_raw.pkl --lcb_folder data/lcb -python3 harmonize_inputs.py --data-file data/accuracy_eval_raw.pkl --output-file data/accuracy_eval_tokenized.pkl --reasoning-effort high --num-processes 32 diff --git a/language/gpt-oss/run_server_trtllm.sh b/language/gpt-oss/run_server_trtllm.sh deleted file mode 100755 index c27ce3949e..0000000000 --- a/language/gpt-oss/run_server_trtllm.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -for var in $(compgen -v | grep '^SLURM_'); do unset "$var"; done - -model_path=openai/gpt-oss-120b 
-extra_args="" -output_dir=./data -eagle_path=nvidia/gpt-oss-120b-Eagle3-v2 - -while [[ $# -gt 0 ]]; do - case $1 in - --model_path) - model_path=$2 - shift 2 - ;; - --output_dir) - output_dir=$2 - shift 2 - ;; - --eagle_path) - eagle_path=$2 - shift 2 - ;; - *) - extra_args="$extra_args $2" - ;; - esac -done - - -cat < config.yml -enable_attention_dp: false -enable_autotuner: false -cuda_graph_config: - max_batch_size: 256 - enable_padding: true -speculative_config: - decoding_type: Eagle - max_draft_len: 4 - speculative_model_dir: $eagle_path - eagle3_layers_to_capture: [-1] -kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 -moe_config: - backend: TRTLLM -print_iter_log: true -EOF - - -gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) - -set -x; - -for ((gpu=0; gpu $output_dir/trtllm-serve-$gpu 2>&1 -done - -# num_servers=2 -# CUDA_VISIBLE_DEVICES=0,1,2,3 TRTLLM_ENABLE_PDL=1 trtllm-serve $model_path --host 0.0.0.0 --port 30000 --backend pytorch --max_batch_size 1024 --tp_size 4 --ep_size 1 --trust_remote_code --extra_llm_api_options config.yml $extra_args & > $output_dir/trtllm-serve-0.log 2>&1 -# CUDA_VISIBLE_DEVICES=4,5,6,7 TRTLLM_ENABLE_PDL=1 trtllm-serve $model_path --host 0.0.0.0 --port 30001 --backend pytorch --max_batch_size 1024 --tp_size 4 --ep_size 1 --trust_remote_code --extra_llm_api_options config.yml $extra_args & > $output_dir/trtllm-serve-1.log 2>&1 - -wait - From 4d89b98ea2bdf836558f91865c3c50cba4ae3a93 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 19:59:49 +0000 Subject: [PATCH 110/165] rm unused --- language/gpt-oss/run_infer_trtllm.py | 642 ------------------------ language/gpt-oss/run_pipeline_trtllm.sh | 39 -- 2 files changed, 681 deletions(-) delete mode 100644 language/gpt-oss/run_infer_trtllm.py delete mode 100755 language/gpt-oss/run_pipeline_trtllm.sh diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py deleted file mode 100644 index adac3c6271..0000000000 --- a/language/gpt-oss/run_infer_trtllm.py +++ /dev/null @@ -1,642 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint. -Supports round-robin load balancing across multiple server endpoints. 
- -Usage: - python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] - -Arguments: - --input-tokens Path to pickle file containing data with text_input column from harmony-tokens.py - --server-url TensorRT-LLM server URL(s) - comma-separated for round-robin (e.g., "localhost:8000,localhost:8001") - --max-samples Maximum number of samples to process (default: all) - --max-tokens Maximum tokens to generate per request (default: 100) - --max-concurrency Maximum number of concurrent requests (default: 256) - --output Output pickle file for responses (optional) - --pass-k Number of inference passes per sample for pass@k strategy (default: 1) - -Examples: - # Single server - python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000 - - # Multiple servers with round-robin - python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002 -""" - -import asyncio -import argparse -import time -import logging -from typing import List, Dict, Any, Tuple -import pandas as pd -from tqdm import tqdm -from transformers import AutoTokenizer -import httpx -from openai import AsyncOpenAI - -# Set up logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -# Suppress verbose HTTP logs from httpx and openai -logging.getLogger("httpx").setLevel(logging.WARNING) -logging.getLogger("openai").setLevel(logging.WARNING) - -# Initialize tokenizer -MODEL_NAME = "openai/gpt-oss-120b" -tokenizer = None - - -def get_tokenizer(): - """Get or initialize the tokenizer.""" - global tokenizer - if tokenizer is None: - logger.info(f"Loading tokenizer for {MODEL_NAME}...") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - logger.info("Tokenizer loaded successfully") - return tokenizer - - -class TRTLLMClient: - """Client for TensorRT-LLM server using OpenAI-compatible endpoint with round-robin support.""" - - def __init__(self, - server_urls: List[str] = None, - temperature: float = 0.001, - top_k: int = 1, - top_p: float = 1.0, - max_concurrency: int = 256, - timeout: int = 1200): - # Support multiple server URLs for round-robin load balancing - if server_urls is None: - server_urls = ["localhost:8000"] - self.server_urls = server_urls - self.num_servers = len(server_urls) - self.current_server_index = 0 - - self.temperature = temperature - self.top_k = top_k - self.top_p = top_p - self.max_concurrency = max_concurrency - self.timeout = timeout - self.model_name = MODEL_NAME - - # Initialize async OpenAI clients (one per server) - self.http_clients = [] - self.async_clients = [] - self.concurrency_semaphore = None - - logger.info( - f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") - - async def initialize(self): - """Initialize OpenAI clients for all servers.""" - # Create semaphore for concurrency control - self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) - - # Create HTTP and OpenAI clients for each server - for server_url in self.server_urls: - # Setup HTTP client with proper connection limits for high - # concurrency - http_client = httpx.AsyncClient( - timeout=httpx.Timeout(self.timeout), - limits=httpx.Limits( - max_keepalive_connections=self.max_concurrency * 2, - max_connections=self.max_concurrency * 2, - ), - http2=True - ) - - # Setup OpenAI client with the configured HTTP client - async_client = AsyncOpenAI( - api_key='dummy', # TensorRT-LLM server doesn't require real API key - 
base_url=f"http://{server_url}/v1/", - timeout=self.timeout, - max_retries=10, - http_client=http_client, - ) - - self.http_clients.append(http_client) - self.async_clients.append(async_client) - - logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)") - - def _get_next_client(self) -> AsyncOpenAI: - """Get the next client using round-robin selection.""" - client = self.async_clients[self.current_server_index] - self.current_server_index = ( - self.current_server_index + 1) % self.num_servers - return client - - async def send_request( - self, prompt: str, max_tokens: int = 100, - sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: - """Send a single request to the TensorRT-LLM server using round-robin. - - Args: - prompt: Text prompt to send - max_tokens: Maximum tokens to generate - sample_id: Sample identifier - pass_num: Pass number for pass@k strategy - - Returns: - Tuple of (sample_id, pass_num, response, latency) - """ - # Prepare generation parameters using OpenAI completions format (as per - # TensorRT-LLM docs) - extra_body = { - # TensorRT-LLM specific parameters - "min_tokens": 1, - } - - # Only include top_k if it's not 0 (so it can default to None on server - # side) - if self.top_k != 0: - extra_body["top_k"] = self.top_k - - gen_params = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": self.temperature, - "top_p": self.top_p, - "stream": False, - "extra_body": extra_body, - } - - try: - # Track latency: time from request sent to response received - start_time = time.time() - - # Select client using round-robin - client = self._get_next_client() - - # Use semaphore for concurrency control - async with self.concurrency_semaphore: - completion = await client.completions.create(**gen_params) - - end_time = time.time() - latency = end_time - start_time - - # Extract response text from completions format - response_text = completion.choices[0].text - - # Tokenize the response to get output_ids (similar to SGLang - # format) - tokenizer = get_tokenizer() - output_ids = tokenizer.encode( - response_text, add_special_tokens=False) - - # Format response similar to SGLang format for compatibility - response = { - "output_ids": output_ids, - "text": response_text, - "meta_info": { - "completion_tokens": len(output_ids), - } - } - - return sample_id, pass_num, response, latency - - except Exception as e: - logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") - return sample_id, pass_num, {"error": str(e)}, None - - async def shutdown(self): - """Clean up resources for all clients.""" - for http_client in self.http_clients: - if http_client: - await http_client.aclose() - - -def load_tokenized_data(data_file: str) -> pd.DataFrame: - """Load data from pickle file produced by harmony-tokens.py.""" - logger.info(f"Loading data from {data_file}") - - # Load DataFrame from pickle - df = pd.read_pickle(data_file) - logger.info(f"Loaded DataFrame with shape: {df.shape}") - - # Check if text_input column exists and has valid data - if 'text_input' in df.columns: - # Check for any None values in text_input - failed_mask = df['text_input'].isna() - failed_count = failed_mask.sum() - - if failed_count > 0: - failed_indices = df[failed_mask].index.unique() - error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}" - logger.error(error_msg) - raise AssertionError(error_msg) - - # Check first sample - first_text = df.iloc[0]['text_input'] - if 
isinstance(first_text, str): - logger.info( - f"First sample text length: {len(first_text)} characters") - else: - logger.warning( - "text_input column exists but first sample is not a string") - - logger.info(f"All {len(df)} samples have valid text_input") - else: - logger.error("No 'text_input' column found in DataFrame") - raise ValueError("DataFrame must contain 'text_input' column") - - return df - - -async def send_requests_async( - tokenized_df: pd.DataFrame, server_urls: List[str], - max_tokens: int = 100, max_concurrency: int = 256, - temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, - timeout: int = 1200, pass_k: int = 1): - """Send all requests to TensorRT-LLM server(s) asynchronously with round-robin load balancing. - - Args: - server_urls: List of server URLs for round-robin load balancing - pass_k: Number of inference passes per sample for pass@k strategy - - Returns: - tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency - """ - num_samples = len(tokenized_df) - total_requests = num_samples * pass_k - logger.info( - f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...") - - # Initialize client with multiple servers for round-robin - client = TRTLLMClient( - server_urls=server_urls, - temperature=temperature, - top_k=top_k, - top_p=top_p, - max_concurrency=max_concurrency, - timeout=timeout - ) - await client.initialize() - - # Prepare all tasks - create pass_k requests per sample - tasks = [] - for idx, row in tokenized_df.iterrows(): - for pass_num in range(pass_k): - task = client.send_request( - row['text_input'], - max_tokens=max_tokens, - sample_id=idx, - pass_num=pass_num - ) - tasks.append(task) - - start_time = time.time() - - # Execute all tasks concurrently with progress bar - results = [] - for coro in tqdm( - asyncio.as_completed(tasks), - total=len(tasks), - desc="Sending requests", - unit="request"): - result = await coro - results.append(result) - - # Group results by sample_id and pass_num - responses_by_pass = {} - latencies_by_pass = {} - for sample_id, pass_num, response, latency in results: - responses_by_pass[(sample_id, pass_num)] = response - latencies_by_pass[(sample_id, pass_num)] = latency - - total_time = time.time() - start_time - logger.info( - f"Completed {total_requests} requests in {total_time:.2f} seconds") - logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") - - # Log latency statistics - valid_latencies = [ - lat for lat in latencies_by_pass.values() if lat is not None] - if valid_latencies: - avg_latency = sum(valid_latencies) / len(valid_latencies) - min_latency = min(valid_latencies) - max_latency = max(valid_latencies) - logger.info( - f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") - - # Shutdown client - await client.shutdown() - - return responses_by_pass, latencies_by_pass - - -def extract_response_ids( - responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: - """Extract response output_ids from TensorRT-LLM responses for all passes. 
- - Args: - responses_by_pass: Dict mapping (sample_id, pass_num) to response - tokenized_df: DataFrame with samples - pass_k: Number of passes per sample - - Returns: - Dict mapping (sample_id, pass_num) to output_ids list - """ - logger.info("Extracting response output_ids...") - - response_ids_by_pass = {} - total_responses = len(tokenized_df) * pass_k - - with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: - for idx, row in tokenized_df.iterrows(): - for pass_num in range(pass_k): - response = responses_by_pass.get((idx, pass_num), {}) - response_id = [] - if "error" not in response and "output_ids" in response: - try: - # TensorRT-LLM returns the generated token IDs in the - # 'output_ids' field - response_id = response["output_ids"] - except Exception as e: - logger.warning( - f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") - response_ids_by_pass[(idx, pass_num)] = response_id - pbar.update(1) - - logger.info("Response output_ids extraction complete") - return response_ids_by_pass - - -def detokenize_output_ids( - response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: - """Detokenize output_ids back to text using AutoTokenizer for all passes. - - Args: - response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids - pass_k: Number of passes per sample - - Returns: - Dict mapping (sample_id, pass_num) to detokenized text - """ - logger.info("Detokenizing output_ids to text...") - - tokenizer = get_tokenizer() - detokenized_texts_by_pass = {} - - for (sample_id, pass_num), token_ids in tqdm( - response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): - try: - # Detokenize the token IDs back to text - text = tokenizer.decode(token_ids, skip_special_tokens=True) - detokenized_texts_by_pass[(sample_id, pass_num)] = text - except Exception as e: - logger.warning( - f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") - detokenized_texts_by_pass[(sample_id, pass_num)] = "" - - logger.info("Output detokenization complete") - return detokenized_texts_by_pass - - -def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], - response_ids_by_pass: Dict[tuple, List[int]], - detokenized_texts_by_pass: Dict[tuple, str], - latencies_by_pass: Dict[tuple, float], - tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: - """Save all responses to DataFrame and optionally to pickle file. - - Args: - responses_by_pass: Dict mapping (sample_id, pass_num) to response - response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids - detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text - latencies_by_pass: Dict mapping (sample_id, pass_num) to latency - tokenized_df: Original DataFrame with samples - pass_k: Number of passes per sample - output_file: Optional output pickle file - - Returns: - DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) - """ - logger.info("Processing responses and updating DataFrame...") - - # Work with the original DataFrame - result_df = tokenized_df.copy() - - # Create columns for each pass with _0, _1, _2, ... 
suffixes - for pass_num in range(pass_k): - # Lists to store data for this pass - model_outputs = [] - tok_model_outputs = [] - tok_model_output_lens = [] - infer_times = [] - - for idx in tokenized_df.index: - key = (idx, pass_num) - detokenized_text = detokenized_texts_by_pass.get(key, "") - response_ids = response_ids_by_pass.get(key, []) - latency = latencies_by_pass.get(key, None) - - model_outputs.append(detokenized_text) - tok_model_outputs.append(response_ids) - tok_model_output_lens.append(len(response_ids)) - infer_times.append(latency) - - # Add columns with suffixes - result_df[f'model_output_{pass_num}'] = model_outputs - result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs - result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens - result_df[f'infer_time_{pass_num}'] = infer_times - - # Calculate output token lengths for logging - all_output_token_lengths = [] - for idx in tokenized_df.index: - for pass_num in range(pass_k): - key = (idx, pass_num) - response = responses_by_pass.get(key, {}) - response_ids = response_ids_by_pass.get(key, []) - try: - output_token_length = response.get( - "meta_info", {}).get( - "completion_tokens", len(response_ids)) - all_output_token_lengths.append(output_token_length) - except Exception as e: - logger.warning( - f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") - all_output_token_lengths.append(len(response_ids)) - - logger.info(f"Updated DataFrame with shape: {result_df.shape}") - new_columns = [ - f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] - logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") - if all_output_token_lengths: - logger.info( - f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") - - # Save to pickle file if output_file is provided - if output_file: - logger.info(f"Saving responses to {output_file}...") - result_df.to_pickle(output_file) - logger.info(f"Responses saved to {output_file}") - - return result_df - - -async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[str], - max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 256, output_file: str = None, - temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, - timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: - """Main processing function that handles requests and response extraction. 
- - Args: - server_urls: List of server URLs for round-robin load balancing - pass_k: Number of inference passes per sample for pass@k strategy - """ - - # Step 1: Limit samples if specified - if max_samples is not None: - tokenized_df = tokenized_df.head(max_samples) - logger.info(f"Limited to first {max_samples} samples") - - # Step 2: Send all requests asynchronously (k passes per sample) - responses_by_pass, latencies_by_pass = await send_requests_async( - tokenized_df, - server_urls, - max_tokens, - max_concurrency, - temperature, - top_k, - top_p, - timeout, - pass_k) - - # Step 3: Extract response output_ids for all passes - response_ids_by_pass = extract_response_ids( - responses_by_pass, tokenized_df, pass_k) - - # Step 4: Detokenize output_ids to text for model_output for all passes - detokenized_texts_by_pass = detokenize_output_ids( - response_ids_by_pass, pass_k) - - # Step 5: Save all results and return DataFrame - result_df = save_responses( - responses_by_pass, - response_ids_by_pass, - detokenized_texts_by_pass, - latencies_by_pass, - tokenized_df, - pass_k, - output_file) - - return result_df - - -def main(): - parser = argparse.ArgumentParser( - description="Send text prompts to TensorRT-LLM server via OpenAI completions endpoint") - parser.add_argument("--input-tokens", required=True, - help="Path to pickle file containing data with text_input column from harmony-tokens.py") - parser.add_argument("--server-url", default="localhost:8000", - help="TensorRT-LLM server URL(s) - comma-separated for round-robin load balancing (default: localhost:8000)") - parser.add_argument("--max-samples", type=int, default=None, - help="Maximum number of samples to process (default: all)") - parser.add_argument("--max-tokens", type=int, default=100, - help="Maximum tokens to generate per request") - parser.add_argument("--max-concurrency", type=int, default=256, - help="Maximum number of concurrent requests (default: 256)") - parser.add_argument("--output", default=None, - help="Output pickle file for responses (optional)") - parser.add_argument("--pass-k", type=int, default=1, - help="Number of inference passes per sample for pass@k strategy (default: 1)") - parser.add_argument("--temperature", type=float, default=0.001, - help="Temperature for sampling (default: 0.001)") - parser.add_argument("--top-k", type=int, default=1, - help="Top-k for sampling (default: 1)") - parser.add_argument("--top-p", type=float, default=1.0, - help="Top-p for sampling (default: 1.0)") - parser.add_argument("--timeout", type=int, default=1200, - help="Timeout for requests (default: 1200)") - - args = parser.parse_args() - - # Parse comma-separated server URLs - server_urls = [url.strip() for url in args.server_url.split(',')] - logger.info( - f"Configured {len(server_urls)} server(s) for round-robin load balancing") - - # Test connection - async def test_connection(): - logger.info(f"Testing server connection(s)...") - client = TRTLLMClient( - server_urls=server_urls, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - max_concurrency=1, - timeout=args.timeout - ) - await client.initialize() - - try: - _, _, test_response, _ = await client.send_request( - prompt="Test", max_tokens=5, sample_id=0, pass_num=0) - if "error" in test_response: - logger.error( - f"Server connection failed: {test_response['error']}") - logger.error( - "Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") - return False - logger.info("Server connection successful") - return True - 
finally: - await client.shutdown() - - # Run connection test - if not asyncio.run(test_connection()): - return - - # Load pre-tokenized data - tokenized_df = load_tokenized_data(args.input_tokens) - - # Process requests and get result DataFrame - result_df = asyncio.run(process_requests_async( - tokenized_df, server_urls, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - max_concurrency=args.max_concurrency, - output_file=args.output, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - timeout=args.timeout, - pass_k=args.pass_k)) - - # Print summary - logger.info(f"\nProcessing completed:") - logger.info(f" - Total samples processed: {len(result_df)}") - logger.info(f" - Number of passes per sample: {args.pass_k}") - logger.info( - f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - - # Calculate average output length across all passes - if args.pass_k == 1: - avg_output_len = result_df['tok_model_output_len_0'].mean() - logger.info(f" - Average output token length: {avg_output_len:.1f}") - else: - all_output_lens = [] - for i in range(args.pass_k): - all_output_lens.extend( - result_df[f'tok_model_output_len_{i}'].tolist()) - avg_output_len = sum(all_output_lens) / \ - len(all_output_lens) if all_output_lens else 0 - logger.info( - f" - Average output token length (across all passes): {avg_output_len:.1f}") - - if args.output: - logger.info(f" - Results saved to: {args.output}") - else: - logger.info(" - Results returned as DataFrame (not saved to file)") - - -if __name__ == "__main__": - main() diff --git a/language/gpt-oss/run_pipeline_trtllm.sh b/language/gpt-oss/run_pipeline_trtllm.sh deleted file mode 100755 index b4b692914d..0000000000 --- a/language/gpt-oss/run_pipeline_trtllm.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -#SBATCH --output slurm_logs/run_%j/stdout.txt - -output_dir=slurm_logs/run_$SLURM_JOBID - -srun_header="srun \ - --container-mounts=$(pwd)/../:/work,/lustre/share/coreai_mlperf_inference/mlperf_inference_storage_clone:/home/mlperf_inference_storage/ \ - --container-name=trtllm_gptoss_2 \ - --container-mount-home --container-remap-root --container-workdir /work/gpt-oss" - -set -x - -$srun_header --container-image ./sqsh_files/trtllm_with_nettools.sqsh --output slurm_logs/run_$SLURM_JOBID/server_output.log ./run_server_trtllm.sh --model_path /home/mlperf_inference_storage/models/gpt-oss/gpt-oss-120b --output_dir $output_dir & - -sleep 20 - -$srun_header --overlap /bin/bash -c ' - while ! netstat -tuln | grep -q ":30000 .*LISTEN"; do - sleep 5 - done -' - -$srun_header --overlap /bin/bash -c ' - while ! 
netstat -tuln | grep -q ":30007 .*LISTEN"; do - sleep 5 - done -' - -sleep 20 - -$srun_header --overlap python3 run_infer_trtllm.py \ - --input-tokens data/accuracy_eval_tokenized.pkl \ - --output data/accuracy_eval_inferred_trtllm_job-$SLURM_JOBID-nongreedy_temp1_top-p1.pkl \ - --max-tokens 32768 \ - --server-url localhost:30000,localhost:30001,localhost:30002,localhost:30003,localhost:30004,localhost:30005,localhost:30006,localhost:30007 \ - --max-concurrency 2048 \ - --pass-k 5 \ - --temperature 1.0 \ - --top-p 1.0 --top-k 0 --timeout 2400 From be02519a9f00fbe8b51488bd62d828122348373a Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 20:00:51 +0000 Subject: [PATCH 111/165] mv things --- language/gpt-oss/{ => archive}/run_infer.py | 0 language/gpt-oss/{ => sglang}/run_server.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{ => archive}/run_infer.py (100%) rename language/gpt-oss/{ => sglang}/run_server.sh (100%) diff --git a/language/gpt-oss/run_infer.py b/language/gpt-oss/archive/run_infer.py similarity index 100% rename from language/gpt-oss/run_infer.py rename to language/gpt-oss/archive/run_infer.py diff --git a/language/gpt-oss/run_server.sh b/language/gpt-oss/sglang/run_server.sh similarity index 100% rename from language/gpt-oss/run_server.sh rename to language/gpt-oss/sglang/run_server.sh From 18a8444962798ac668196e67905eddd68519d21d Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 21:50:16 +0000 Subject: [PATCH 112/165] add mlperf artifacts --- language/gpt-oss/backends/__init__.py | 11 + language/gpt-oss/backends/base_backend.py | 78 ++++ language/gpt-oss/backends/sglang_backend.py | 337 ++++++++++++++ language/gpt-oss/run_mlperf.py | 463 ++++++++++++++++++++ 4 files changed, 889 insertions(+) create mode 100644 language/gpt-oss/backends/__init__.py create mode 100644 language/gpt-oss/backends/base_backend.py create mode 100644 language/gpt-oss/backends/sglang_backend.py create mode 100755 language/gpt-oss/run_mlperf.py diff --git a/language/gpt-oss/backends/__init__.py b/language/gpt-oss/backends/__init__.py new file mode 100644 index 0000000000..70eab9bf8c --- /dev/null +++ b/language/gpt-oss/backends/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +"""Backend implementations for gpt-oss inference.""" + +from .base_backend import BaseBackend +from .sglang_backend import SGLangBackend + +__all__ = [ + "BaseBackend", + "SGLangBackend", +] + diff --git a/language/gpt-oss/backends/base_backend.py b/language/gpt-oss/backends/base_backend.py new file mode 100644 index 0000000000..0f65434b58 --- /dev/null +++ b/language/gpt-oss/backends/base_backend.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +"""Base backend class for gpt-oss inference.""" + +import abc +import logging +from typing import List, Dict, Any, Optional + +logger = logging.getLogger(__name__) + + +class BaseBackend(abc.ABC): + """Abstract base class for inference backends. + + All backends must implement this interface to work with the MLPerf SUT. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """Initialize the backend. 
+ + Args: + config: Optional configuration dictionary + """ + self.config = config or {} + self.initialized = False + logger.info(f"Initializing {self.__class__.__name__}") + + @abc.abstractmethod + def initialize(self) -> None: + """Initialize the backend (load model, connect to server, etc.).""" + raise NotImplementedError("Subclasses must implement initialize()") + + @abc.abstractmethod + def generate( + self, + prompts: List[List[int]], + max_tokens: int = 100, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + **kwargs + ) -> List[Dict[str, Any]]: + """Generate responses for a batch of prompts. + + Args: + prompts: List of token ID sequences + max_tokens: Maximum tokens to generate per prompt + temperature: Sampling temperature + top_k: Top-k sampling parameter + top_p: Top-p (nucleus) sampling parameter + **kwargs: Additional backend-specific parameters + + Returns: + List of response dictionaries with keys: + - output_ids: List of generated token IDs + - output_text: Generated text (optional) + - metadata: Additional metadata (latencies, etc.) + """ + raise NotImplementedError("Subclasses must implement generate()") + + @abc.abstractmethod + def cleanup(self) -> None: + """Clean up backend resources.""" + raise NotImplementedError("Subclasses must implement cleanup()") + + def __enter__(self): + """Context manager entry.""" + self.initialize() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.cleanup() + + @property + def is_initialized(self) -> bool: + """Check if backend is initialized.""" + return self.initialized + diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py new file mode 100644 index 0000000000..05c9600478 --- /dev/null +++ b/language/gpt-oss/backends/sglang_backend.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +"""SGLang backend implementation for gpt-oss.""" + +import asyncio +import json +import logging +import requests +import time +from typing import List, Dict, Any, Optional, AsyncIterator +import aiohttp +from .base_backend import BaseBackend + +logger = logging.getLogger(__name__) + + +class SGLangBackend(BaseBackend): + """SGLang inference backend using HTTP API. + + Connects to an SGLang server running the gpt-oss model. + """ + + def __init__( + self, + server_url: str = "http://localhost:30000", + timeout: int = 1200, + **kwargs + ): + """Initialize SGLang backend. 
+ + Args: + server_url: URL of the SGLang server + timeout: Request timeout in seconds + **kwargs: Additional configuration + """ + config = { + "server_url": server_url, + "timeout": timeout, + **kwargs + } + super().__init__(config) + self.server_url = server_url + self.timeout = timeout + self.session = None + + def initialize(self) -> None: + """Initialize connection to SGLang server.""" + if self.initialized: + logger.warning("Backend already initialized") + return + + logger.info(f"Connecting to SGLang server at {self.server_url}") + self.session = requests.Session() + + # Test connection with a simple request + try: + test_response = self._send_request( + input_ids=[1, 2, 3], + max_tokens=5, + temperature=0.001, + top_k=1, + top_p=1.0 + ) + if "error" in test_response: + raise ConnectionError( + f"Failed to connect to SGLang server: {test_response['error']}" + ) + logger.info("Successfully connected to SGLang server") + self.initialized = True + except Exception as e: + logger.error(f"Failed to initialize SGLang backend: {e}") + raise + + def _send_request( + self, + input_ids: List[int], + max_tokens: int, + temperature: float, + top_k: int, + top_p: float + ) -> Dict[str, Any]: + """Send a single request to the SGLang server. + + Args: + input_ids: Token IDs for the prompt + max_tokens: Maximum tokens to generate + temperature: Sampling temperature + top_k: Top-k parameter + top_p: Top-p parameter + + Returns: + Response dictionary from the server + """ + payload = { + "input_ids": input_ids, + "sampling_params": { + "max_new_tokens": max_tokens, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + } + } + + try: + response = self.session.post( + f"{self.server_url}/generate", + json=payload, + timeout=self.timeout, + ) + if response.status_code == 200: + return response.json() + else: + logger.error( + f"Request failed with status {response.status_code}: {response.text}" + ) + return {"error": f"HTTP {response.status_code}: {response.text}"} + except requests.exceptions.RequestException as e: + logger.error(f"Request failed: {e}") + return {"error": str(e)} + + def generate( + self, + prompts: List[List[int]], + max_tokens: int = 100, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + **kwargs + ) -> List[Dict[str, Any]]: + """Generate responses for a batch of prompts. + + Args: + prompts: List of token ID sequences + max_tokens: Maximum tokens to generate per prompt + temperature: Sampling temperature + top_k: Top-k sampling parameter + top_p: Top-p (nucleus) sampling parameter + **kwargs: Additional parameters (ignored) + + Returns: + List of response dictionaries with keys: + - output_ids: List of generated token IDs + - output_text: Generated text (if available) + - metadata: Additional metadata (latencies, etc.) + """ + if not self.initialized: + raise RuntimeError("Backend not initialized. 
Call initialize() first.") + + results = [] + for prompt_ids in prompts: + start_time = time.time() + response = self._send_request( + input_ids=prompt_ids, + max_tokens=max_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p + ) + end_time = time.time() + latency = end_time - start_time + + # Extract output_ids from response + output_ids = [] + output_text = "" + if "error" not in response: + output_ids = response.get("output_ids", []) + output_text = response.get("text", "") + + result = { + "output_ids": output_ids, + "output_text": output_text, + "metadata": { + "latency": latency, + "completion_tokens": response.get("meta_info", {}).get( + "completion_tokens", len(output_ids) + ), + "error": response.get("error"), + } + } + results.append(result) + + return results + + async def generate_stream( + self, + input_ids: List[int], + max_tokens: int = 100, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + **kwargs + ) -> AsyncIterator[Dict[str, Any]]: + """Generate response with streaming support. + + Yields incremental responses as tokens are generated. + + Args: + input_ids: Token IDs for the prompt + max_tokens: Maximum tokens to generate + temperature: Sampling temperature + top_k: Top-k parameter + top_p: Top-p parameter + + Yields: + Dict with: + - delta_token_ids: List of new token IDs in this chunk + - delta_text: New text in this chunk + - is_first_token: True if this is the first token + - is_finished: True if generation is complete + - accumulated_token_ids: All tokens generated so far + - metadata: Additional info (TTFT, etc.) + """ + if not self.initialized: + raise RuntimeError("Backend not initialized. Call initialize() first.") + + payload = { + "input_ids": input_ids, + "sampling_params": { + "max_new_tokens": max_tokens, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + }, + "stream": True # Enable streaming + } + + start_time = time.time() + first_token_time = None + accumulated_token_ids = [] + accumulated_text = "" + is_first = True + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + f"{self.server_url}/generate", + json=payload, + timeout=aiohttp.ClientTimeout(total=self.timeout) + ) as response: + if response.status != 200: + error_text = await response.text() + logger.error(f"Streaming request failed: {response.status} - {error_text}") + yield { + "delta_token_ids": [], + "delta_text": "", + "is_first_token": False, + "is_finished": True, + "accumulated_token_ids": [], + "error": f"HTTP {response.status}: {error_text}", + "metadata": {} + } + return + + # Read streaming response + async for line in response.content: + if not line: + continue + + # SGLang sends data as "data: {...}\n\n" + line_str = line.decode('utf-8').strip() + if not line_str.startswith('data:'): + continue + + try: + json_str = line_str[5:].strip() # Remove "data:" prefix + if json_str == '[DONE]': + break + + chunk = json.loads(json_str) + + # Extract token information from chunk + delta_token_ids = chunk.get("token_ids", []) + delta_text = chunk.get("text", "") + + if delta_token_ids: + accumulated_token_ids.extend(delta_token_ids) + if delta_text: + accumulated_text += delta_text + + # Mark first token timing + if is_first and delta_token_ids: + first_token_time = time.time() + is_first = False + + is_finished = chunk.get("finished", False) + + yield { + "delta_token_ids": delta_token_ids, + "delta_text": delta_text, + "is_first_token": (first_token_time is not None and is_first is False and 
len(accumulated_token_ids) <= len(delta_token_ids)), + "is_finished": is_finished, + "accumulated_token_ids": accumulated_token_ids.copy(), + "accumulated_text": accumulated_text, + "metadata": { + "ttft_ms": (first_token_time - start_time) * 1000 if first_token_time else None, + "latency_ms": (time.time() - start_time) * 1000, + **chunk.get("meta_info", {}) + } + } + + if is_finished: + break + + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse streaming chunk: {e}") + continue + + except asyncio.TimeoutError: + logger.error(f"Streaming request timed out after {self.timeout}s") + yield { + "delta_token_ids": [], + "delta_text": "", + "is_first_token": False, + "is_finished": True, + "accumulated_token_ids": accumulated_token_ids, + "error": "Timeout", + "metadata": {} + } + except Exception as e: + logger.error(f"Streaming request failed: {e}", exc_info=True) + yield { + "delta_token_ids": [], + "delta_text": "", + "is_first_token": False, + "is_finished": True, + "accumulated_token_ids": accumulated_token_ids, + "error": str(e), + "metadata": {} + } + + def cleanup(self) -> None: + """Clean up backend resources.""" + if self.session: + self.session.close() + self.session = None + self.initialized = False + logger.info("SGLang backend cleaned up") + diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py new file mode 100755 index 0000000000..e22b16538e --- /dev/null +++ b/language/gpt-oss/run_mlperf.py @@ -0,0 +1,463 @@ +#!/usr/bin/env python3 +"""MLPerf inference benchmark runner for gpt-oss. + +This script integrates the gpt-oss model with MLPerf LoadGen for +performance and accuracy benchmarking. + +Usage: + # Offline scenario (performance) + python run_mlperf.py --mode offline --input-file data/accuracy_eval_tokenized.pkl + + # Server scenario (performance) + python run_mlperf.py --mode server --input-file data/accuracy_eval_tokenized.pkl + + # Accuracy mode + python run_mlperf.py --mode offline --accuracy --input-file data/accuracy_eval_tokenized.pkl +""" + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Optional + +import mlperf_loadgen as lg +import pandas as pd +from tqdm import tqdm + +from backends import SGLangBackend +from mlperf import OfflineSUT, ServerSUT, QuerySampleLibrary +from utils import load_tokenized_dataset, StandardTokenizer + +# Disable tokenizers parallelism to avoid forking issues +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def create_argument_parser() -> argparse.ArgumentParser: + """Create argument parser for MLPerf runner.""" + parser = argparse.ArgumentParser( + description="Run MLPerf inference benchmarks for gpt-oss" + ) + + # Scenario selection + parser.add_argument( + "--mode", + type=str, + default="offline", + choices=["offline", "server"], + help="MLPerf scenario mode" + ) + + # Dataset + parser.add_argument( + "--input-file", + type=str, + required=True, + help="Path to tokenized dataset (pickle file)" + ) + + parser.add_argument( + "--max-samples", + type=int, + default=None, + help="Maximum number of samples to use (None for all)" + ) + + # MLPerf configuration + parser.add_argument( + "--mlperf-conf", + type=str, + default="/home/scratch.shobhitv_coreai/mlcinf-repos/gpt-oss-perf/loadgen/mlperf.conf", + help="Path to MLPerf configuration file" + ) + + parser.add_argument( + 
"--user-conf", + type=str, + default="mlperf/user.conf", + help="Path to user configuration file" + ) + + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy mode instead of performance" + ) + + # Output configuration + parser.add_argument( + "--output-dir", + type=str, + default="mlperf_results", + help="Directory for MLPerf output logs" + ) + + # Backend configuration + parser.add_argument( + "--backend", + type=str, + default="sglang", + choices=["sglang"], + help="Backend to use for inference" + ) + + parser.add_argument( + "--server-url", + type=str, + default="http://localhost:30000", + help="Server URL for backend (SGLang)" + ) + + # Generation parameters + parser.add_argument( + "--max-tokens", + type=int, + default=32768, + help="Maximum tokens to generate" + ) + + parser.add_argument( + "--temperature", + type=float, + default=0.001, + help="Sampling temperature" + ) + + parser.add_argument( + "--top-k", + type=int, + default=1, + help="Top-k sampling parameter" + ) + + parser.add_argument( + "--top-p", + type=float, + default=1.0, + help="Top-p sampling parameter" + ) + + # Server scenario specific + parser.add_argument( + "--num-workers", + type=int, + default=1, + help="Number of worker threads (for server scenario)" + ) + + # Concurrency control + parser.add_argument( + "--max-concurrency", + type=int, + default=128, + help="Maximum concurrent requests to backend (SGLang handles batching internally)" + ) + + return parser + + +def configure_loadgen( + scenario: str, + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "gpt-oss" +) -> lg.TestSettings: + """Configure LoadGen test settings. + + Args: + scenario: MLPerf scenario ("offline" or "server") + accuracy_mode: Whether to run in accuracy mode + mlperf_conf: Path to MLPerf config file + user_conf: Path to user config file + log_dir: Directory for logs + model_name: Model name for configuration + + Returns: + LoadGen TestSettings + """ + settings = lg.TestSettings() + + # Set scenario + if scenario.lower() == "offline": + settings.scenario = lg.TestScenario.Offline + elif scenario.lower() == "server": + settings.scenario = lg.TestScenario.Server + else: + raise ValueError(f"Unknown scenario: {scenario}") + + # Set mode + if accuracy_mode: + settings.mode = lg.TestMode.AccuracyOnly + else: + settings.mode = lg.TestMode.PerformanceOnly + + # Load configurations if files exist + if mlperf_conf and Path(mlperf_conf).exists(): + logger.info(f"Loading MLPerf config from {mlperf_conf}") + settings.FromConfig(mlperf_conf, model_name, scenario.capitalize()) + else: + logger.warning(f"MLPerf config not found: {mlperf_conf}") + + if user_conf and Path(user_conf).exists(): + logger.info(f"Loading user config from {user_conf}") + settings.FromConfig(user_conf, model_name, scenario.capitalize()) + else: + logger.warning(f"User config not found: {user_conf}") + + return settings + + +def main(): + """Main function.""" + parser = create_argument_parser() + args = parser.parse_args() + + # Track resources for cleanup + sut = None + qsl = None + backend = None + pbar = None + cleanup_done = False + + def do_cleanup(): + """Perform cleanup once and only once.""" + nonlocal cleanup_done, pbar, sut, qsl, backend + + if cleanup_done: + return + cleanup_done = True + + logger.info("Performing cleanup...") + + # 1. 
Close progress bar first (before any LoadGen cleanup) + try: + if pbar is not None: + pbar.close() + pbar = None + logger.debug(" ✓ Progress bar closed") + except Exception as e: + logger.debug(f" ! Error closing progress bar: {e}") + + # Small delay to let LoadGen internal threads finish + import time + time.sleep(0.5) + + # 2. Stop SUT (this will stop worker threads and flush) + try: + if sut is not None: + logger.info(" - Stopping SUT and worker threads...") + sut.stop() + sut = None + logger.info(" ✓ SUT stopped") + except Exception as e: + logger.warning(f" ! Error stopping SUT: {e}") + + # 3. Destroy QSL + try: + if qsl is not None and qsl.qsl is not None: + logger.info(" - Destroying Query Sample Library...") + lg.DestroyQSL(qsl.qsl) + qsl.qsl = None + logger.info(" ✓ QSL destroyed") + except Exception as e: + logger.warning(f" ! Error destroying QSL: {e}") + + # 4. Cleanup backend last + try: + if backend is not None and backend.initialized: + logger.info(" - Cleaning up backend connection...") + backend.cleanup() + backend = None + logger.info(" ✓ Backend cleaned up") + except Exception as e: + logger.warning(f" ! Error cleaning up backend: {e}") + + try: + # Create output directories + output_dir = Path(args.output_dir) + log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir.mkdir(parents=True, exist_ok=True) + + logger.info("=" * 80) + logger.info("MLPerf Inference Benchmark Runner for GPT-OSS") + logger.info("=" * 80) + logger.info(f"Backend: {args.backend}") + logger.info(f"Mode: {args.mode}") + logger.info(f"Accuracy: {args.accuracy}") + logger.info(f"Input file: {args.input_file}") + logger.info(f"Output directory: {log_dir}") + logger.info("=" * 80) + + # Load dataset + logger.info("Loading tokenized dataset...") + with tqdm(total=1, desc="Loading dataset", unit="file") as pbar: + dataset_info = load_tokenized_dataset( + args.input_file, + max_samples=args.max_samples + ) + prompts = dataset_info["prompts"] + df = dataset_info["dataframe"] + pbar.update(1) + + logger.info(f"Loaded {len(prompts)} prompts from dataset") + + # Initialize backend + logger.info(f"Initializing {args.backend} backend...") + if args.backend == "sglang": + backend = SGLangBackend( + server_url=args.server_url, + timeout=1200 + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + # Initialize backend + backend.initialize() + + # Create progress bar for real-time updates + pbar = tqdm( + total=len(prompts), + desc=f"MLPerf {args.mode}", + unit="query", + leave=True, + position=0, + mininterval=0.1, # Update display every 0.1s minimum + smoothing=0.1 # Smooth display updates + ) + + # Create SUT with progress bar + logger.info(f"Creating {args.mode} SUT...") + if args.mode == "offline": + sut = OfflineSUT( + backend=backend, + dataset=prompts, + max_tokens=args.max_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + name=f"gpt-oss_offline_sut", + progress_bar=pbar, + max_concurrency=args.max_concurrency + ) + else: # server + sut = ServerSUT( + backend=backend, + dataset=prompts, + max_tokens=args.max_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_workers=args.num_workers, + name=f"gpt-oss_server_sut", + progress_bar=pbar + ) + + # Create QSL + logger.info("Creating Query Sample Library...") + qsl = QuerySampleLibrary(prompts) + qsl.qsl = lg.ConstructQSL( + len(prompts), + len(prompts), + qsl.load_query_samples, + qsl.unload_query_samples + ) + + # Configure LoadGen + settings = 
configure_loadgen( + scenario=args.mode, + accuracy_mode=args.accuracy, + mlperf_conf=args.mlperf_conf, + user_conf=args.user_conf, + log_dir=str(log_dir) + ) + + # Configure logging + log_settings = lg.LogSettings() + log_settings.log_output.outdir = str(log_dir) + log_settings.log_output.copy_summary_to_stdout = True + log_settings.enable_trace = False + + # Start the SUT + logger.info("Starting SUT...") + sut.start() + + # Run test + logger.info("Running LoadGen test...") + lg.StartTestWithLogSettings( + sut.sut, + qsl.qsl, + settings, + log_settings + ) + logger.info("LoadGen test completed successfully") + + # Give LoadGen a moment to finish internal cleanup + import time + time.sleep(0.2) + + # Flush queries + logger.info("Flushing queries...") + with tqdm(total=1, desc="Flushing queries", unit="batch") as pbar: + sut.flush_queries() + pbar.update(1) + + # Get results + logger.info("Retrieving results...") + with tqdm(total=1, desc="Getting results", unit="batch") as pbar: + results = sut.get_results() + pbar.update(1) + logger.info(f"Retrieved {len(results)} results from SUT") + + logger.info(f"MLPerf results saved to: {log_dir}") + + # If in accuracy mode, prompt user to run evaluation + if args.accuracy: + logger.info("=" * 80) + logger.info("Accuracy mode completed!") + logger.info("To evaluate accuracy, run:") + logger.info(f" python eval_accuracy.py --input-file {log_dir}/mlperf_log_accuracy.json") + logger.info("=" * 80) + + except KeyboardInterrupt: + logger.info("\n" + "=" * 80) + logger.info("⚠️ Test interrupted by user (Ctrl+C)") + logger.info("=" * 80) + do_cleanup() + logger.info("=" * 80) + logger.info("✓ Cleanup completed successfully") + logger.info("=" * 80) + # Exit immediately to prevent finally block from running + os._exit(130) # Use os._exit to skip finally block + + except Exception as e: + logger.error("\n" + "=" * 80) + logger.error(f"❌ Error during test: {e}") + logger.error("=" * 80) + logger.error("Stack trace:", exc_info=True) + do_cleanup() + logger.error("=" * 80) + # Exit immediately to prevent finally block from running + os._exit(1) + + finally: + # Only run cleanup if not already done (normal exit path) + if not cleanup_done: + do_cleanup() + logger.info("=" * 80) + logger.info("✓ Cleanup completed successfully") + logger.info("=" * 80) + + +if __name__ == "__main__": + main() + From 199f476e4ba3e4b7da0351a35c15f18e33d94215 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 21:51:45 +0000 Subject: [PATCH 113/165] add mlperf artifacts --- language/gpt-oss/mlperf/__init__.py | 15 ++ language/gpt-oss/mlperf/base_sut.py | 91 +++++++ language/gpt-oss/mlperf/offline_sut.py | 173 ++++++++++++ language/gpt-oss/mlperf/qsl.py | 66 +++++ language/gpt-oss/mlperf/server_sut.py | 349 +++++++++++++++++++++++++ language/gpt-oss/mlperf/user.conf | 22 ++ 6 files changed, 716 insertions(+) create mode 100644 language/gpt-oss/mlperf/__init__.py create mode 100644 language/gpt-oss/mlperf/base_sut.py create mode 100644 language/gpt-oss/mlperf/offline_sut.py create mode 100644 language/gpt-oss/mlperf/qsl.py create mode 100644 language/gpt-oss/mlperf/server_sut.py create mode 100644 language/gpt-oss/mlperf/user.conf diff --git a/language/gpt-oss/mlperf/__init__.py b/language/gpt-oss/mlperf/__init__.py new file mode 100644 index 0000000000..7426019422 --- /dev/null +++ b/language/gpt-oss/mlperf/__init__.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +"""MLPerf inference integration for gpt-oss.""" + +from .base_sut import BaseSUT +from .offline_sut import 
OfflineSUT +from .server_sut import ServerSUT +from .qsl import QuerySampleLibrary + +__all__ = [ + "BaseSUT", + "OfflineSUT", + "ServerSUT", + "QuerySampleLibrary", +] + diff --git a/language/gpt-oss/mlperf/base_sut.py b/language/gpt-oss/mlperf/base_sut.py new file mode 100644 index 0000000000..e1adf64720 --- /dev/null +++ b/language/gpt-oss/mlperf/base_sut.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Base System Under Test (SUT) class for MLPerf inference benchmarks.""" + +import abc +import logging +from typing import List, Dict, Any, Optional +import mlperf_loadgen as lg + +logger = logging.getLogger(__name__) + + +class BaseSUT(abc.ABC): + """Base class for MLPerf inference System Under Test (SUT). + + This class defines the interface that all SUTs must implement for MLPerf + inference benchmarks. It provides two main methods: + - issue_queries: to enqueue prompt tokens + - flush_queries: to await completion of all issued queries + """ + + def __init__(self, backend, dataset: List[List[int]], name: str = "BaseSUT", progress_bar=None): + """Initialize the base SUT. + + Args: + backend: Backend instance for inference + dataset: List of tokenized prompts + name: Name of the SUT for logging purposes + progress_bar: Optional tqdm progress bar for real-time updates + """ + self.backend = backend + self.dataset = dataset + self.name = name + self.sut = None + self.results = {} + self.progress_bar = progress_bar + logger.info(f"Initializing {self.name}") + + @abc.abstractmethod + def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: + """Issue queries to the SUT. + + This method should enqueue the provided query samples for processing. + It should return immediately without waiting for completion. + + Args: + query_samples: List of MLPerf LoadGen query samples to process + """ + raise NotImplementedError("Subclasses must implement issue_queries") + + @abc.abstractmethod + def flush_queries(self) -> None: + """Flush all pending queries. + + This method should wait for all previously issued queries to complete + before returning. It's called by LoadGen to ensure all work is done. + """ + raise NotImplementedError("Subclasses must implement flush_queries") + + def start(self) -> lg.ConstructSUT: + """Start the SUT and return the LoadGen SUT handle. + + Returns: + LoadGen SUT handle for use with LoadGen + """ + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + logger.info(f"{self.name} started") + return self.sut + + def stop(self) -> None: + """Stop the SUT and clean up resources.""" + if self.sut: + lg.DestroySUT(self.sut) + self.sut = None + logger.info(f"{self.name} stopped") + + def get_results(self) -> Dict[int, Any]: + """Get all results from completed queries. 
+ + Returns: + Dictionary mapping query IDs to results + """ + return self.results + + def __enter__(self): + """Context manager entry.""" + return self.start() + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.stop() + diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py new file mode 100644 index 0000000000..dc50ab64fa --- /dev/null +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Offline scenario SUT implementation for gpt-oss.""" + +import logging +import numpy as np +import time +from typing import List, Dict, Any +import mlperf_loadgen as lg +from tqdm import tqdm +from .base_sut import BaseSUT + +logger = logging.getLogger(__name__) + + +class OfflineSUT(BaseSUT): + """Offline scenario System Under Test. + + In the Offline scenario, all queries are issued at once and can be + processed in any order. This allows for maximum batching and throughput. + """ + + def __init__( + self, + backend, + dataset: List[List[int]], + max_tokens: int = 32768, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + name: str = "OfflineSUT", + progress_bar=None, + max_concurrency: int = 128 + ): + """Initialize the Offline SUT. + + Args: + backend: Backend instance for inference + dataset: List of tokenized prompts + max_tokens: Maximum tokens to generate + temperature: Sampling temperature + top_k: Top-k sampling parameter + top_p: Top-p sampling parameter + name: Name of the SUT + progress_bar: Optional tqdm progress bar for real-time updates + max_concurrency: Maximum concurrent requests to backend (SGLang does in-flight batching) + """ + super().__init__(backend, dataset, name, progress_bar) + self.max_tokens = max_tokens + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + self.pending_queries = [] + self.max_concurrency = max_concurrency + + logger.info(f"OfflineSUT configured with max_concurrency={max_concurrency} (backend handles batching)") + + def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: + """Issue queries to the SUT. + + In Offline mode, we accumulate all queries and process them in batch. + + Args: + query_samples: List of MLPerf LoadGen query samples + """ + logger.info(f"Received {len(query_samples)} queries") + + # Store queries for batch processing + for qs in query_samples: + self.pending_queries.append(qs) + + def flush_queries(self) -> None: + """Process all accumulated queries with concurrent requests. + + Sends individual requests concurrently up to max_concurrency limit. + SGLang handles batching internally via continuous batching. 
+ """ + if not self.pending_queries: + logger.info("No pending queries to flush") + return + + logger.info(f"Flushing {len(self.pending_queries)} queries with max_concurrency={self.max_concurrency}") + start_time = time.time() + + from concurrent.futures import ThreadPoolExecutor, as_completed + + def process_single_query(query_sample): + """Process a single query (backend batches automatically via continuous batching).""" + query_id = query_sample.id + input_ids = self.dataset[query_sample.index] + + # Call backend with single query + # SGLang will batch this with other concurrent requests automatically + responses = self.backend.generate( + prompts=[input_ids], # Single query as list + max_tokens=self.max_tokens, + temperature=self.temperature, + top_k=self.top_k, + top_p=self.top_p + ) + + return query_id, query_sample, responses[0] + + try: + # Process queries in parallel with max_concurrency + logger.info(f"Submitting {len(self.pending_queries)} queries to {self.max_concurrency} concurrent workers...") + with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: + # Submit all queries at once + futures = [executor.submit(process_single_query, qs) for qs in self.pending_queries] + + # Process results as they complete + completed_count = 0 + for future in as_completed(futures): + try: + query_id, query_sample, response = future.result() + output_ids = response.get("output_ids", []) + + # Store results + self.results[query_id] = { + "output_ids": output_ids, + "output_text": response.get("output_text", ""), + "metadata": response.get("metadata", {}) + } + + # Convert output_ids to numpy array for LoadGen + # LoadGen expects int32 token IDs as a contiguous array + if output_ids: + token_array = np.ascontiguousarray(output_ids, dtype=np.int32) + output_data_ptr = token_array.ctypes.data + output_data_size = token_array.nbytes + else: + # Empty response + token_array = np.array([], dtype=np.int32) + output_data_ptr = 0 + output_data_size = 0 + + # Create response for LoadGen + response_array = [ + lg.QuerySampleResponse( + query_id, + output_data_ptr, + output_data_size + ) + ] + + # Report completion to LoadGen + lg.QuerySamplesComplete(response_array) + + # Update progress bar + if self.progress_bar is not None: + self.progress_bar.update(1) + self.progress_bar.refresh() + + completed_count += 1 + # Log progress at debug level only (tqdm shows progress) + if completed_count % 100 == 0: + logger.debug(f"Completed {completed_count}/{len(self.pending_queries)} queries") + + except Exception as e: + logger.error(f"Error processing query: {e}", exc_info=True) + + elapsed = time.time() - start_time + logger.info( + f"Completed {len(self.pending_queries)} queries in {elapsed:.2f}s " + f"({len(self.pending_queries)/elapsed:.2f} QPS)" + ) + + except Exception as e: + logger.error(f"Error during concurrent flush: {e}", exc_info=True) + raise + finally: + # Clear pending queries + self.pending_queries = [] + diff --git a/language/gpt-oss/mlperf/qsl.py b/language/gpt-oss/mlperf/qsl.py new file mode 100644 index 0000000000..8a6395f9cb --- /dev/null +++ b/language/gpt-oss/mlperf/qsl.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +"""Query Sample Library for gpt-oss MLPerf integration.""" + +import logging +from typing import List +import mlperf_loadgen as lg + +logger = logging.getLogger(__name__) + + +class QuerySampleLibrary: + """Query Sample Library implementation. + + This class manages the dataset of samples that LoadGen will query. 
+ """ + + def __init__(self, dataset: List[List[int]]): + """Initialize the Query Sample Library. + + Args: + dataset: List of tokenized prompts (list of token ID lists) + """ + self.dataset = dataset + self.qsl = None + logger.info(f"Initializing QSL with {len(dataset)} samples") + + def load_query_samples(self, sample_indices: List[int]) -> None: + """Load specified query samples into memory. + + Args: + sample_indices: List of sample indices to load + """ + # For this implementation, all samples are already in memory + logger.info(f"Loading {len(sample_indices)} query samples") + + def unload_query_samples(self, sample_indices: List[int]) -> None: + """Unload specified query samples from memory. + + Args: + sample_indices: List of sample indices to unload + """ + # For this implementation, we keep all samples in memory + logger.info(f"Unloading {len(sample_indices)} query samples") + + def __len__(self) -> int: + """Return the number of samples in the dataset.""" + return len(self.dataset) + + def __enter__(self): + """Context manager entry.""" + self.qsl = lg.ConstructQSL( + len(self.dataset), + len(self.dataset), # performance sample count + self.load_query_samples, + self.unload_query_samples + ) + logger.info("QSL constructed") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + if self.qsl: + lg.DestroyQSL(self.qsl) + self.qsl = None + logger.info("QSL destroyed") + diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py new file mode 100644 index 0000000000..53e72319f6 --- /dev/null +++ b/language/gpt-oss/mlperf/server_sut.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python3 +"""Server scenario SUT implementation with streaming support for gpt-oss.""" + +import asyncio +import logging +import numpy as np +import queue +import threading +import time +from dataclasses import dataclass +from typing import List, Dict, Any, Optional +import mlperf_loadgen as lg + +from .base_sut import BaseSUT + +logger = logging.getLogger(__name__) + + +@dataclass +class StreamingQueryState: + """State for a streaming query.""" + query_sample: lg.QuerySample + query_id: int + input_ids: List[int] + accumulated_tokens: List[int] + accumulated_text: str + first_token_received: bool + first_token_time: Optional[float] + start_time: float + finished: bool + + +class ServerSUT(BaseSUT): + """Server scenario SUT with streaming support. + + Properly reports FirstTokenComplete and QuerySamplesComplete to LoadGen. + """ + + def __init__( + self, + backend, + dataset: List[List[int]], + max_tokens: int = 32768, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + num_workers: int = 1, + name: str = "ServerSUT", + progress_bar=None + ): + """Initialize the Server SUT. 
+ + Args: + backend: Backend instance for inference (must support streaming) + dataset: List of tokenized prompts + max_tokens: Maximum tokens to generate + temperature: Sampling temperature + top_k: Top-k sampling parameter + top_p: Top-p sampling parameter + num_workers: Number of worker threads + name: Name of the SUT + progress_bar: Optional tqdm progress bar for real-time updates + """ + super().__init__(backend, dataset, name, progress_bar) + self.max_tokens = max_tokens + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + self.num_workers = num_workers + + # Query queue and streaming state + self.query_queue = queue.Queue() + self.active_streams: Dict[int, StreamingQueryState] = {} + self.active_streams_lock = threading.Lock() + + # Worker threads + self.workers = [] + self.should_stop = threading.Event() + + # Progress tracking + self.queries_completed = 0 + self.progress_lock = threading.Lock() + + # Event loop for async streaming + self.loop = None + self.loop_thread = None + + logger.info(f"ServerSUT configured with num_workers={num_workers} (streaming enabled)") + + def start(self) -> lg.ConstructSUT: + """Start the SUT and worker threads.""" + # Start event loop thread for async streaming + self._start_event_loop() + + # Start worker threads + self._start_workers() + + # Create LoadGen SUT + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.name) + logger.info(f"{self.name} started with streaming support") + return self.sut + + def _start_event_loop(self): + """Start the asyncio event loop in a separate thread.""" + def run_loop(): + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + self.loop.run_forever() + + self.loop_thread = threading.Thread(target=run_loop, daemon=True) + self.loop_thread.start() + + # Wait for loop to be ready + while self.loop is None: + time.sleep(0.001) + + logger.info("Async event loop started") + + def _start_workers(self): + """Start worker threads for processing queries.""" + for i in range(self.num_workers): + worker = threading.Thread( + target=self._worker_thread, + name=f"ServerWorker-{i}", + daemon=True + ) + self.workers.append(worker) + worker.start() + logger.info(f"Started {self.num_workers} worker threads") + + def _worker_thread(self): + """Worker thread that processes queries from the queue.""" + try: + while not self.should_stop.is_set(): + try: + query_sample = self.query_queue.get(timeout=0.1) + except queue.Empty: + continue + except KeyboardInterrupt: + logger.info("Worker thread interrupted, exiting gracefully...") + break + + # Schedule async streaming processing + if self.loop and not self.should_stop.is_set(): + future = asyncio.run_coroutine_threadsafe( + self._process_streaming_query(query_sample), + self.loop + ) + # Don't wait for completion - it happens asynchronously + + except Exception as e: + logger.error(f"Worker thread error: {e}", exc_info=True) + + async def _process_streaming_query(self, query_sample: lg.QuerySample): + """Process a single query with streaming support. 
+ + Args: + query_sample: MLPerf LoadGen query sample + """ + query_id = query_sample.id + sample_idx = query_sample.index + input_ids = self.dataset[sample_idx] + + # Initialize streaming state + state = StreamingQueryState( + query_sample=query_sample, + query_id=query_id, + input_ids=input_ids, + accumulated_tokens=[], + accumulated_text="", + first_token_received=False, + first_token_time=None, + start_time=time.time(), + finished=False + ) + + with self.active_streams_lock: + self.active_streams[query_id] = state + + try: + # Stream tokens from backend + async for chunk in self.backend.generate_stream( + input_ids=input_ids, + max_tokens=self.max_tokens, + temperature=self.temperature, + top_k=self.top_k, + top_p=self.top_p + ): + # Update state + if chunk.get("delta_token_ids"): + state.accumulated_tokens.extend(chunk["delta_token_ids"]) + if chunk.get("delta_text"): + state.accumulated_text += chunk["delta_text"] + + # Send FirstTokenComplete on first token + if chunk.get("is_first_token") and not state.first_token_received: + state.first_token_received = True + state.first_token_time = time.time() + await self._send_first_token_complete(state) + + # Check if finished + if chunk.get("is_finished"): + state.finished = True + await self._send_final_response(state) + break + + # If no explicit finish signal, send final response + if not state.finished: + state.finished = True + await self._send_final_response(state) + + except Exception as e: + logger.error(f"Error processing streaming query {query_id}: {e}", exc_info=True) + # Send empty response to unblock LoadGen + try: + await self._send_final_response(state) + except: + pass + finally: + # Clean up + with self.active_streams_lock: + self.active_streams.pop(query_id, None) + + async def _send_first_token_complete(self, state: StreamingQueryState): + """Send FirstTokenComplete to LoadGen for TTFT measurement.""" + try: + logger.debug(f"First token for query {state.query_id} at {state.first_token_time - state.start_time:.3f}s") + + # Convert tokens to numpy array + if state.accumulated_tokens: + token_array = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + else: + # Need at least an empty array + token_array = np.array([], dtype=np.int32) + + # Create response + response = lg.QuerySampleResponse( + state.query_id, + token_array.ctypes.data if token_array.size > 0 else 0, + token_array.nbytes, + len(token_array) + ) + + # Report to LoadGen + lg.FirstTokenComplete([response]) + + except Exception as e: + logger.error(f"Error sending FirstTokenComplete for query {state.query_id}: {e}", exc_info=True) + + async def _send_final_response(self, state: StreamingQueryState): + """Send final QuerySamplesComplete to LoadGen.""" + try: + logger.debug(f"Final response for query {state.query_id}, {len(state.accumulated_tokens)} tokens") + + # Store results + self.results[state.query_id] = { + "output_ids": state.accumulated_tokens, + "output_text": state.accumulated_text, + "metadata": { + "latency": time.time() - state.start_time, + "ttft": state.first_token_time - state.start_time if state.first_token_time else None, + } + } + + # Convert tokens to numpy array + if state.accumulated_tokens: + token_array = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + else: + token_array = np.array([], dtype=np.int32) + + # Create response + response = lg.QuerySampleResponse( + state.query_id, + token_array.ctypes.data if token_array.size > 0 else 0, + token_array.nbytes, + len(token_array) + ) + + # Report to LoadGen + 
lg.QuerySamplesComplete([response]) + + # Update progress bar (force refresh for async updates) + if self.progress_bar is not None: + with self.progress_lock: + self.queries_completed += 1 + self.progress_bar.update(1) + self.progress_bar.refresh() # Force redraw from async context + + except Exception as e: + logger.error(f"Error sending final response for query {state.query_id}: {e}", exc_info=True) + + def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: + """Issue queries to the SUT. + + In Server mode, queries are added to a queue for worker threads. + + Args: + query_samples: List of MLPerf LoadGen query samples + """ + for qs in query_samples: + self.query_queue.put(qs) + + def flush_queries(self) -> None: + """Flush all pending queries. + + Wait for all issued queries to complete. + """ + logger.info("Flushing server queries...") + + # Wait for queue to empty and all streams to complete + while True: + queue_empty = self.query_queue.empty() + + with self.active_streams_lock: + no_active_streams = len(self.active_streams) == 0 + + if queue_empty and no_active_streams: + break + + time.sleep(0.01) + + logger.info("Server queries flushed") + + def stop(self) -> None: + """Stop the SUT and clean up resources.""" + if self.should_stop.is_set(): + logger.info(f"{self.name} already stopping or stopped.") + return + + logger.info(f"Stopping {self.name}...") + self.should_stop.set() + + # Wait for workers + for i, worker in enumerate(self.workers): + logger.info(f"Waiting for worker {i+1}/{len(self.workers)}...") + worker.join(timeout=5) + if worker.is_alive(): + logger.warning(f"Worker {i+1} did not terminate gracefully") + + # Stop event loop + if self.loop: + self.loop.call_soon_threadsafe(self.loop.stop) + if self.loop_thread: + self.loop_thread.join(timeout=2) + + logger.info("All workers stopped") + + # Destroy LoadGen SUT + super().stop() diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf new file mode 100644 index 0000000000..9dc487f39f --- /dev/null +++ b/language/gpt-oss/mlperf/user.conf @@ -0,0 +1,22 @@ +# User configuration for gpt-oss MLPerf inference +# Override default settings from mlperf.conf here + +# Offline scenario +gpt-oss.Offline.target_qps = 1.0 +gpt-oss.Offline.min_duration = 60000 +gpt-oss.Offline.min_query_count = 100 + +# Server scenario +gpt-oss.Server.target_qps = 1.0 +gpt-oss.Server.min_duration = 60000 +gpt-oss.Server.target_latency = 0 +gpt-oss.Server.ttft_latency = 2000 +gpt-oss.Server.tpot_latency = 20 + +# SingleStream scenario (if needed) +gpt-oss.SingleStream.target_latency = 1000 +gpt-oss.SingleStream.min_duration = 60000 + +# Performance sample count +# gpt-oss.*.performance_sample_count_override = 8036 + From 425ce75551be9e339cd96b666cbbbba277b08777 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 20 Nov 2025 21:52:14 +0000 Subject: [PATCH 114/165] [Automated Commit] Format Codebase --- language/gpt-oss/backends/__init__.py | 1 - language/gpt-oss/backends/base_backend.py | 9 +- language/gpt-oss/backends/sglang_backend.py | 64 +++++----- language/gpt-oss/mlperf/__init__.py | 1 - language/gpt-oss/mlperf/base_sut.py | 18 +-- language/gpt-oss/mlperf/offline_sut.py | 66 +++++----- language/gpt-oss/mlperf/qsl.py | 9 +- language/gpt-oss/mlperf/server_sut.py | 130 +++++++++++--------- language/gpt-oss/run_mlperf.py | 45 +++---- 9 files changed, 185 insertions(+), 158 deletions(-) diff --git a/language/gpt-oss/backends/__init__.py b/language/gpt-oss/backends/__init__.py index 
70eab9bf8c..3f68dc171c 100644 --- a/language/gpt-oss/backends/__init__.py +++ b/language/gpt-oss/backends/__init__.py @@ -8,4 +8,3 @@ "BaseBackend", "SGLangBackend", ] - diff --git a/language/gpt-oss/backends/base_backend.py b/language/gpt-oss/backends/base_backend.py index 0f65434b58..228de1ced8 100644 --- a/language/gpt-oss/backends/base_backend.py +++ b/language/gpt-oss/backends/base_backend.py @@ -10,13 +10,13 @@ class BaseBackend(abc.ABC): """Abstract base class for inference backends. - + All backends must implement this interface to work with the MLPerf SUT. """ def __init__(self, config: Optional[Dict[str, Any]] = None): """Initialize the backend. - + Args: config: Optional configuration dictionary """ @@ -40,7 +40,7 @@ def generate( **kwargs ) -> List[Dict[str, Any]]: """Generate responses for a batch of prompts. - + Args: prompts: List of token ID sequences max_tokens: Maximum tokens to generate per prompt @@ -48,7 +48,7 @@ def generate( top_k: Top-k sampling parameter top_p: Top-p (nucleus) sampling parameter **kwargs: Additional backend-specific parameters - + Returns: List of response dictionaries with keys: - output_ids: List of generated token IDs @@ -75,4 +75,3 @@ def __exit__(self, exc_type, exc_val, exc_tb): def is_initialized(self) -> bool: """Check if backend is initialized.""" return self.initialized - diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py index 05c9600478..0d932d35bf 100644 --- a/language/gpt-oss/backends/sglang_backend.py +++ b/language/gpt-oss/backends/sglang_backend.py @@ -15,7 +15,7 @@ class SGLangBackend(BaseBackend): """SGLang inference backend using HTTP API. - + Connects to an SGLang server running the gpt-oss model. """ @@ -26,7 +26,7 @@ def __init__( **kwargs ): """Initialize SGLang backend. - + Args: server_url: URL of the SGLang server timeout: Request timeout in seconds @@ -79,14 +79,14 @@ def _send_request( top_p: float ) -> Dict[str, Any]: """Send a single request to the SGLang server. - + Args: input_ids: Token IDs for the prompt max_tokens: Maximum tokens to generate temperature: Sampling temperature top_k: Top-k parameter top_p: Top-p parameter - + Returns: Response dictionary from the server """ @@ -127,7 +127,7 @@ def generate( **kwargs ) -> List[Dict[str, Any]]: """Generate responses for a batch of prompts. - + Args: prompts: List of token ID sequences max_tokens: Maximum tokens to generate per prompt @@ -135,7 +135,7 @@ def generate( top_k: Top-k sampling parameter top_p: Top-p (nucleus) sampling parameter **kwargs: Additional parameters (ignored) - + Returns: List of response dictionaries with keys: - output_ids: List of generated token IDs @@ -143,7 +143,8 @@ def generate( - metadata: Additional metadata (latencies, etc.) """ if not self.initialized: - raise RuntimeError("Backend not initialized. Call initialize() first.") + raise RuntimeError( + "Backend not initialized. Call initialize() first.") results = [] for prompt_ids in prompts: @@ -190,16 +191,16 @@ async def generate_stream( **kwargs ) -> AsyncIterator[Dict[str, Any]]: """Generate response with streaming support. - + Yields incremental responses as tokens are generated. - + Args: input_ids: Token IDs for the prompt max_tokens: Maximum tokens to generate temperature: Sampling temperature top_k: Top-k parameter top_p: Top-p parameter - + Yields: Dict with: - delta_token_ids: List of new token IDs in this chunk @@ -210,8 +211,9 @@ async def generate_stream( - metadata: Additional info (TTFT, etc.) 
""" if not self.initialized: - raise RuntimeError("Backend not initialized. Call initialize() first.") - + raise RuntimeError( + "Backend not initialized. Call initialize() first.") + payload = { "input_ids": input_ids, "sampling_params": { @@ -222,13 +224,13 @@ async def generate_stream( }, "stream": True # Enable streaming } - + start_time = time.time() first_token_time = None accumulated_token_ids = [] accumulated_text = "" is_first = True - + try: async with aiohttp.ClientSession() as session: async with session.post( @@ -238,7 +240,8 @@ async def generate_stream( ) as response: if response.status != 200: error_text = await response.text() - logger.error(f"Streaming request failed: {response.status} - {error_text}") + logger.error( + f"Streaming request failed: {response.status} - {error_text}") yield { "delta_token_ids": [], "delta_text": "", @@ -249,40 +252,41 @@ async def generate_stream( "metadata": {} } return - + # Read streaming response async for line in response.content: if not line: continue - + # SGLang sends data as "data: {...}\n\n" line_str = line.decode('utf-8').strip() if not line_str.startswith('data:'): continue - + try: - json_str = line_str[5:].strip() # Remove "data:" prefix + # Remove "data:" prefix + json_str = line_str[5:].strip() if json_str == '[DONE]': break - + chunk = json.loads(json_str) - + # Extract token information from chunk delta_token_ids = chunk.get("token_ids", []) delta_text = chunk.get("text", "") - + if delta_token_ids: accumulated_token_ids.extend(delta_token_ids) if delta_text: accumulated_text += delta_text - + # Mark first token timing if is_first and delta_token_ids: first_token_time = time.time() is_first = False - + is_finished = chunk.get("finished", False) - + yield { "delta_token_ids": delta_token_ids, "delta_text": delta_text, @@ -296,14 +300,15 @@ async def generate_stream( **chunk.get("meta_info", {}) } } - + if is_finished: break - + except json.JSONDecodeError as e: - logger.warning(f"Failed to parse streaming chunk: {e}") + logger.warning( + f"Failed to parse streaming chunk: {e}") continue - + except asyncio.TimeoutError: logger.error(f"Streaming request timed out after {self.timeout}s") yield { @@ -334,4 +339,3 @@ def cleanup(self) -> None: self.session = None self.initialized = False logger.info("SGLang backend cleaned up") - diff --git a/language/gpt-oss/mlperf/__init__.py b/language/gpt-oss/mlperf/__init__.py index 7426019422..c5aaa0d243 100644 --- a/language/gpt-oss/mlperf/__init__.py +++ b/language/gpt-oss/mlperf/__init__.py @@ -12,4 +12,3 @@ "ServerSUT", "QuerySampleLibrary", ] - diff --git a/language/gpt-oss/mlperf/base_sut.py b/language/gpt-oss/mlperf/base_sut.py index e1adf64720..cb87f03c1f 100644 --- a/language/gpt-oss/mlperf/base_sut.py +++ b/language/gpt-oss/mlperf/base_sut.py @@ -11,16 +11,17 @@ class BaseSUT(abc.ABC): """Base class for MLPerf inference System Under Test (SUT). - + This class defines the interface that all SUTs must implement for MLPerf inference benchmarks. It provides two main methods: - issue_queries: to enqueue prompt tokens - flush_queries: to await completion of all issued queries """ - def __init__(self, backend, dataset: List[List[int]], name: str = "BaseSUT", progress_bar=None): + def __init__( + self, backend, dataset: List[List[int]], name: str = "BaseSUT", progress_bar=None): """Initialize the base SUT. 
- + Args: backend: Backend instance for inference dataset: List of tokenized prompts @@ -38,10 +39,10 @@ def __init__(self, backend, dataset: List[List[int]], name: str = "BaseSUT", pro @abc.abstractmethod def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + This method should enqueue the provided query samples for processing. It should return immediately without waiting for completion. - + Args: query_samples: List of MLPerf LoadGen query samples to process """ @@ -50,7 +51,7 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: @abc.abstractmethod def flush_queries(self) -> None: """Flush all pending queries. - + This method should wait for all previously issued queries to complete before returning. It's called by LoadGen to ensure all work is done. """ @@ -58,7 +59,7 @@ def flush_queries(self) -> None: def start(self) -> lg.ConstructSUT: """Start the SUT and return the LoadGen SUT handle. - + Returns: LoadGen SUT handle for use with LoadGen """ @@ -75,7 +76,7 @@ def stop(self) -> None: def get_results(self) -> Dict[int, Any]: """Get all results from completed queries. - + Returns: Dictionary mapping query IDs to results """ @@ -88,4 +89,3 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" self.stop() - diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index dc50ab64fa..986f5d9287 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -14,7 +14,7 @@ class OfflineSUT(BaseSUT): """Offline scenario System Under Test. - + In the Offline scenario, all queries are issued at once and can be processed in any order. This allows for maximum batching and throughput. """ @@ -32,7 +32,7 @@ def __init__( max_concurrency: int = 128 ): """Initialize the Offline SUT. - + Args: backend: Backend instance for inference dataset: List of tokenized prompts @@ -51,26 +51,27 @@ def __init__( self.top_p = top_p self.pending_queries = [] self.max_concurrency = max_concurrency - - logger.info(f"OfflineSUT configured with max_concurrency={max_concurrency} (backend handles batching)") + + logger.info( + f"OfflineSUT configured with max_concurrency={max_concurrency} (backend handles batching)") def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + In Offline mode, we accumulate all queries and process them in batch. - + Args: query_samples: List of MLPerf LoadGen query samples """ logger.info(f"Received {len(query_samples)} queries") - + # Store queries for batch processing for qs in query_samples: self.pending_queries.append(qs) def flush_queries(self) -> None: """Process all accumulated queries with concurrent requests. - + Sends individual requests concurrently up to max_concurrency limit. SGLang handles batching internally via continuous batching. 
""" @@ -78,18 +79,20 @@ def flush_queries(self) -> None: logger.info("No pending queries to flush") return - logger.info(f"Flushing {len(self.pending_queries)} queries with max_concurrency={self.max_concurrency}") + logger.info( + f"Flushing {len(self.pending_queries)} queries with max_concurrency={self.max_concurrency}") start_time = time.time() from concurrent.futures import ThreadPoolExecutor, as_completed - + def process_single_query(query_sample): """Process a single query (backend batches automatically via continuous batching).""" query_id = query_sample.id input_ids = self.dataset[query_sample.index] - + # Call backend with single query - # SGLang will batch this with other concurrent requests automatically + # SGLang will batch this with other concurrent requests + # automatically responses = self.backend.generate( prompts=[input_ids], # Single query as list max_tokens=self.max_tokens, @@ -97,23 +100,27 @@ def process_single_query(query_sample): top_k=self.top_k, top_p=self.top_p ) - + return query_id, query_sample, responses[0] - + try: # Process queries in parallel with max_concurrency - logger.info(f"Submitting {len(self.pending_queries)} queries to {self.max_concurrency} concurrent workers...") + logger.info( + f"Submitting {len(self.pending_queries)} queries to {self.max_concurrency} concurrent workers...") with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: # Submit all queries at once - futures = [executor.submit(process_single_query, qs) for qs in self.pending_queries] - + futures = [ + executor.submit( + process_single_query, + qs) for qs in self.pending_queries] + # Process results as they complete completed_count = 0 for future in as_completed(futures): try: query_id, query_sample, response = future.result() output_ids = response.get("output_ids", []) - + # Store results self.results[query_id] = { "output_ids": output_ids, @@ -124,7 +131,8 @@ def process_single_query(query_sample): # Convert output_ids to numpy array for LoadGen # LoadGen expects int32 token IDs as a contiguous array if output_ids: - token_array = np.ascontiguousarray(output_ids, dtype=np.int32) + token_array = np.ascontiguousarray( + output_ids, dtype=np.int32) output_data_ptr = token_array.ctypes.data output_data_size = token_array.nbytes else: @@ -132,7 +140,7 @@ def process_single_query(query_sample): token_array = np.array([], dtype=np.int32) output_data_ptr = 0 output_data_size = 0 - + # Create response for LoadGen response_array = [ lg.QuerySampleResponse( @@ -141,22 +149,25 @@ def process_single_query(query_sample): output_data_size ) ] - + # Report completion to LoadGen lg.QuerySamplesComplete(response_array) - + # Update progress bar if self.progress_bar is not None: self.progress_bar.update(1) self.progress_bar.refresh() - + completed_count += 1 - # Log progress at debug level only (tqdm shows progress) + # Log progress at debug level only (tqdm shows + # progress) if completed_count % 100 == 0: - logger.debug(f"Completed {completed_count}/{len(self.pending_queries)} queries") - + logger.debug( + f"Completed {completed_count}/{len(self.pending_queries)} queries") + except Exception as e: - logger.error(f"Error processing query: {e}", exc_info=True) + logger.error( + f"Error processing query: {e}", exc_info=True) elapsed = time.time() - start_time logger.info( @@ -170,4 +181,3 @@ def process_single_query(query_sample): finally: # Clear pending queries self.pending_queries = [] - diff --git a/language/gpt-oss/mlperf/qsl.py b/language/gpt-oss/mlperf/qsl.py index 
8a6395f9cb..e7b06a1bb8 100644 --- a/language/gpt-oss/mlperf/qsl.py +++ b/language/gpt-oss/mlperf/qsl.py @@ -10,13 +10,13 @@ class QuerySampleLibrary: """Query Sample Library implementation. - + This class manages the dataset of samples that LoadGen will query. """ def __init__(self, dataset: List[List[int]]): """Initialize the Query Sample Library. - + Args: dataset: List of tokenized prompts (list of token ID lists) """ @@ -26,7 +26,7 @@ def __init__(self, dataset: List[List[int]]): def load_query_samples(self, sample_indices: List[int]) -> None: """Load specified query samples into memory. - + Args: sample_indices: List of sample indices to load """ @@ -35,7 +35,7 @@ def load_query_samples(self, sample_indices: List[int]) -> None: def unload_query_samples(self, sample_indices: List[int]) -> None: """Unload specified query samples from memory. - + Args: sample_indices: List of sample indices to unload """ @@ -63,4 +63,3 @@ def __exit__(self, exc_type, exc_val, exc_tb): lg.DestroyQSL(self.qsl) self.qsl = None logger.info("QSL destroyed") - diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py index 53e72319f6..880a9a0c25 100644 --- a/language/gpt-oss/mlperf/server_sut.py +++ b/language/gpt-oss/mlperf/server_sut.py @@ -32,7 +32,7 @@ class StreamingQueryState: class ServerSUT(BaseSUT): """Server scenario SUT with streaming support. - + Properly reports FirstTokenComplete and QuerySamplesComplete to LoadGen. """ @@ -49,7 +49,7 @@ def __init__( progress_bar=None ): """Initialize the Server SUT. - + Args: backend: Backend instance for inference (must support streaming) dataset: List of tokenized prompts @@ -67,36 +67,40 @@ def __init__( self.top_k = top_k self.top_p = top_p self.num_workers = num_workers - + # Query queue and streaming state self.query_queue = queue.Queue() self.active_streams: Dict[int, StreamingQueryState] = {} self.active_streams_lock = threading.Lock() - + # Worker threads self.workers = [] self.should_stop = threading.Event() - + # Progress tracking self.queries_completed = 0 self.progress_lock = threading.Lock() - + # Event loop for async streaming self.loop = None self.loop_thread = None - - logger.info(f"ServerSUT configured with num_workers={num_workers} (streaming enabled)") + + logger.info( + f"ServerSUT configured with num_workers={num_workers} (streaming enabled)") def start(self) -> lg.ConstructSUT: """Start the SUT and worker threads.""" # Start event loop thread for async streaming self._start_event_loop() - + # Start worker threads self._start_workers() - + # Create LoadGen SUT - self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.name) + self.sut = lg.ConstructSUT( + self.issue_queries, + self.flush_queries, + self.name) logger.info(f"{self.name} started with streaming support") return self.sut @@ -106,14 +110,14 @@ def run_loop(): self.loop = asyncio.new_event_loop() asyncio.set_event_loop(self.loop) self.loop.run_forever() - + self.loop_thread = threading.Thread(target=run_loop, daemon=True) self.loop_thread.start() - + # Wait for loop to be ready while self.loop is None: time.sleep(0.001) - + logger.info("Async event loop started") def _start_workers(self): @@ -137,9 +141,10 @@ def _worker_thread(self): except queue.Empty: continue except KeyboardInterrupt: - logger.info("Worker thread interrupted, exiting gracefully...") + logger.info( + "Worker thread interrupted, exiting gracefully...") break - + # Schedule async streaming processing if self.loop and not self.should_stop.is_set(): future = 
asyncio.run_coroutine_threadsafe( @@ -147,20 +152,20 @@ def _worker_thread(self): self.loop ) # Don't wait for completion - it happens asynchronously - + except Exception as e: logger.error(f"Worker thread error: {e}", exc_info=True) async def _process_streaming_query(self, query_sample: lg.QuerySample): """Process a single query with streaming support. - + Args: query_sample: MLPerf LoadGen query sample """ query_id = query_sample.id sample_idx = query_sample.index input_ids = self.dataset[sample_idx] - + # Initialize streaming state state = StreamingQueryState( query_sample=query_sample, @@ -173,10 +178,10 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): start_time=time.time(), finished=False ) - + with self.active_streams_lock: self.active_streams[query_id] = state - + try: # Stream tokens from backend async for chunk in self.backend.generate_stream( @@ -191,30 +196,33 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): state.accumulated_tokens.extend(chunk["delta_token_ids"]) if chunk.get("delta_text"): state.accumulated_text += chunk["delta_text"] - + # Send FirstTokenComplete on first token - if chunk.get("is_first_token") and not state.first_token_received: + if chunk.get( + "is_first_token") and not state.first_token_received: state.first_token_received = True state.first_token_time = time.time() await self._send_first_token_complete(state) - + # Check if finished if chunk.get("is_finished"): state.finished = True await self._send_final_response(state) break - + # If no explicit finish signal, send final response if not state.finished: state.finished = True await self._send_final_response(state) - + except Exception as e: - logger.error(f"Error processing streaming query {query_id}: {e}", exc_info=True) + logger.error( + f"Error processing streaming query {query_id}: {e}", + exc_info=True) # Send empty response to unblock LoadGen try: await self._send_final_response(state) - except: + except BaseException: pass finally: # Clean up @@ -224,15 +232,17 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): async def _send_first_token_complete(self, state: StreamingQueryState): """Send FirstTokenComplete to LoadGen for TTFT measurement.""" try: - logger.debug(f"First token for query {state.query_id} at {state.first_token_time - state.start_time:.3f}s") - + logger.debug( + f"First token for query {state.query_id} at {state.first_token_time - state.start_time:.3f}s") + # Convert tokens to numpy array if state.accumulated_tokens: - token_array = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + token_array = np.ascontiguousarray( + state.accumulated_tokens, dtype=np.int32) else: # Need at least an empty array token_array = np.array([], dtype=np.int32) - + # Create response response = lg.QuerySampleResponse( state.query_id, @@ -240,18 +250,21 @@ async def _send_first_token_complete(self, state: StreamingQueryState): token_array.nbytes, len(token_array) ) - + # Report to LoadGen lg.FirstTokenComplete([response]) - + except Exception as e: - logger.error(f"Error sending FirstTokenComplete for query {state.query_id}: {e}", exc_info=True) + logger.error( + f"Error sending FirstTokenComplete for query {state.query_id}: {e}", + exc_info=True) async def _send_final_response(self, state: StreamingQueryState): """Send final QuerySamplesComplete to LoadGen.""" try: - logger.debug(f"Final response for query {state.query_id}, {len(state.accumulated_tokens)} tokens") - + logger.debug( + f"Final response for query 
{state.query_id}, {len(state.accumulated_tokens)} tokens") + # Store results self.results[state.query_id] = { "output_ids": state.accumulated_tokens, @@ -261,13 +274,14 @@ async def _send_final_response(self, state: StreamingQueryState): "ttft": state.first_token_time - state.start_time if state.first_token_time else None, } } - + # Convert tokens to numpy array if state.accumulated_tokens: - token_array = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + token_array = np.ascontiguousarray( + state.accumulated_tokens, dtype=np.int32) else: token_array = np.array([], dtype=np.int32) - + # Create response response = lg.QuerySampleResponse( state.query_id, @@ -275,25 +289,27 @@ async def _send_final_response(self, state: StreamingQueryState): token_array.nbytes, len(token_array) ) - + # Report to LoadGen lg.QuerySamplesComplete([response]) - + # Update progress bar (force refresh for async updates) if self.progress_bar is not None: with self.progress_lock: self.queries_completed += 1 self.progress_bar.update(1) self.progress_bar.refresh() # Force redraw from async context - + except Exception as e: - logger.error(f"Error sending final response for query {state.query_id}: {e}", exc_info=True) + logger.error( + f"Error sending final response for query {state.query_id}: {e}", + exc_info=True) def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + In Server mode, queries are added to a queue for worker threads. - + Args: query_samples: List of MLPerf LoadGen query samples """ @@ -302,23 +318,23 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: def flush_queries(self) -> None: """Flush all pending queries. - + Wait for all issued queries to complete. """ logger.info("Flushing server queries...") - + # Wait for queue to empty and all streams to complete while True: queue_empty = self.query_queue.empty() - + with self.active_streams_lock: no_active_streams = len(self.active_streams) == 0 - + if queue_empty and no_active_streams: break - + time.sleep(0.01) - + logger.info("Server queries flushed") def stop(self) -> None: @@ -326,24 +342,24 @@ def stop(self) -> None: if self.should_stop.is_set(): logger.info(f"{self.name} already stopping or stopped.") return - + logger.info(f"Stopping {self.name}...") self.should_stop.set() - + # Wait for workers for i, worker in enumerate(self.workers): logger.info(f"Waiting for worker {i+1}/{len(self.workers)}...") worker.join(timeout=5) if worker.is_alive(): logger.warning(f"Worker {i+1} did not terminate gracefully") - + # Stop event loop if self.loop: self.loop.call_soon_threadsafe(self.loop.stop) if self.loop_thread: self.loop_thread.join(timeout=2) - + logger.info("All workers stopped") - + # Destroy LoadGen SUT super().stop() diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index e22b16538e..d30b1cd4ee 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -7,10 +7,10 @@ Usage: # Offline scenario (performance) python run_mlperf.py --mode offline --input-file data/accuracy_eval_tokenized.pkl - + # Server scenario (performance) python run_mlperf.py --mode server --input-file data/accuracy_eval_tokenized.pkl - + # Accuracy mode python run_mlperf.py --mode offline --accuracy --input-file data/accuracy_eval_tokenized.pkl """ @@ -173,7 +173,7 @@ def configure_loadgen( model_name: str = "gpt-oss" ) -> lg.TestSettings: """Configure LoadGen test settings. 
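The completion path above always funnels through the same few lines: pack the output token IDs into a contiguous int32 buffer, then hand LoadGen a pointer, a byte count, and the token count. A condensed helper is sketched below; it is only meaningful while a LoadGen test is running, so treat it as illustrative.

```python
import numpy as np
import mlperf_loadgen as lg


def complete_query(query_id, output_ids):
    """Report one finished query to LoadGen (call only during an active test)."""
    if output_ids:
        token_array = np.ascontiguousarray(output_ids, dtype=np.int32)
        ptr, nbytes, n_tokens = token_array.ctypes.data, token_array.nbytes, len(output_ids)
    else:
        token_array = np.array([], dtype=np.int32)
        ptr, nbytes, n_tokens = 0, 0, 0

    # The fourth argument is the output token count used for tokens/sec metrics.
    response = lg.QuerySampleResponse(query_id, ptr, nbytes, n_tokens)
    lg.QuerySamplesComplete([response])
    return token_array  # keep a reference until LoadGen has copied the buffer
```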
- + Args: scenario: MLPerf scenario ("offline" or "server") accuracy_mode: Whether to run in accuracy mode @@ -181,7 +181,7 @@ def configure_loadgen( user_conf: Path to user config file log_dir: Directory for logs model_name: Model name for configuration - + Returns: LoadGen TestSettings """ @@ -221,7 +221,7 @@ def main(): """Main function.""" parser = create_argument_parser() args = parser.parse_args() - + # Track resources for cleanup sut = None qsl = None @@ -232,13 +232,13 @@ def main(): def do_cleanup(): """Perform cleanup once and only once.""" nonlocal cleanup_done, pbar, sut, qsl, backend - + if cleanup_done: return cleanup_done = True - + logger.info("Performing cleanup...") - + # 1. Close progress bar first (before any LoadGen cleanup) try: if pbar is not None: @@ -247,11 +247,11 @@ def do_cleanup(): logger.debug(" ✓ Progress bar closed") except Exception as e: logger.debug(f" ! Error closing progress bar: {e}") - + # Small delay to let LoadGen internal threads finish import time time.sleep(0.5) - + # 2. Stop SUT (this will stop worker threads and flush) try: if sut is not None: @@ -261,7 +261,7 @@ def do_cleanup(): logger.info(" ✓ SUT stopped") except Exception as e: logger.warning(f" ! Error stopping SUT: {e}") - + # 3. Destroy QSL try: if qsl is not None and qsl.qsl is not None: @@ -271,7 +271,7 @@ def do_cleanup(): logger.info(" ✓ QSL destroyed") except Exception as e: logger.warning(f" ! Error destroying QSL: {e}") - + # 4. Cleanup backend last try: if backend is not None and backend.initialized: @@ -285,7 +285,8 @@ def do_cleanup(): try: # Create output directories output_dir = Path(args.output_dir) - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) logger.info("=" * 80) @@ -308,7 +309,7 @@ def do_cleanup(): prompts = dataset_info["prompts"] df = dataset_info["dataframe"] pbar.update(1) - + logger.info(f"Loaded {len(prompts)} prompts from dataset") # Initialize backend @@ -323,7 +324,7 @@ def do_cleanup(): # Initialize backend backend.initialize() - + # Create progress bar for real-time updates pbar = tqdm( total=len(prompts), @@ -334,7 +335,7 @@ def do_cleanup(): mininterval=0.1, # Update display every 0.1s minimum smoothing=0.1 # Smooth display updates ) - + # Create SUT with progress bar logger.info(f"Creating {args.mode} SUT...") if args.mode == "offline": @@ -371,7 +372,7 @@ def do_cleanup(): qsl.load_query_samples, qsl.unload_query_samples ) - + # Configure LoadGen settings = configure_loadgen( scenario=args.mode, @@ -400,7 +401,7 @@ def do_cleanup(): log_settings ) logger.info("LoadGen test completed successfully") - + # Give LoadGen a moment to finish internal cleanup import time time.sleep(0.2) @@ -425,7 +426,8 @@ def do_cleanup(): logger.info("=" * 80) logger.info("Accuracy mode completed!") logger.info("To evaluate accuracy, run:") - logger.info(f" python eval_accuracy.py --input-file {log_dir}/mlperf_log_accuracy.json") + logger.info( + f" python eval_accuracy.py --input-file {log_dir}/mlperf_log_accuracy.json") logger.info("=" * 80) except KeyboardInterrupt: @@ -438,7 +440,7 @@ def do_cleanup(): logger.info("=" * 80) # Exit immediately to prevent finally block from running os._exit(130) # Use os._exit to skip finally block - + except Exception as e: logger.error("\n" + "=" * 80) logger.error(f"❌ Error during test: {e}") @@ -448,7 +450,7 @@ def do_cleanup(): logger.error("=" * 80) # Exit 
immediately to prevent finally block from running os._exit(1) - + finally: # Only run cleanup if not already done (normal exit path) if not cleanup_done: @@ -460,4 +462,3 @@ def do_cleanup(): if __name__ == "__main__": main() - From 7cdc7cbc89063de1d91f9e965605eb833b6cbb25 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 21:58:04 +0000 Subject: [PATCH 115/165] add utils, gitignore --- language/gpt-oss/.gitignore | 3 + language/gpt-oss/utils/__init__.py | 10 ++ language/gpt-oss/utils/tokenization.py | 133 +++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 language/gpt-oss/.gitignore create mode 100644 language/gpt-oss/utils/__init__.py create mode 100644 language/gpt-oss/utils/tokenization.py diff --git a/language/gpt-oss/.gitignore b/language/gpt-oss/.gitignore new file mode 100644 index 0000000000..78317dc552 --- /dev/null +++ b/language/gpt-oss/.gitignore @@ -0,0 +1,3 @@ +*venv* +*.pkl +*.csv \ No newline at end of file diff --git a/language/gpt-oss/utils/__init__.py b/language/gpt-oss/utils/__init__.py new file mode 100644 index 0000000000..fc1dd7cb1e --- /dev/null +++ b/language/gpt-oss/utils/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +"""Utilities for gpt-oss MLPerf integration.""" + +from .tokenization import StandardTokenizer, load_tokenized_dataset + +__all__ = [ + "StandardTokenizer", + "load_tokenized_dataset", +] + diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py new file mode 100644 index 0000000000..bb89bda445 --- /dev/null +++ b/language/gpt-oss/utils/tokenization.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +"""Tokenization utilities for gpt-oss.""" + +import logging +from typing import List, Dict, Any, Optional +import pandas as pd +from transformers import AutoTokenizer + +logger = logging.getLogger(__name__) + +MODEL_NAME = "openai/gpt-oss-120b" + + +class StandardTokenizer: + """Standard tokenizer wrapper for gpt-oss model.""" + + def __init__(self, model_name: str = MODEL_NAME): + """Initialize the tokenizer. + + Args: + model_name: HuggingFace model name or path + """ + self.model_name = model_name + self.tokenizer = None + logger.info(f"Initializing tokenizer for {model_name}") + + def load(self) -> None: + """Load the tokenizer.""" + if self.tokenizer is None: + logger.info(f"Loading tokenizer from {self.model_name}") + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + logger.info("Tokenizer loaded successfully") + + def encode(self, text: str) -> List[int]: + """Encode text to token IDs. + + Args: + text: Input text + + Returns: + List of token IDs + """ + if self.tokenizer is None: + self.load() + return self.tokenizer.encode(text) + + def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str: + """Decode token IDs to text. + + Args: + token_ids: List of token IDs + skip_special_tokens: Whether to skip special tokens + + Returns: + Decoded text + """ + if self.tokenizer is None: + self.load() + return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + def __call__(self, text: str) -> List[int]: + """Encode text to token IDs (callable interface). + + Args: + text: Input text + + Returns: + List of token IDs + """ + return self.encode(text) + + +def load_tokenized_dataset( + dataset_path: str, + max_samples: Optional[int] = None +) -> Dict[str, Any]: + """Load a tokenized dataset from pickle file. 
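A small usage sketch for the `StandardTokenizer` wrapper introduced above. It assumes the script runs from `language/gpt-oss` so the `utils` package is importable, and that the `openai/gpt-oss-120b` tokenizer can be downloaded from the Hub.

```python
from utils.tokenization import StandardTokenizer

tokenizer = StandardTokenizer()                 # defaults to openai/gpt-oss-120b
token_ids = tokenizer.encode("Hello, MLPerf!")  # tokenizer lazy-loads on first use
text = tokenizer.decode(token_ids)              # skip_special_tokens=True by default

print(len(token_ids), "tokens ->", text)

# The callable interface is equivalent to encode():
assert tokenizer("Hello, MLPerf!") == token_ids
```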
+ + Args: + dataset_path: Path to the pickle file containing tokenized data + max_samples: Maximum number of samples to load (None for all) + + Returns: + Dictionary containing: + - prompts: List of tokenized prompts + - dataframe: Original DataFrame + - metadata: Additional metadata + """ + logger.info(f"Loading tokenized dataset from {dataset_path}") + + # Load DataFrame from pickle + df = pd.read_pickle(dataset_path) + logger.info(f"Loaded DataFrame with shape: {df.shape}") + + # Limit samples if specified + if max_samples is not None: + df = df.head(max_samples) + logger.info(f"Limited to {max_samples} samples") + + # Extract tokenized prompts + if 'tok_input' not in df.columns: + raise ValueError("Dataset must have 'tok_input' column with tokenized prompts") + + # Verify tokenization + failed_mask = df['tok_input'].isna() + if failed_mask.any(): + failed_count = failed_mask.sum() + logger.error(f"Found {failed_count} samples with failed tokenization") + raise ValueError(f"{failed_count} samples have invalid tokenization") + + prompts = df['tok_input'].tolist() + logger.info(f"Loaded {len(prompts)} tokenized prompts") + + # Log statistics + prompt_lengths = [len(p) for p in prompts] + logger.info( + f"Prompt length stats - " + f"min: {min(prompt_lengths)}, " + f"max: {max(prompt_lengths)}, " + f"mean: {sum(prompt_lengths)/len(prompt_lengths):.1f}" + ) + + return { + "prompts": prompts, + "dataframe": df, + "metadata": { + "num_samples": len(prompts), + "min_length": min(prompt_lengths), + "max_length": max(prompt_lengths), + "mean_length": sum(prompt_lengths) / len(prompt_lengths) + } + } + From ab90695ed23023431ee2d3863452b161a6476d6e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 20 Nov 2025 21:58:44 +0000 Subject: [PATCH 116/165] [Automated Commit] Format Codebase --- language/gpt-oss/utils/__init__.py | 1 - language/gpt-oss/utils/tokenization.py | 30 ++++++++++++++------------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/language/gpt-oss/utils/__init__.py b/language/gpt-oss/utils/__init__.py index fc1dd7cb1e..9b3b53963d 100644 --- a/language/gpt-oss/utils/__init__.py +++ b/language/gpt-oss/utils/__init__.py @@ -7,4 +7,3 @@ "StandardTokenizer", "load_tokenized_dataset", ] - diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py index bb89bda445..a64d77a5f9 100644 --- a/language/gpt-oss/utils/tokenization.py +++ b/language/gpt-oss/utils/tokenization.py @@ -16,7 +16,7 @@ class StandardTokenizer: def __init__(self, model_name: str = MODEL_NAME): """Initialize the tokenizer. - + Args: model_name: HuggingFace model name or path """ @@ -33,10 +33,10 @@ def load(self) -> None: def encode(self, text: str) -> List[int]: """Encode text to token IDs. - + Args: text: Input text - + Returns: List of token IDs """ @@ -44,26 +44,28 @@ def encode(self, text: str) -> List[int]: self.load() return self.tokenizer.encode(text) - def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str: + def decode(self, token_ids: List[int], + skip_special_tokens: bool = True) -> str: """Decode token IDs to text. 
- + Args: token_ids: List of token IDs skip_special_tokens: Whether to skip special tokens - + Returns: Decoded text """ if self.tokenizer is None: self.load() - return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + return self.tokenizer.decode( + token_ids, skip_special_tokens=skip_special_tokens) def __call__(self, text: str) -> List[int]: """Encode text to token IDs (callable interface). - + Args: text: Input text - + Returns: List of token IDs """ @@ -75,11 +77,11 @@ def load_tokenized_dataset( max_samples: Optional[int] = None ) -> Dict[str, Any]: """Load a tokenized dataset from pickle file. - + Args: dataset_path: Path to the pickle file containing tokenized data max_samples: Maximum number of samples to load (None for all) - + Returns: Dictionary containing: - prompts: List of tokenized prompts @@ -87,7 +89,7 @@ def load_tokenized_dataset( - metadata: Additional metadata """ logger.info(f"Loading tokenized dataset from {dataset_path}") - + # Load DataFrame from pickle df = pd.read_pickle(dataset_path) logger.info(f"Loaded DataFrame with shape: {df.shape}") @@ -99,7 +101,8 @@ def load_tokenized_dataset( # Extract tokenized prompts if 'tok_input' not in df.columns: - raise ValueError("Dataset must have 'tok_input' column with tokenized prompts") + raise ValueError( + "Dataset must have 'tok_input' column with tokenized prompts") # Verify tokenization failed_mask = df['tok_input'].isna() @@ -130,4 +133,3 @@ def load_tokenized_dataset( "mean_length": sum(prompt_lengths) / len(prompt_lengths) } } - From 62e4d471b9e89cb0b425d0bad0d3be4d919b9a3e Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 22:26:07 +0000 Subject: [PATCH 117/165] update README --- language/gpt-oss/README.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 4dff1ea501..935f37b4fa 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -4,16 +4,10 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is ## Model and Dataset download * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) -* Dataset: For now, we are re-using the `deepseek-r1` dataset. +* Dataset: Please request access at [this link](https://drive.google.com/drive/folders/1DCfEXHqe69okrqKbSyV-8VUw413JqpPY?usp=drive_link) - **this is a tentative dataset** ## Environment setup -Work on reference implementation is done using the sglang container `lmsysorg/sglang:v0.5.4.post2`. For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. - -## Fetch accuracy eval dataset -To create the accruracy_eval pkl file: -```bash -$ ./preprocess.sh -``` +Work on reference implementation is done using the sglang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. 
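Tying the utilities from the previous patch together, a tokenized pickle is typically loaded like this before prompts are handed to a SUT; the file path and sample cap below are placeholders.

```python
from utils.tokenization import load_tokenized_dataset

info = load_tokenized_dataset("data/accuracy_eval_tokenized.pkl", max_samples=16)

prompts = info["prompts"]    # list of token-ID lists (taken from the 'tok_input' column)
df = info["dataframe"]       # original pandas DataFrame
meta = info["metadata"]      # num_samples plus min/max/mean prompt lengths

print(meta["num_samples"], "prompts, mean length", round(meta["mean_length"], 1))
```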
This does the following: - clones `https://huggingface.co/datasets/livecodebench/code_generation_lite` under `data/lcb` From 292c49d8ad0c812447f93bb32d8b04b06b7b44ad Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 20 Nov 2025 22:26:44 +0000 Subject: [PATCH 118/165] fix request pool size --- language/gpt-oss/backends/sglang_backend.py | 13 +++++++++++++ language/gpt-oss/mlperf/offline_sut.py | 7 +++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py index 0d932d35bf..30ec8ffac4 100644 --- a/language/gpt-oss/backends/sglang_backend.py +++ b/language/gpt-oss/backends/sglang_backend.py @@ -49,7 +49,20 @@ def initialize(self) -> None: return logger.info(f"Connecting to SGLang server at {self.server_url}") + + # Create session with larger connection pool for high concurrency + # Default pool size is 10, but we may have 100s-1000s of concurrent requests self.session = requests.Session() + + # Increase connection pool size to support high concurrency + adapter = requests.adapters.HTTPAdapter( + pool_connections=100, # Number of connection pools to cache + pool_maxsize=1000, # Maximum number of connections to save in the pool + max_retries=3, # Retry failed requests + pool_block=False # Don't block when pool is full, create new connections + ) + self.session.mount('http://', adapter) + self.session.mount('https://', adapter) # Test connection with a simple request try: diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index 986f5d9287..40a29da2f8 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -135,18 +135,21 @@ def process_single_query(query_sample): output_ids, dtype=np.int32) output_data_ptr = token_array.ctypes.data output_data_size = token_array.nbytes + n_tokens = len(output_ids) else: # Empty response token_array = np.array([], dtype=np.int32) output_data_ptr = 0 output_data_size = 0 + n_tokens = 0 - # Create response for LoadGen + # Create response for LoadGen with token count response_array = [ lg.QuerySampleResponse( query_id, output_data_ptr, - output_data_size + output_data_size, + n_tokens # Number of output tokens for tokens/sec metric ) ] From 98585b806e90e87bb2ee8e7c3dcdc201e9830dfe Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 20 Nov 2025 22:27:16 +0000 Subject: [PATCH 119/165] [Automated Commit] Format Codebase --- language/gpt-oss/backends/sglang_backend.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py index 30ec8ffac4..78040ad960 100644 --- a/language/gpt-oss/backends/sglang_backend.py +++ b/language/gpt-oss/backends/sglang_backend.py @@ -49,11 +49,12 @@ def initialize(self) -> None: return logger.info(f"Connecting to SGLang server at {self.server_url}") - + # Create session with larger connection pool for high concurrency - # Default pool size is 10, but we may have 100s-1000s of concurrent requests + # Default pool size is 10, but we may have 100s-1000s of concurrent + # requests self.session = requests.Session() - + # Increase connection pool size to support high concurrency adapter = requests.adapters.HTTPAdapter( pool_connections=100, # Number of connection pools to cache From 21a80341b284fc24c8eef19c7cfcdfff7b99d364 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 02:55:24 +0000 Subject: [PATCH 120/165] add setup --- 
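The connection-pool fix above maps onto requests' transport adapters; here is the same idea as a standalone sketch. The URL and pool numbers are placeholders rather than the repository's defaults.

```python
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(
    pool_connections=100,  # number of per-host pools to cache
    pool_maxsize=1000,     # connections kept per pool; size this >= expected concurrency
    max_retries=3,
    pool_block=False,      # overflow creates extra connections instead of blocking
)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Every request made through this session now reuses the enlarged pool, e.g.:
# session.post("http://localhost:30000/generate", json=payload, timeout=1200)
```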
language/gpt-oss/setup.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100755 language/gpt-oss/setup.sh diff --git a/language/gpt-oss/setup.sh b/language/gpt-oss/setup.sh new file mode 100755 index 0000000000..23188a0cbd --- /dev/null +++ b/language/gpt-oss/setup.sh @@ -0,0 +1,3 @@ +pip install -r requirements.txt +git_dir=$(git rev-parse --show-toplevel) +pip install $git_dir/loadgen \ No newline at end of file From 734d8f42b86e2dea1a4e2564a0749d4231e39267 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 02:58:01 +0000 Subject: [PATCH 121/165] updates --- language/gpt-oss/backends/sglang_backend.py | 74 ++++++++++++++++----- language/gpt-oss/mlperf/offline_sut.py | 5 ++ language/gpt-oss/mlperf/server_sut.py | 9 ++- language/gpt-oss/run_mlperf.py | 20 ++++-- language/gpt-oss/sglang/run_server.sh | 7 +- loadgen/mlperf.conf | 2 +- 6 files changed, 93 insertions(+), 24 deletions(-) diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py index 78040ad960..e15578e93d 100644 --- a/language/gpt-oss/backends/sglang_backend.py +++ b/language/gpt-oss/backends/sglang_backend.py @@ -23,6 +23,7 @@ def __init__( self, server_url: str = "http://localhost:30000", timeout: int = 1200, + max_pool_size: int = 2000, # Default pool size for high concurrency **kwargs ): """Initialize SGLang backend. @@ -30,16 +31,19 @@ def __init__( Args: server_url: URL of the SGLang server timeout: Request timeout in seconds + max_pool_size: Maximum connection pool size (should be >= max_concurrency) **kwargs: Additional configuration """ config = { "server_url": server_url, "timeout": timeout, + "max_pool_size": max_pool_size, **kwargs } super().__init__(config) self.server_url = server_url self.timeout = timeout + self.max_pool_size = max_pool_size self.session = None def initialize(self) -> None: @@ -49,18 +53,19 @@ def initialize(self) -> None: return logger.info(f"Connecting to SGLang server at {self.server_url}") - + logger.info(f"Configuring connection pool with max_pool_size={self.max_pool_size}") # Create session with larger connection pool for high concurrency # Default pool size is 10, but we may have 100s-1000s of concurrent # requests self.session = requests.Session() # Increase connection pool size to support high concurrency + # pool_maxsize should be >= max_concurrency to avoid "pool is full" warnings adapter = requests.adapters.HTTPAdapter( - pool_connections=100, # Number of connection pools to cache - pool_maxsize=1000, # Maximum number of connections to save in the pool - max_retries=3, # Retry failed requests - pool_block=False # Don't block when pool is full, create new connections + pool_connections=min(100, self.max_pool_size // 10), # Number of connection pools to cache + pool_maxsize=self.max_pool_size, # Maximum number of connections in the pool + max_retries=3, # Retry failed requests + pool_block=False # Don't block when pool is full, create new connections ) self.session.mount('http://', adapter) self.session.mount('https://', adapter) @@ -217,12 +222,18 @@ async def generate_stream( Yields: Dict with: - - delta_token_ids: List of new token IDs in this chunk + - delta_token_ids: List of new token IDs in this chunk (estimated during streaming) - delta_text: New text in this chunk - is_first_token: True if this is the first token - is_finished: True if generation is complete - - accumulated_token_ids: All tokens generated so far + - accumulated_token_ids: All tokens generated so far (accurate in final chunk) - metadata: 
Additional info (TTFT, etc.) + + Note: + SGLang's streaming API returns text incrementally but may not return token_ids + per chunk. We estimate token counts from text length during streaming (~4 chars/token). + The FINAL chunk (when finished=True) should contain the complete 'output_ids' array + with accurate token IDs, which we use for LoadGen's token/sec metrics. """ if not self.initialized: raise RuntimeError( @@ -285,22 +296,55 @@ async def generate_stream( chunk = json.loads(json_str) - # Extract token information from chunk - delta_token_ids = chunk.get("token_ids", []) + # Extract text delta delta_text = chunk.get("text", "") - - if delta_token_ids: - accumulated_token_ids.extend(delta_token_ids) + + # Check if this is the final chunk + is_finished = chunk.get("finished", False) + + # For token IDs: + # - SGLang may not return incremental token_ids in each streaming chunk + # - But should return full output_ids in the FINAL chunk when finished=True + # - For intermediate chunks, we estimate token count from text length + delta_token_ids = [] + + if is_finished and "output_ids" in chunk: + # Final chunk: use the complete output_ids from SGLang + all_output_ids = chunk["output_ids"] + accumulated_token_ids = all_output_ids + delta_token_ids = [] # Already accumulated + logger.debug(f"Received final output_ids: {len(all_output_ids)} tokens") + elif is_finished and "output_ids" not in chunk: + # Final chunk but no output_ids - log warning + logger.warning( + f"Final chunk received but no 'output_ids' field found. " + f"Using estimated token count: {len(accumulated_token_ids)}. " + f"For accurate token metrics, ensure SGLang returns 'output_ids' in streaming responses." + ) + elif "token_ids" in chunk: + # Incremental token IDs (if SGLang provides them) + delta_token_ids = chunk["token_ids"] + elif delta_text: + # Estimate token count from text delta (rough approximation) + # Average ~4 characters per token for English text + estimated_tokens = max(1, len(delta_text) // 4) + # Create placeholder token IDs for counting purposes + # These are NOT real token IDs, just for LoadGen metrics + delta_token_ids = [0] * estimated_tokens + + # Update accumulated state (only if not already set by final chunk) + if not (is_finished and "output_ids" in chunk): + # Accumulate incrementally (estimated or incremental token IDs) + if delta_token_ids: + accumulated_token_ids.extend(delta_token_ids) if delta_text: accumulated_text += delta_text # Mark first token timing - if is_first and delta_token_ids: + if is_first and (delta_token_ids or delta_text): first_token_time = time.time() is_first = False - is_finished = chunk.get("finished", False) - yield { "delta_token_ids": delta_token_ids, "delta_text": delta_text, diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index 40a29da2f8..c518293117 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -65,6 +65,11 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """ logger.info(f"Received {len(query_samples)} queries") + # Update progress bar total (Offline gets all queries at once) + if self.progress_bar is not None: + self.progress_bar.total = len(query_samples) + self.progress_bar.refresh() + # Store queries for batch processing for qs in query_samples: self.pending_queries.append(qs) diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py index 880a9a0c25..024c57e1b4 100644 --- 
a/language/gpt-oss/mlperf/server_sut.py +++ b/language/gpt-oss/mlperf/server_sut.py @@ -99,8 +99,7 @@ def start(self) -> lg.ConstructSUT: # Create LoadGen SUT self.sut = lg.ConstructSUT( self.issue_queries, - self.flush_queries, - self.name) + self.flush_queries) logger.info(f"{self.name} started with streaming support") return self.sut @@ -313,6 +312,12 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: Args: query_samples: List of MLPerf LoadGen query samples """ + # Update progress bar total dynamically as queries arrive + if self.progress_bar is not None: + with self.progress_lock: + self.progress_bar.total = (self.progress_bar.total or 0) + len(query_samples) + self.progress_bar.refresh() + for qs in query_samples: self.query_queue.put(qs) diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index d30b1cd4ee..ff2a017175 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -19,6 +19,7 @@ import logging import os import sys +import threading from pathlib import Path from typing import Optional @@ -315,25 +316,34 @@ def do_cleanup(): # Initialize backend logger.info(f"Initializing {args.backend} backend...") if args.backend == "sglang": + # Set pool size to match max_concurrency with small safety margin + # This prevents "connection pool is full" warnings + pool_size = int(args.max_concurrency * 1.1) # 10% safety margin backend = SGLangBackend( server_url=args.server_url, - timeout=1200 + timeout=1200, + max_pool_size=pool_size ) + logger.info(f"Backend configured with connection pool size: {pool_size} (from max_concurrency={args.max_concurrency})") else: raise ValueError(f"Unknown backend: {args.backend}") # Initialize backend backend.initialize() - # Create progress bar for real-time updates + # Create progress bar early so subsequent logs print below it + # Total will be dynamically updated by SUT based on actual queries from LoadGen: + # - Offline: Set once when all queries arrive + # - Server: Incremented as queries arrive pbar = tqdm( - total=len(prompts), + total=0, # Will be updated dynamically by SUT desc=f"MLPerf {args.mode}", unit="query", leave=True, position=0, - mininterval=0.1, # Update display every 0.1s minimum - smoothing=0.1 # Smooth display updates + mininterval=0.1, + smoothing=0.1, + dynamic_ncols=True ) # Create SUT with progress bar diff --git a/language/gpt-oss/sglang/run_server.sh b/language/gpt-oss/sglang/run_server.sh index fe02d68682..3fd01b6629 100755 --- a/language/gpt-oss/sglang/run_server.sh +++ b/language/gpt-oss/sglang/run_server.sh @@ -5,6 +5,7 @@ pip install -r requirements.txt dp=1 model_path=openai/gpt-oss-120b eagle_path="" +stream_interval=500 while [[ $# -gt 0 ]]; do case $1 in @@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do eagle_path=$2 shift 2 ;; + --stream_interval) + stream_interval=$2 + shift 2 + ;; *) echo "Unknown argument: $1" exit 1 @@ -36,7 +41,7 @@ args=" --model-path $model_path \ --mem-fraction-static 0.85 \ --chunked-prefill-size 16384 \ --ep-size=1 \ - --stream-interval 500 " + --stream-interval $stream_interval " if [ -n "$eagle_path" ]; then args="$args --speculative-draft-model-path $eagle_path \ diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 390cceddd8..e6b356b490 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -168,5 +168,5 @@ whisper.Offline.min_query_count = 1633 # These fields should be defined and overridden by user.conf. 
*.SingleStream.target_latency = 10 *.MultiStream.target_latency = 80 -*.Server.target_qps = 1.0 +*.Server.target_qps = 8.0 *.Offline.target_qps = 1.0 From e3e22b86310fab4361d8dac65ff43f610a8c505c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 03:57:07 +0000 Subject: [PATCH 122/165] server scenario fix; gpt-oss -> gpt-oss-120b --- language/gpt-oss/README.md | 60 ++++++++----- language/gpt-oss/backends/sglang_backend.py | 95 +++++++++++---------- language/gpt-oss/mlperf/user.conf | 29 ++----- language/gpt-oss/run_mlperf.py | 6 +- loadgen/mlperf.conf | 14 +-- 5 files changed, 112 insertions(+), 92 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 935f37b4fa..b6845d259c 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -9,34 +9,56 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is ## Environment setup Work on reference implementation is done using the sglang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. -This does the following: -- clones `https://huggingface.co/datasets/livecodebench/code_generation_lite` under `data/lcb` -- creates a `data/accuracy_eval_raw.pkl` with `aime1983-2024, gpqa_diamond, lcb-v1_v5` samples. -- converts the prompt into harmony format, and tokenizes them under `data/accuracy_eval_tokenized.pkl` using `HIGH` reasoning effort. - - This step uses multiprocessing with a default of 32 parallel workers (hardcoded). Please reduce this if you see `pyo3_runtime.PanicException` errors. +Once in the environment, install additional requirements using [`setup.sh`](./setup.sh): +```bash +./setup.sh +``` ## Running the reference implementation: SGLang +Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang server hosting `gpt-oss-120b`. ### Run the server ```bash ./run_server.sh \ - --model_path path_to_gpt_oss_120b_model \ # optional, defaults to fetching from HF - --dp N # optional, defaults to 1. Set this to number of accelerators + --model_path path/to/gpt-oss-120b/model \ + --dp N \ + --stream_interval 100 \ + --eagle_path optional/path/to/eagle/head ``` The script uses `python3 -m sglang.launch_server` tp instantiate the model, with `tp=pp=ep=1`, and `dp` as specified. +Then, run a benchmark script that uses the client to send/recv requests. 
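For reference, a minimal sketch of the kind of request such a client sends to the server started above. The `/generate` endpoint and the sampling-parameter key names are assumptions based on SGLang's native HTTP API, and the prompt token IDs are placeholders.

```python
import requests

payload = {
    "input_ids": [1, 2, 3],          # placeholder prompt token IDs
    "sampling_params": {
        "max_new_tokens": 64,        # key names assumed from SGLang's API
        "temperature": 0.0,
    },
}

resp = requests.post("http://localhost:30000/generate", json=payload, timeout=1200)
resp.raise_for_status()
result = resp.json()
print(result.get("text", ""))
print(len(result.get("output_ids", [])), "output tokens")
```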
### Run the inference ```bash -python3 run_infer.py \ - --input-tokens data/accuracy_eval_tokenized.pkl \ - --max-tokens 32768 \ - --max-concurrency 4096 \ - --timeout 2400 \ - --output data/accuracy_eval_inferred.pkl \ - --pass-k 5 -``` +python3 run_mlperf.py --help +Run MLPerf inference benchmarks for gpt-oss -### Evaluate the responses -```bash -python3 eval_accuracy.py --input-file data/accuracy_eval_inferred.pkl -``` \ No newline at end of file +options: + -h, --help show this help message and exit + --mode {offline,server} + MLPerf scenario mode + --input-file INPUT_FILE + Path to tokenized dataset (pickle file) + --max-samples MAX_SAMPLES + Maximum number of samples to use (None for all) + --mlperf-conf MLPERF_CONF + Path to MLPerf configuration file + --user-conf USER_CONF + Path to user configuration file + --accuracy Run accuracy mode instead of performance + --output-dir OUTPUT_DIR + Directory for MLPerf output logs + --backend {sglang} Backend to use for inference + --server-url SERVER_URL + Server URL for backend (SGLang) + --max-tokens MAX_TOKENS + Maximum tokens to generate + --temperature TEMPERATURE + Sampling temperature + --top-k TOP_K Top-k sampling parameter + --top-p TOP_P Top-p sampling parameter + --num-workers NUM_WORKERS + Number of worker threads (for server scenario) + --max-concurrency MAX_CONCURRENCY + Maximum concurrent requests to backend (SGLang handles batching internally) +``` diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss/backends/sglang_backend.py index e15578e93d..c17ab37451 100644 --- a/language/gpt-oss/backends/sglang_backend.py +++ b/language/gpt-oss/backends/sglang_backend.py @@ -222,18 +222,20 @@ async def generate_stream( Yields: Dict with: - - delta_token_ids: List of new token IDs in this chunk (estimated during streaming) + - delta_token_ids: List of new token IDs in this chunk - delta_text: New text in this chunk - is_first_token: True if this is the first token - is_finished: True if generation is complete - - accumulated_token_ids: All tokens generated so far (accurate in final chunk) - - metadata: Additional info (TTFT, etc.) + - accumulated_token_ids: All tokens generated so far + - metadata: Additional info (TTFT, completion_tokens, etc.) Note: - SGLang's streaming API returns text incrementally but may not return token_ids - per chunk. We estimate token counts from text length during streaming (~4 chars/token). - The FINAL chunk (when finished=True) should contain the complete 'output_ids' array - with accurate token IDs, which we use for LoadGen's token/sec metrics. 
+ SGLang's streaming API behavior: + - Returns 'output_ids', 'text', and 'meta_info' in each chunk + - 'output_ids' can have retractions (length can decrease between chunks) + - 'meta_info.completion_tokens' is the RELIABLE cumulative token count + - 'finish_reason' in meta_info indicates completion (not a 'finished' flag) + - We use completion_tokens for accurate LoadGen token/sec metrics """ if not self.initialized: raise RuntimeError( @@ -298,45 +300,52 @@ async def generate_stream( # Extract text delta delta_text = chunk.get("text", "") - + # Check if this is the final chunk - is_finished = chunk.get("finished", False) - - # For token IDs: - # - SGLang may not return incremental token_ids in each streaming chunk - # - But should return full output_ids in the FINAL chunk when finished=True - # - For intermediate chunks, we estimate token count from text length - delta_token_ids = [] + # SGLang uses 'finish_reason' in meta_info, not 'finished' flag + meta_info = chunk.get("meta_info", {}) + finish_reason = meta_info.get("finish_reason") + is_finished = (finish_reason is not None and finish_reason != "null") or chunk.get("finished", False) + + # Extract token information from chunk + # SGLang's output_ids can have retractions, so use meta_info.completion_tokens + # which is the reliable cumulative count + chunk_output_ids = chunk.get("output_ids", []) + completion_tokens = meta_info.get("completion_tokens", 0) - if is_finished and "output_ids" in chunk: - # Final chunk: use the complete output_ids from SGLang - all_output_ids = chunk["output_ids"] - accumulated_token_ids = all_output_ids - delta_token_ids = [] # Already accumulated - logger.debug(f"Received final output_ids: {len(all_output_ids)} tokens") - elif is_finished and "output_ids" not in chunk: - # Final chunk but no output_ids - log warning - logger.warning( - f"Final chunk received but no 'output_ids' field found. " - f"Using estimated token count: {len(accumulated_token_ids)}. " - f"For accurate token metrics, ensure SGLang returns 'output_ids' in streaming responses." 
- ) - elif "token_ids" in chunk: - # Incremental token IDs (if SGLang provides them) - delta_token_ids = chunk["token_ids"] - elif delta_text: - # Estimate token count from text delta (rough approximation) - # Average ~4 characters per token for English text - estimated_tokens = max(1, len(delta_text) // 4) - # Create placeholder token IDs for counting purposes - # These are NOT real token IDs, just for LoadGen metrics - delta_token_ids = [0] * estimated_tokens - - # Update accumulated state (only if not already set by final chunk) - if not (is_finished and "output_ids" in chunk): - # Accumulate incrementally (estimated or incremental token IDs) - if delta_token_ids: + if completion_tokens > 0: + # Use completion_tokens as the authoritative count + previous_count = len(accumulated_token_ids) + + if completion_tokens > previous_count: + # New tokens generated + num_new_tokens = completion_tokens - previous_count + + if chunk_output_ids and len(chunk_output_ids) >= num_new_tokens: + # Use actual token IDs from chunk + delta_token_ids = chunk_output_ids[-num_new_tokens:] if num_new_tokens > 0 else [] + else: + # Fallback: create placeholder tokens for counting + delta_token_ids = list(range(previous_count, completion_tokens)) + accumulated_token_ids.extend(delta_token_ids) + else: + delta_token_ids = [] + + else: + # No completion_tokens - fallback to output_ids or text estimation + if chunk_output_ids: + delta_token_ids = chunk_output_ids + accumulated_token_ids.extend(delta_token_ids) + elif delta_text: + # Estimate from text length + estimated_tokens = max(1, len(delta_text) // 4) + delta_token_ids = [0] * estimated_tokens + accumulated_token_ids.extend(delta_token_ids) + else: + delta_token_ids = [] + + # Accumulate text if delta_text: accumulated_text += delta_text diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf index 9dc487f39f..a836241115 100644 --- a/language/gpt-oss/mlperf/user.conf +++ b/language/gpt-oss/mlperf/user.conf @@ -1,22 +1,11 @@ -# User configuration for gpt-oss MLPerf inference -# Override default settings from mlperf.conf here +gpt-oss-120b.Offline.target_qps = 10.0 +gpt-oss-120b.Offline.min_duration = 60000 +gpt-oss-120b.Offline.min_query_count = 8036 -# Offline scenario -gpt-oss.Offline.target_qps = 1.0 -gpt-oss.Offline.min_duration = 60000 -gpt-oss.Offline.min_query_count = 100 - -# Server scenario -gpt-oss.Server.target_qps = 1.0 -gpt-oss.Server.min_duration = 60000 -gpt-oss.Server.target_latency = 0 -gpt-oss.Server.ttft_latency = 2000 -gpt-oss.Server.tpot_latency = 20 - -# SingleStream scenario (if needed) -gpt-oss.SingleStream.target_latency = 1000 -gpt-oss.SingleStream.min_duration = 60000 - -# Performance sample count -# gpt-oss.*.performance_sample_count_override = 8036 +gpt-oss-120b.Server.target_qps = 7.0 +gpt-oss-120b.Server.min_duration = 60000 +gpt-oss-120b.Server.min_query_count = 8036 +gpt-oss-120b.Server.target_latency = 0 +gpt-oss-120b.Server.ttft_latency = 2000 +gpt-oss-120b.Server.tpot_latency = 20 diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index ff2a017175..bc83b32009 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -171,7 +171,7 @@ def configure_loadgen( mlperf_conf: Optional[str] = None, user_conf: Optional[str] = None, log_dir: Optional[str] = None, - model_name: str = "gpt-oss" + model_name: str = "gpt-oss-120b" ) -> lg.TestSettings: """Configure LoadGen test settings. 
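The retraction-tolerant accounting above boils down to trusting the cumulative `meta_info.completion_tokens` over the per-chunk `output_ids` list. A standalone sketch of that bookkeeping, fed with made-up chunks rather than live SGLang responses:

```python
def apply_chunk(accumulated, chunk):
    """Update `accumulated` token IDs from one streaming chunk; return the delta."""
    completion_tokens = chunk.get("meta_info", {}).get("completion_tokens", 0)
    output_ids = chunk.get("output_ids", [])
    new = completion_tokens - len(accumulated)
    if new <= 0:
        return []                                   # nothing new (or a retraction)
    if len(output_ids) >= new:
        delta = output_ids[-new:]                   # real token IDs for the new part
    else:
        delta = list(range(len(accumulated), completion_tokens))  # placeholder IDs
    accumulated.extend(delta)
    return delta


acc = []
print(apply_chunk(acc, {"meta_info": {"completion_tokens": 3}, "output_ids": [11, 12, 13]}))
print(apply_chunk(acc, {"meta_info": {"completion_tokens": 5}, "output_ids": [11, 12, 13, 14, 15]}))
# -> [11, 12, 13] then [14, 15]
```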
@@ -356,7 +356,7 @@ def do_cleanup(): temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, - name=f"gpt-oss_offline_sut", + name=f"gpt-oss-120b_offline_sut", progress_bar=pbar, max_concurrency=args.max_concurrency ) @@ -369,7 +369,7 @@ def do_cleanup(): top_k=args.top_k, top_p=args.top_p, num_workers=args.num_workers, - name=f"gpt-oss_server_sut", + name=f"gpt-oss-120b_server_sut", progress_bar=pbar ) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index e6b356b490..af5b9f81a7 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -25,7 +25,7 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 -gpt-oss.*.performance_sample_count_override = 8036 +gpt-oss-120b.*.performance_sample_count_override = 8036 whisper.*.performance_sample_count_override = 1633 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -67,7 +67,7 @@ llama3_1-8b.*.sample_concatenate_permutation = 1 llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 -gpt-oss.*.sample_concatenate_permutation = 1 +gpt-oss-120b.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 @@ -92,7 +92,7 @@ llama3_1-8b.*.use_token_latencies = 1 llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies = 1 -gpt-oss.*.use_token_latencies = 1 +gpt-oss-120b.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 # gptj benchmark infers token latencies @@ -135,9 +135,9 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 -gpt-oss.Server.target_latency = 0 -gpt-oss.Server.ttft_latency = 2000 -gpt-oss.Server.tpot_latency = 20 +gpt-oss-120b.Server.target_latency = 0 +gpt-oss-120b.Server.ttft_latency = 2000 +gpt-oss-120b.Server.tpot_latency = 20 *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -162,7 +162,7 @@ llama3_1-8b-edge.Offline.min_query_count = 5000 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 -gpt-oss.Offline.min_query_count = 8036 +gpt-oss-120b.Offline.min_query_count = 8036 whisper.Offline.min_query_count = 1633 # These fields should be defined and overridden by user.conf. From e40a7daf33356ff46fff11eff7311e5f260920ff Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 07:02:50 +0000 Subject: [PATCH 123/165] add fixes --- language/gpt-oss/README.md | 8 ++ language/gpt-oss/mlperf/base_sut.py | 18 +++- language/gpt-oss/mlperf/offline_sut.py | 37 ++++++-- language/gpt-oss/mlperf/server_sut.py | 116 +++++++++++++++++++------ language/gpt-oss/mlperf/user.conf | 4 +- language/gpt-oss/run_mlperf.py | 93 ++++++++++++-------- 6 files changed, 204 insertions(+), 72 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index b6845d259c..4542acf654 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -6,6 +6,14 @@ This is the reference implementation for GPT-OSS-120B. 
This is a proposal and is * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) * Dataset: Please request access at [this link](https://drive.google.com/drive/folders/1DCfEXHqe69okrqKbSyV-8VUw413JqpPY?usp=drive_link) - **this is a tentative dataset** +Verify the dataset contents by computing the `sha1sum`: +```bash +$ sha1sum gptoss-*-eval.pkl +35228fcf5581b916e70920748baf2c016ea2c06b gptoss-acc-eval.pkl +ddec911ad479fc4b30ef1c050c9dea63134c090e gptoss-perf-eval.pkl + +``` + ## Environment setup Work on reference implementation is done using the sglang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. diff --git a/language/gpt-oss/mlperf/base_sut.py b/language/gpt-oss/mlperf/base_sut.py index cb87f03c1f..f96af57e16 100644 --- a/language/gpt-oss/mlperf/base_sut.py +++ b/language/gpt-oss/mlperf/base_sut.py @@ -3,6 +3,7 @@ import abc import logging +import threading from typing import List, Dict, Any, Optional import mlperf_loadgen as lg @@ -34,6 +35,10 @@ def __init__( self.sut = None self.results = {} self.progress_bar = progress_bar + + # Graceful shutdown support (set on KeyboardInterrupt) + self.should_stop = threading.Event() + logger.info(f"Initializing {self.name}") @abc.abstractmethod @@ -68,7 +73,18 @@ def start(self) -> lg.ConstructSUT: return self.sut def stop(self) -> None: - """Stop the SUT and clean up resources.""" + """Stop the SUT and clean up resources. + + Signals graceful shutdown and allows subclasses to cancel pending work. + """ + logger.info(f"Stopping {self.name}...") + + # Signal all workers/tasks to stop + self.should_stop.set() + + # Subclasses should override to add their own cleanup + # (e.g., cancel tasks, clear queues) + if self.sut: lg.DestroySUT(self.sut) self.sut = None diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index c518293117..53f436ea4c 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -5,6 +5,7 @@ import numpy as np import time from typing import List, Dict, Any +from concurrent.futures import ThreadPoolExecutor, as_completed import mlperf_loadgen as lg from tqdm import tqdm from .base_sut import BaseSUT @@ -88,10 +89,13 @@ def flush_queries(self) -> None: f"Flushing {len(self.pending_queries)} queries with max_concurrency={self.max_concurrency}") start_time = time.time() - from concurrent.futures import ThreadPoolExecutor, as_completed - def process_single_query(query_sample): """Process a single query (backend batches automatically via continuous batching).""" + # Check if we should stop (e.g., KeyboardInterrupt) + if self.should_stop.is_set(): + logger.info(f"Skipping query {query_sample.id} due to shutdown") + return None, None, None + query_id = query_sample.id input_ids = self.dataset[query_sample.index] @@ -121,9 +125,24 @@ def process_single_query(query_sample): # Process results as they complete completed_count = 0 + cancelled_count = 0 + for future in as_completed(futures): + # Check if shutdown was requested + if self.should_stop.is_set(): + logger.info("Shutdown requested, cancelling remaining futures...") + for f in futures: + f.cancel() + cancelled_count = sum(1 for f in futures if f.cancelled()) + logger.info(f"Cancelled 
{cancelled_count} pending futures") + break try: query_id, query_sample, response = future.result() + + # Skip if query was cancelled/skipped + if query_id is None: + continue + output_ids = response.get("output_ids", []) # Store results @@ -178,10 +197,16 @@ def process_single_query(query_sample): f"Error processing query: {e}", exc_info=True) elapsed = time.time() - start_time - logger.info( - f"Completed {len(self.pending_queries)} queries in {elapsed:.2f}s " - f"({len(self.pending_queries)/elapsed:.2f} QPS)" - ) + if cancelled_count > 0: + logger.info( + f"Completed {completed_count} queries, cancelled {cancelled_count} queries " + f"in {elapsed:.2f}s" + ) + else: + logger.info( + f"Completed {len(self.pending_queries)} queries in {elapsed:.2f}s " + f"({len(self.pending_queries)/elapsed:.2f} QPS)" + ) except Exception as e: logger.error(f"Error during concurrent flush: {e}", exc_info=True) diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py index 024c57e1b4..231dd89acd 100644 --- a/language/gpt-oss/mlperf/server_sut.py +++ b/language/gpt-oss/mlperf/server_sut.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from typing import List, Dict, Any, Optional import mlperf_loadgen as lg +from tqdm import tqdm from .base_sut import BaseSUT @@ -73,9 +74,12 @@ def __init__( self.active_streams: Dict[int, StreamingQueryState] = {} self.active_streams_lock = threading.Lock() + # Track active async tasks for cancellation on KeyboardInterrupt + self.active_tasks = set() + self.active_tasks_lock = threading.Lock() + # Worker threads self.workers = [] - self.should_stop = threading.Event() # Progress tracking self.queries_completed = 0 @@ -144,20 +148,38 @@ def _worker_thread(self): "Worker thread interrupted, exiting gracefully...") break - # Schedule async streaming processing + # Schedule async streaming processing and track task if self.loop and not self.should_stop.is_set(): - future = asyncio.run_coroutine_threadsafe( - self._process_streaming_query(query_sample), - self.loop - ) + # Create the coroutine + coro = self._process_streaming_query_tracked(query_sample) + # Schedule it on the event loop + future = asyncio.run_coroutine_threadsafe(coro, self.loop) # Don't wait for completion - it happens asynchronously except Exception as e: logger.error(f"Worker thread error: {e}", exc_info=True) + async def _process_streaming_query_tracked(self, query_sample: lg.QuerySample): + """Wrapper that tracks the async task for cancellation.""" + task = asyncio.current_task() + + # Add to active tasks + with self.active_tasks_lock: + self.active_tasks.add(task) + + try: + await self._process_streaming_query(query_sample) + finally: + # Remove from active tasks + with self.active_tasks_lock: + self.active_tasks.discard(task) + async def _process_streaming_query(self, query_sample: lg.QuerySample): """Process a single query with streaming support. + Token reporting to LoadGen: + 1. When first token arrives → lg.FirstTokenComplete([token_0]) + 2. 
When generation finishes → lg.QuerySamplesComplete([token_1, token_2, ..., token_n]) Args: query_sample: MLPerf LoadGen query sample """ @@ -214,6 +236,11 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): state.finished = True await self._send_final_response(state) + except asyncio.CancelledError: + # Task was cancelled (e.g., KeyboardInterrupt during graceful shutdown) + logger.info(f"Streaming query {query_id} cancelled during shutdown") + # Don't send response to LoadGen - we're shutting down + raise # Re-raise to mark task as cancelled except Exception as e: logger.error( f"Error processing streaming query {query_id}: {e}", @@ -229,18 +256,23 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): self.active_streams.pop(query_id, None) async def _send_first_token_complete(self, state: StreamingQueryState): - """Send FirstTokenComplete to LoadGen for TTFT measurement.""" + """Send FirstTokenComplete to LoadGen for TTFT measurement. + + Only sends the first token for TTFT measurement. + """ try: logger.debug( f"First token for query {state.query_id} at {state.first_token_time - state.start_time:.3f}s") - # Convert tokens to numpy array - if state.accumulated_tokens: - token_array = np.ascontiguousarray( - state.accumulated_tokens, dtype=np.int32) + # LoadGen uses this to measure Time To First Token (TTFT) + if state.accumulated_tokens and len(state.accumulated_tokens) > 0: + # Extract only the first token + first_token_only = [state.accumulated_tokens[0]] + token_array = np.ascontiguousarray(first_token_only, dtype=np.int32) else: - # Need at least an empty array + # No tokens yet - this shouldn't happen but handle gracefully token_array = np.array([], dtype=np.int32) + logger.warning(f"FirstTokenComplete called but no tokens accumulated for query {state.query_id}") # Create response response = lg.QuerySampleResponse( @@ -252,6 +284,7 @@ async def _send_first_token_complete(self, state: StreamingQueryState): # Report to LoadGen lg.FirstTokenComplete([response]) + logger.debug(f"Sent FirstTokenComplete for query {state.query_id}: 1 token") except Exception as e: logger.error( @@ -259,12 +292,14 @@ async def _send_first_token_complete(self, state: StreamingQueryState): exc_info=True) async def _send_final_response(self, state: StreamingQueryState): - """Send final QuerySamplesComplete to LoadGen.""" + """Send final QuerySamplesComplete to LoadGen. 
(send all tokens except the first one) + """ try: + num_total_tokens = len(state.accumulated_tokens) logger.debug( - f"Final response for query {state.query_id}, {len(state.accumulated_tokens)} tokens") + f"Final response for query {state.query_id}: {num_total_tokens} total tokens") - # Store results + # Store results (all tokens for internal tracking) self.results[state.query_id] = { "output_ids": state.accumulated_tokens, "output_text": state.accumulated_text, @@ -274,10 +309,9 @@ async def _send_final_response(self, state: StreamingQueryState): } } - # Convert tokens to numpy array - if state.accumulated_tokens: - token_array = np.ascontiguousarray( - state.accumulated_tokens, dtype=np.int32) + if state.accumulated_tokens and len(state.accumulated_tokens) > 1: + remaining_tokens = state.accumulated_tokens[1:] + token_array = np.ascontiguousarray(remaining_tokens, dtype=np.int32) else: token_array = np.array([], dtype=np.int32) @@ -291,6 +325,10 @@ async def _send_final_response(self, state: StreamingQueryState): # Report to LoadGen lg.QuerySamplesComplete([response]) + logger.debug( + f"Sent QuerySamplesComplete for query {state.query_id}: " + f"{len(token_array)} remaining tokens (total: {num_total_tokens})" + ) # Update progress bar (force refresh for async updates) if self.progress_bar is not None: @@ -348,15 +386,39 @@ def stop(self) -> None: logger.info(f"{self.name} already stopping or stopped.") return - logger.info(f"Stopping {self.name}...") - self.should_stop.set() + super().stop() - # Wait for workers - for i, worker in enumerate(self.workers): - logger.info(f"Waiting for worker {i+1}/{len(self.workers)}...") - worker.join(timeout=5) - if worker.is_alive(): - logger.warning(f"Worker {i+1} did not terminate gracefully") + # Cancel all active streaming tasks + logger.info("Cancelling active streaming tasks...") + tasks_to_cancel = [] + with self.active_tasks_lock: + tasks_to_cancel = list(self.active_tasks) + + if tasks_to_cancel: + logger.info(f"Cancelling {len(tasks_to_cancel)} active tasks") + for task in tasks_to_cancel: + if not task.done(): + task.cancel() + + # Clear pending queries from queue + pending_count = 0 + try: + while True: + self.query_queue.get_nowait() + pending_count += 1 + except queue.Empty: + pass + + if pending_count > 0: + logger.info(f"Cleared {pending_count} pending queries from queue") + + # Wait for workers with progress bar + with tqdm(total=len(self.workers), desc="Stopping workers", unit="worker") as pbar: + for i, worker in enumerate(self.workers): + worker.join(timeout=5) + if worker.is_alive(): + logger.warning(f"Worker {i+1} did not terminate gracefully") + pbar.update(1) # Stop event loop if self.loop: diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf index a836241115..a3b21cad1e 100644 --- a/language/gpt-oss/mlperf/user.conf +++ b/language/gpt-oss/mlperf/user.conf @@ -1,10 +1,12 @@ gpt-oss-120b.Offline.target_qps = 10.0 gpt-oss-120b.Offline.min_duration = 60000 gpt-oss-120b.Offline.min_query_count = 8036 +# gpt-oss-120b.Offline.min_query_count = 1724 -gpt-oss-120b.Server.target_qps = 7.0 +gpt-oss-120b.Server.target_qps = 10 gpt-oss-120b.Server.min_duration = 60000 gpt-oss-120b.Server.min_query_count = 8036 +# gpt-oss-120b.Server.min_query_count = 1724 gpt-oss-120b.Server.target_latency = 0 gpt-oss-120b.Server.ttft_latency = 2000 diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index bc83b32009..91057b9f22 100755 --- a/language/gpt-oss/run_mlperf.py +++ 
b/language/gpt-oss/run_mlperf.py @@ -16,12 +16,13 @@ """ import argparse +import json import logging import os import sys import threading from pathlib import Path -from typing import Optional +from typing import Optional, Dict, Any import mlperf_loadgen as lg import pandas as pd @@ -42,6 +43,27 @@ logger = logging.getLogger(__name__) +def load_generation_config(config_path: str) -> Dict[str, Any]: + """Load generation configuration from JSON file. + + Args: + config_path: Path to generation_config.json + + Returns: + Dictionary with generation parameters + """ + logger.info(f"Loading generation config from {config_path}") + + with open(config_path, 'r') as f: + config = json.load(f) + + # Filter out comment fields (starting with _) + gen_params = {k: v for k, v in config.items() if not k.startswith('_')} + + logger.info(f"Generation config loaded: {gen_params}") + return gen_params + + def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser for MLPerf runner.""" parser = argparse.ArgumentParser( @@ -117,33 +139,12 @@ def create_argument_parser() -> argparse.ArgumentParser: help="Server URL for backend (SGLang)" ) - # Generation parameters + # Generation configuration parser.add_argument( - "--max-tokens", - type=int, - default=32768, - help="Maximum tokens to generate" - ) - - parser.add_argument( - "--temperature", - type=float, - default=0.001, - help="Sampling temperature" - ) - - parser.add_argument( - "--top-k", - type=int, - default=1, - help="Top-k sampling parameter" - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="Top-p sampling parameter" + "--generation-config", + type=str, + default="generation_config.json", + help="Path to generation configuration JSON file" ) # Server scenario specific @@ -203,15 +204,17 @@ def configure_loadgen( settings.mode = lg.TestMode.PerformanceOnly # Load configurations if files exist + # conf_type: 2 = mlperf.conf, 1 = user.conf + # LoadGen tracks config calls and only allows one user.conf for official submissions if mlperf_conf and Path(mlperf_conf).exists(): logger.info(f"Loading MLPerf config from {mlperf_conf}") - settings.FromConfig(mlperf_conf, model_name, scenario.capitalize()) + settings.FromConfig(mlperf_conf, model_name, scenario.capitalize(), 2) else: logger.warning(f"MLPerf config not found: {mlperf_conf}") if user_conf and Path(user_conf).exists(): logger.info(f"Loading user config from {user_conf}") - settings.FromConfig(user_conf, model_name, scenario.capitalize()) + settings.FromConfig(user_conf, model_name, scenario.capitalize(), 1) else: logger.warning(f"User config not found: {user_conf}") @@ -313,6 +316,22 @@ def do_cleanup(): logger.info(f"Loaded {len(prompts)} prompts from dataset") + # Load generation configuration + logger.info("Loading generation configuration...") + gen_config = load_generation_config(args.generation_config) + + # Extract generation parameters with defaults + max_tokens = gen_config.get('max_new_tokens', 10240) + temperature = gen_config.get('temperature', 1.0) + top_k = gen_config.get('top_k', -1) + top_p = gen_config.get('top_p', 1.0) + + logger.info("Generation parameters:") + logger.info(f" max_new_tokens: {max_tokens}") + logger.info(f" temperature: {temperature}") + logger.info(f" top_k: {top_k}") + logger.info(f" top_p: {top_p}") + # Initialize backend logger.info(f"Initializing {args.backend} backend...") if args.backend == "sglang": @@ -352,10 +371,10 @@ def do_cleanup(): sut = OfflineSUT( backend=backend, dataset=prompts, - 
max_tokens=args.max_tokens, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, + max_tokens=max_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, name=f"gpt-oss-120b_offline_sut", progress_bar=pbar, max_concurrency=args.max_concurrency @@ -364,10 +383,10 @@ def do_cleanup(): sut = ServerSUT( backend=backend, dataset=prompts, - max_tokens=args.max_tokens, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, + max_tokens=max_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, num_workers=args.num_workers, name=f"gpt-oss-120b_server_sut", progress_bar=pbar From 382fc9ed7d742d80f2353673d7b3de8a9cc3b29c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 07:03:26 +0000 Subject: [PATCH 124/165] add accuracy eval script for mlperf --- language/gpt-oss/eval_mlperf_accuracy.py | 578 +++++++++++++++++++++++ 1 file changed, 578 insertions(+) create mode 100644 language/gpt-oss/eval_mlperf_accuracy.py diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py new file mode 100644 index 0000000000..eeadcde15d --- /dev/null +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python3 +""" +Evaluate MLPerf accuracy logs for gpt-oss-120b. + +This script takes MLPerf accuracy JSON logs and a reference pickle file, +evaluates the outputs, and generates accuracy scores by dataset and overall. + +Usage: + python eval_mlperf_accuracy.py \ + --mlperf-log mlperf_logs_offline_x8_acc/offline/accuracy/mlperf_log_accuracy.json \ + --reference-data data/accuracy_eval_tokenized_filtered.pkl \ + --output-file accuracy_results.json +""" + +import argparse +import json +import logging +import pickle +import struct +import multiprocessing +import os +from pathlib import Path +from typing import Dict, Any, List, Tuple +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError + +import pandas as pd +from transformers import AutoTokenizer +from tqdm import tqdm + +# Import evaluation functions from the existing script +import sys +sys.path.insert(0, str(Path(__file__).parent)) +from eval_accuracy import ( + get_evaluator, validate_dataset_name, validate_text_input, DATASET_EVALUATORS, + evaluate_livecodebench_worker, load_lcb_benchmark +) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def load_mlperf_log(log_path: str) -> List[Dict[str, Any]]: + """Load MLPerf accuracy JSON log. + + Args: + log_path: Path to mlperf_log_accuracy.json + + Returns: + List of log entries with seq_id, qsl_idx, data (hex), token_count + """ + logger.info(f"Loading MLPerf log from {log_path}") + with open(log_path, 'r') as f: + log_data = json.load(f) + + logger.info(f"Loaded {len(log_data)} log entries") + return log_data + + +def decode_hex_to_tokens(hex_data: str) -> List[int]: + """Decode hex string to list of token IDs (int32). + + MLPerf stores token IDs as hex-encoded int32 array. + + Args: + hex_data: Hex string like "450D0300..." + + Returns: + List of token IDs + """ + # Convert hex string to bytes + data_bytes = bytes.fromhex(hex_data) + + # Unpack as int32 array (little-endian) + num_tokens = len(data_bytes) // 4 + token_ids = struct.unpack(f'<{num_tokens}i', data_bytes) + + return list(token_ids) + + +def detokenize(token_ids: List[int], tokenizer) -> str: + """Convert token IDs to text. 
+ + Args: + token_ids: List of integer token IDs + tokenizer: HuggingFace tokenizer + + Returns: + Decoded text string + """ + return tokenizer.decode(token_ids, skip_special_tokens=False) + + +def process_livecodebench_batch( + entries: List[Dict[str, Any]], + reference_df: pd.DataFrame, + tokenizer, + evaluator: Dict[str, Any], + lcb_executor: ProcessPoolExecutor, + dataset_name: str, + args +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """Process a batch of LiveCodeBench entries in parallel. + + Args: + entries: List of MLPerf log entries for this dataset + reference_df: Reference DataFrame + tokenizer: HuggingFace tokenizer + evaluator: Evaluator functions dict + lcb_executor: ProcessPoolExecutor for parallel evaluation + dataset_name: Dataset name + args: Command line arguments + + Returns: + Tuple of (results_list, outputs_list) + """ + # First pass: decode and parse all entries + work_items = [] + entry_metadata = [] # Store (entry, qsl_idx, ref_row, token_ids, model_output) + + logger.info(f"Parsing {len(entries)} {dataset_name} samples...") + for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="sample"): + seq_id = entry['seq_id'] + qsl_idx = entry['qsl_idx'] + hex_data = entry['data'] + + ref_row = reference_df.iloc[qsl_idx] + ground_truth = ref_row.get('ground_truth', None) + + # Decode tokens to text + token_ids = decode_hex_to_tokens(hex_data) + model_output = detokenize(token_ids, tokenizer) + + # Parse code from model output + extracted_code = evaluator['parse'](model_output) + + entry_metadata.append({ + 'entry': entry, + 'qsl_idx': qsl_idx, + 'ref_row': ref_row, + 'token_ids': token_ids, + 'model_output': model_output, + 'extracted_code': extracted_code, + 'ground_truth': ground_truth + }) + + # Add to work queue if code was extracted + if extracted_code is not None and not pd.isna(ground_truth): + work_items.append((extracted_code, ground_truth)) + else: + work_items.append(None) # Placeholder for skipped items + + # Second pass: batch evaluate code in parallel + logger.info(f"Evaluating {len([w for w in work_items if w is not None])} {dataset_name} code samples with parallel workers...") + + results_list = [] + outputs_list = [] + + # Submit all work items + future_to_idx = {} + for idx, work_item in enumerate(work_items): + if work_item is not None: + future = lcb_executor.submit(evaluate_livecodebench_worker, work_item) + future_to_idx[future] = idx + + # Collect results with progress bar + eval_results = [None] * len(work_items) + + for future in tqdm(as_completed(future_to_idx.keys(), timeout=1200), + total=len(future_to_idx), + desc=f"Evaluating {dataset_name}", + unit="sample"): + idx = future_to_idx[future] + try: + question_id, is_correct, detailed_reason = future.result(timeout=80) + eval_results[idx] = (is_correct, detailed_reason) + except TimeoutError: + logger.warning(f"Timeout evaluating sample {idx}: Test execution exceeded 80s timeout") + eval_results[idx] = (False, "Timeout: Test execution exceeded time limit") + except Exception as e: + logger.error(f"Error evaluating sample {idx}: {e}") + eval_results[idx] = (False, f"Error: {e}") + + # Third pass: compile final results + for idx, metadata in enumerate(entry_metadata): + entry = metadata['entry'] + qsl_idx = metadata['qsl_idx'] + token_ids = metadata['token_ids'] + model_output = metadata['model_output'] + extracted_code = metadata['extracted_code'] + ground_truth = metadata['ground_truth'] + + # Get evaluation result + if extracted_code is None or pd.isna(ground_truth): + 
is_correct = False + eval_details = "No code extracted from model output" if extracted_code is None else "No ground truth available" + else: + is_correct, eval_details = eval_results[idx] + + # Record result + result = { + 'seq_id': entry['seq_id'], + 'qsl_idx': qsl_idx, + 'dataset': dataset_name, + 'is_correct': is_correct, + 'extracted_answer': str(extracted_code)[:200] if extracted_code is not None else None, + 'ground_truth': str(ground_truth) if not pd.isna(ground_truth) else None, + 'evaluation_details': eval_details, + 'token_count': len(token_ids), + 'model_output_preview': model_output[:200] if args.verbose else None + } + results_list.append(result) + + # Store output data if requested + if args.save_outputs: + output_record = { + 'qsl_idx': qsl_idx, + 'seq_id': entry['seq_id'], + 'dataset': dataset_name, + 'ground_truth': ground_truth, + 'model_output': model_output, + 'output_token_ids': token_ids, + 'extracted_answer': extracted_code, + 'is_correct': is_correct, + 'evaluation_details': eval_details + } + outputs_list.append(output_record) + + return results_list, outputs_list + + +def evaluate_single_entry( + model_output: str, + ground_truth: str, + dataset_name: str +) -> Tuple[bool, Any, str]: + """Evaluate a single model output. + + Args: + model_output: Generated text from model + ground_truth: Expected answer + dataset_name: Dataset name (e.g., 'gpqa', 'math500') + + Returns: + Tuple of (is_correct, extracted_answer, evaluation_details) + """ + evaluator = get_evaluator(dataset_name) + + # Parse answer from model output + extracted = evaluator['parse'](model_output) + + # Evaluate correctness + is_correct = False + evaluation_details = "" + + if extracted is None or pd.isna(extracted): + evaluation_details = "No answer extracted from model output" + else: + if not pd.isna(ground_truth): + try: + is_correct = evaluator['evaluate'](extracted, ground_truth) + if is_correct: + evaluation_details = "Correct" + else: + evaluation_details = f"Incorrect (extracted: {extracted}, ground_truth: {ground_truth})" + except Exception as e: + evaluation_details = f"Evaluation error: {e}" + logger.warning(f"Error evaluating: {e}") + else: + evaluation_details = "No ground truth available" + + return is_correct, extracted, evaluation_details + + +def main(): + parser = argparse.ArgumentParser( + description="Evaluate MLPerf accuracy logs for gpt-oss-120b" + ) + parser.add_argument( + "--mlperf-log", + type=str, + required=True, + help="Path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--reference-data", + type=str, + required=True, + help="Path to reference pickle file (DataFrame with dataset, ground_truth, etc.)" + ) + parser.add_argument( + "--tokenizer", + type=str, + default="openai/gpt-oss-120b", + help="HuggingFace tokenizer name or path" + ) + parser.add_argument( + "--output-file", + type=str, + default=None, + help="Output JSON file for results (optional)" + ) + parser.add_argument( + "--save-outputs", + type=str, + default=None, + help="Save detokenized outputs to pickle file (ordered by qsl_idx) for debugging" + ) + parser.add_argument( + "--num-lcb-workers", + type=int, + default=64, + help="Number of parallel workers for LiveCodeBench evaluation (default: 64)" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Verbose logging" + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Load MLPerf log + mlperf_log = load_mlperf_log(args.mlperf_log) + + # Load reference data + 
logger.info(f"Loading reference data from {args.reference_data}") + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + + logger.info(f"Reference data shape: {reference_df.shape}") + logger.info(f"Reference columns: {list(reference_df.columns)}") + + # Log unique datasets in reference data + if 'dataset' in reference_df.columns: + unique_datasets = reference_df['dataset'].unique() + dataset_counts = reference_df['dataset'].value_counts() + logger.info(f"Unique datasets in reference data ({len(unique_datasets)} total):") + for ds in sorted(unique_datasets): + logger.info(f" '{ds}' ({dataset_counts[ds]} samples)") + + logger.info("\nSample rows from reference data:") + for idx in [0, 1, 2]: + if idx < len(reference_df): + logger.info(f" Row {idx}: dataset='{reference_df.iloc[idx]['dataset']}'") + + # Show how each will be mapped to evaluators + logger.info("\nExpected Dataset → Evaluator mapping:") + for ds in sorted(unique_datasets): + try: + ds_lower = validate_dataset_name(ds) + # Find which evaluator key matches + matched_key = None + for key in DATASET_EVALUATORS.keys(): + if key in ds_lower: + matched_key = key + break + logger.info(f" '{ds}' (normalized: '{ds_lower}') → '{matched_key}'") + except Exception as e: + logger.warning(f" '{ds}' → ERROR: {e}") + + # Load tokenizer + logger.info(f"Loading tokenizer: {args.tokenizer}") + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + + # Group MLPerf log entries by dataset + logger.info("Grouping MLPerf log entries by dataset...") + dataset_entries = defaultdict(list) + + for entry in mlperf_log: + qsl_idx = entry['qsl_idx'] + if qsl_idx >= len(reference_df): + logger.warning(f"qsl_idx {qsl_idx} out of range (max: {len(reference_df)-1})") + continue + + ref_row = reference_df.iloc[qsl_idx] + dataset_name = validate_dataset_name(ref_row['dataset']) + dataset_entries[dataset_name].append(entry) + + logger.info(f"Grouped entries by dataset:") + for ds_name, entries in sorted(dataset_entries.items()): + logger.info(f" {ds_name}: {len(entries)} samples") + + # Pre-load LiveCodeBench benchmark if needed + lcb_executor = None + if any('livecodebench' in ds for ds in dataset_entries.keys()): + try: + logger.info("Pre-loading LiveCodeBench benchmark for parallel evaluation...") + os.environ['TQDM_DISABLE'] = '1' # Disable tqdm in workers + _ = load_lcb_benchmark() + logger.info("LiveCodeBench benchmark loaded successfully") + + # Create shared ProcessPoolExecutor for all LCB evaluations + max_workers = min(multiprocessing.cpu_count(), args.num_lcb_workers) + lcb_executor = ProcessPoolExecutor(max_workers=max_workers) + logger.info(f"Created ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") + except Exception as e: + logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}") + logger.warning("LiveCodeBench evaluation may be slower") + + # Process each dataset separately with its own progress bar + logger.info("\nProcessing MLPerf log entries by dataset...") + + results = [] + dataset_stats = defaultdict(lambda: {"correct": 0, "total": 0}) + outputs_data = [] # For saving detokenized outputs + + try: + for dataset_name in sorted(dataset_entries.keys()): + entries = dataset_entries[dataset_name] + logger.info(f"\n{'=' * 80}") + logger.info(f"Processing {dataset_name}: {len(entries)} samples") + logger.info(f"{'=' * 80}") + + evaluator = get_evaluator(dataset_name) + is_livecodebench = 'livecodebench' in dataset_name.lower() + + if is_livecodebench and lcb_executor is not None: + # Batched 
LiveCodeBench evaluation + results_batch, outputs_batch = process_livecodebench_batch( + entries, reference_df, tokenizer, evaluator, + lcb_executor, dataset_name, args + ) + results.extend(results_batch) + if args.save_outputs: + outputs_data.extend(outputs_batch) + + # Update stats + for res in results_batch: + dataset_stats[dataset_name]["total"] += 1 + if res['is_correct']: + dataset_stats[dataset_name]["correct"] += 1 + else: + # Sequential evaluation for non-LCB datasets + for entry in tqdm(entries, desc=f"Evaluating {dataset_name}", unit="sample"): + seq_id = entry['seq_id'] + qsl_idx = entry['qsl_idx'] + hex_data = entry['data'] + + ref_row = reference_df.iloc[qsl_idx] + ground_truth = ref_row.get('ground_truth', None) + + # Decode tokens to text + token_ids = decode_hex_to_tokens(hex_data) + model_output = detokenize(token_ids, tokenizer) + + # Evaluate + try: + is_correct, extracted, eval_details = evaluate_single_entry( + model_output, ground_truth, dataset_name + ) + except Exception as e: + logger.warning(f"Evaluation error for qsl_idx={qsl_idx}, dataset={dataset_name}: {e}") + is_correct = False + extracted = None + eval_details = f"Evaluation error: {e}" + + # Record result + result = { + 'seq_id': seq_id, + 'qsl_idx': qsl_idx, + 'dataset': dataset_name, + 'is_correct': is_correct, + 'extracted_answer': str(extracted) if extracted is not None else None, + 'ground_truth': str(ground_truth) if not pd.isna(ground_truth) else None, + 'evaluation_details': eval_details, + 'token_count': len(token_ids), + 'model_output_preview': model_output[:200] if args.verbose else None + } + results.append(result) + + # Store output data for pickle export + if args.save_outputs: + output_record = { + 'qsl_idx': qsl_idx, + 'seq_id': seq_id, + 'dataset': dataset_name, + 'ground_truth': ground_truth, + 'model_output': model_output, + 'output_token_ids': token_ids, + 'extracted_answer': extracted, + 'is_correct': is_correct, + 'evaluation_details': eval_details + } + outputs_data.append(output_record) + + # Update stats + dataset_stats[dataset_name]["total"] += 1 + if is_correct: + dataset_stats[dataset_name]["correct"] += 1 + + finally: + # Clean up LiveCodeBench executor + if lcb_executor is not None: + logger.info("Shutting down LiveCodeBench ProcessPoolExecutor") + lcb_executor.shutdown(wait=True) + os.environ.pop('TQDM_DISABLE', None) + + # Calculate overall stats + total_correct = sum(stats["correct"] for stats in dataset_stats.values()) + total_samples = sum(stats["total"] for stats in dataset_stats.values()) + overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 + + # Print results + print("\n" + "=" * 80) + print("MLPerf Accuracy Evaluation Results") + print("=" * 80) + print(f"Total samples: {total_samples}") + print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print("=" * 80) + print("\nPer-Dataset Breakdown:") + print("-" * 80) + + for dataset_name in sorted(dataset_stats.keys()): + stats = dataset_stats[dataset_name] + accuracy = (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0 + print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + + print("=" * 80) + + # Save detokenized outputs to pickle if requested + if args.save_outputs: + logger.info(f"Saving detokenized outputs to {args.save_outputs}...") + + # Sort by qsl_idx for ordered output + outputs_data_sorted = sorted(outputs_data, key=lambda x: x['qsl_idx']) + + # Convert to DataFrame for easier 
inspection + outputs_df = pd.DataFrame(outputs_data_sorted) + + output_path = Path(args.save_outputs) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'wb') as f: + pickle.dump(outputs_df, f) + + logger.info(f"Saved {len(outputs_df)} detokenized outputs (ordered by qsl_idx) to: {output_path}") + logger.info(f"Columns: {list(outputs_df.columns)}") + + # Save detailed results if requested + if args.output_file: + output_data = { + "summary": { + "total_samples": total_samples, + "total_correct": total_correct, + "overall_accuracy": overall_accuracy, + "per_dataset": { + dataset: { + "correct": stats["correct"], + "total": stats["total"], + "accuracy": (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0 + } + for dataset, stats in dataset_stats.items() + } + }, + "detailed_results": results if args.verbose else None + } + + output_path = Path(args.output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + json.dump(output_data, f, indent=2) + + logger.info(f"Results saved to: {output_path}") + + logger.info("Evaluation complete!") + + +if __name__ == "__main__": + main() + From 31f435a2c8e97759598851094f5882aba63f1c32 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 08:06:13 +0000 Subject: [PATCH 125/165] finishing touches --- language/gpt-oss/README.md | 43 +++++++++++++++++++++++---- language/gpt-oss/mlperf/server_sut.py | 2 ++ language/gpt-oss/run_mlperf.py | 38 ++++++++++++++--------- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 4542acf654..6c52ccd74f 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -39,6 +39,11 @@ Then, run a benchmark script that uses the client to send/recv requests. ### Run the inference ```bash python3 run_mlperf.py --help +usage: run_mlperf.py [-h] [--mode {offline,server}] --input-file INPUT_FILE [--max-samples MAX_SAMPLES] [--mlperf-conf MLPERF_CONF] + [--user-conf USER_CONF] [--accuracy] [--output-dir OUTPUT_DIR] [--backend {sglang}] [--server-url SERVER_URL] + [--generation-config GENERATION_CONFIG] [--max-new-tokens MAX_NEW_TOKENS] [--num-workers NUM_WORKERS] + [--max-concurrency MAX_CONCURRENCY] + Run MLPerf inference benchmarks for gpt-oss options: @@ -59,14 +64,40 @@ options: --backend {sglang} Backend to use for inference --server-url SERVER_URL Server URL for backend (SGLang) - --max-tokens MAX_TOKENS - Maximum tokens to generate - --temperature TEMPERATURE - Sampling temperature - --top-k TOP_K Top-k sampling parameter - --top-p TOP_P Top-p sampling parameter + --generation-config GENERATION_CONFIG + Path to generation configuration JSON file + --max-new-tokens MAX_NEW_TOKENS + Override max_new_tokens from generation config (default: use value from config) --num-workers NUM_WORKERS Number of worker threads (for server scenario) --max-concurrency MAX_CONCURRENCY Maximum concurrent requests to backend (SGLang handles batching internally) + ``` + +### Evaluate the accuracy +Run `run_mlperf.py` with `--accuracy`, and then use the generated `mlperf_log_accuracy.json` to evaluate the accuracy of the run. Usage is as below. 
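For example, scoring an offline accuracy run might look like the following (the log directory and data paths are illustrative and depend on where your run wrote its output):

```bash
python3 eval_mlperf_accuracy.py \
    --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \
    --reference-data data/accuracy_eval_tokenized_filtered.pkl \
    --output-file accuracy_results.json \
    --num-lcb-workers 64
```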
+```bash +python3 eval_mlperf_accuracy.py --help +usage: eval_mlperf_accuracy.py [-h] --mlperf-log MLPERF_LOG --reference-data REFERENCE_DATA [--tokenizer TOKENIZER] [--output-file OUTPUT_FILE] + [--save-outputs SAVE_OUTPUTS] [--num-lcb-workers NUM_LCB_WORKERS] [--verbose] + +Evaluate MLPerf accuracy logs for gpt-oss-120b + +options: + -h, --help show this help message and exit + --mlperf-log MLPERF_LOG + Path to mlperf_log_accuracy.json + --reference-data REFERENCE_DATA + Path to reference pickle file (DataFrame with dataset, ground_truth, etc.) + --tokenizer TOKENIZER + HuggingFace tokenizer name or path + --output-file OUTPUT_FILE + Output JSON file for results (optional) + --save-outputs SAVE_OUTPUTS + Save detokenized outputs to pickle file (ordered by qsl_idx) for debugging + --num-lcb-workers NUM_LCB_WORKERS + Number of parallel workers for LiveCodeBench evaluation (default: 64) + --verbose Verbose logging + +``` \ No newline at end of file diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py index 231dd89acd..d34dcfefb7 100644 --- a/language/gpt-oss/mlperf/server_sut.py +++ b/language/gpt-oss/mlperf/server_sut.py @@ -5,6 +5,7 @@ import logging import numpy as np import queue +import sys import threading import time from dataclasses import dataclass @@ -336,6 +337,7 @@ async def _send_final_response(self, state: StreamingQueryState): self.queries_completed += 1 self.progress_bar.update(1) self.progress_bar.refresh() # Force redraw from async context + sys.stdout.flush() # Force flush for immediate display in async/threaded context except Exception as e: logger.error( diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index 91057b9f22..fea2d59392 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -60,7 +60,6 @@ def load_generation_config(config_path: str) -> Dict[str, Any]: # Filter out comment fields (starting with _) gen_params = {k: v for k, v in config.items() if not k.startswith('_')} - logger.info(f"Generation config loaded: {gen_params}") return gen_params @@ -146,6 +145,13 @@ def create_argument_parser() -> argparse.ArgumentParser: default="generation_config.json", help="Path to generation configuration JSON file" ) + + parser.add_argument( + "--max-new-tokens", + type=int, + default=None, + help="Override max_new_tokens from generation config (default: use value from config)" + ) # Server scenario specific parser.add_argument( @@ -207,13 +213,13 @@ def configure_loadgen( # conf_type: 2 = mlperf.conf, 1 = user.conf # LoadGen tracks config calls and only allows one user.conf for official submissions if mlperf_conf and Path(mlperf_conf).exists(): - logger.info(f"Loading MLPerf config from {mlperf_conf}") + logger.debug(f"Loading MLPerf config from {mlperf_conf}") settings.FromConfig(mlperf_conf, model_name, scenario.capitalize(), 2) else: logger.warning(f"MLPerf config not found: {mlperf_conf}") if user_conf and Path(user_conf).exists(): - logger.info(f"Loading user config from {user_conf}") + logger.debug(f"Loading user config from {user_conf}") settings.FromConfig(user_conf, model_name, scenario.capitalize(), 1) else: logger.warning(f"User config not found: {user_conf}") @@ -304,7 +310,7 @@ def do_cleanup(): logger.info("=" * 80) # Load dataset - logger.info("Loading tokenized dataset...") + logger.debug("Loading tokenized dataset...") with tqdm(total=1, desc="Loading dataset", unit="file") as pbar: dataset_info = load_tokenized_dataset( args.input_file, @@ -321,7 +327,14 @@ def 
do_cleanup(): gen_config = load_generation_config(args.generation_config) # Extract generation parameters with defaults - max_tokens = gen_config.get('max_new_tokens', 10240) + # CLI override takes precedence over config file + if args.max_new_tokens is not None: + max_tokens = args.max_new_tokens + logger.info(f"Using max_new_tokens from CLI override: {max_tokens}") + else: + max_tokens = gen_config.get('max_new_tokens', 10240) + logger.info(f"Using max_new_tokens from config: {max_tokens}") + temperature = gen_config.get('temperature', 1.0) top_k = gen_config.get('top_k', -1) top_p = gen_config.get('top_p', 1.0) @@ -333,7 +346,7 @@ def do_cleanup(): logger.info(f" top_p: {top_p}") # Initialize backend - logger.info(f"Initializing {args.backend} backend...") + logger.debug(f"Initializing {args.backend} backend...") if args.backend == "sglang": # Set pool size to match max_concurrency with small safety margin # This prevents "connection pool is full" warnings @@ -343,7 +356,6 @@ def do_cleanup(): timeout=1200, max_pool_size=pool_size ) - logger.info(f"Backend configured with connection pool size: {pool_size} (from max_concurrency={args.max_concurrency})") else: raise ValueError(f"Unknown backend: {args.backend}") @@ -362,11 +374,12 @@ def do_cleanup(): position=0, mininterval=0.1, smoothing=0.1, - dynamic_ncols=True + dynamic_ncols=True, + file=sys.stdout # Force unbuffered output for async updates ) # Create SUT with progress bar - logger.info(f"Creating {args.mode} SUT...") + logger.debug(f"Creating {args.mode} SUT...") if args.mode == "offline": sut = OfflineSUT( backend=backend, @@ -417,12 +430,9 @@ def do_cleanup(): log_settings.log_output.copy_summary_to_stdout = True log_settings.enable_trace = False - # Start the SUT - logger.info("Starting SUT...") - sut.start() - - # Run test + # Start the SUT and run test logger.info("Running LoadGen test...") + sut.start() lg.StartTestWithLogSettings( sut.sut, qsl.qsl, From 63592a3b6c40de460e684e20b778a96aef0388df Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 21 Nov 2025 08:06:42 +0000 Subject: [PATCH 126/165] [Automated Commit] Format Codebase --- language/gpt-oss/mlperf/server_sut.py | 39 ++++++++++++++++----------- language/gpt-oss/run_mlperf.py | 24 +++++++++-------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss/mlperf/server_sut.py index d34dcfefb7..2f9c83532f 100644 --- a/language/gpt-oss/mlperf/server_sut.py +++ b/language/gpt-oss/mlperf/server_sut.py @@ -160,14 +160,15 @@ def _worker_thread(self): except Exception as e: logger.error(f"Worker thread error: {e}", exc_info=True) - async def _process_streaming_query_tracked(self, query_sample: lg.QuerySample): + async def _process_streaming_query_tracked( + self, query_sample: lg.QuerySample): """Wrapper that tracks the async task for cancellation.""" task = asyncio.current_task() - + # Add to active tasks with self.active_tasks_lock: self.active_tasks.add(task) - + try: await self._process_streaming_query(query_sample) finally: @@ -238,8 +239,10 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): await self._send_final_response(state) except asyncio.CancelledError: - # Task was cancelled (e.g., KeyboardInterrupt during graceful shutdown) - logger.info(f"Streaming query {query_id} cancelled during shutdown") + # Task was cancelled (e.g., KeyboardInterrupt during graceful + # shutdown) + logger.info( + f"Streaming query {query_id} cancelled during shutdown") # Don't send 
response to LoadGen - we're shutting down raise # Re-raise to mark task as cancelled except Exception as e: @@ -258,7 +261,7 @@ async def _process_streaming_query(self, query_sample: lg.QuerySample): async def _send_first_token_complete(self, state: StreamingQueryState): """Send FirstTokenComplete to LoadGen for TTFT measurement. - + Only sends the first token for TTFT measurement. """ try: @@ -269,11 +272,13 @@ async def _send_first_token_complete(self, state: StreamingQueryState): if state.accumulated_tokens and len(state.accumulated_tokens) > 0: # Extract only the first token first_token_only = [state.accumulated_tokens[0]] - token_array = np.ascontiguousarray(first_token_only, dtype=np.int32) + token_array = np.ascontiguousarray( + first_token_only, dtype=np.int32) else: # No tokens yet - this shouldn't happen but handle gracefully token_array = np.array([], dtype=np.int32) - logger.warning(f"FirstTokenComplete called but no tokens accumulated for query {state.query_id}") + logger.warning( + f"FirstTokenComplete called but no tokens accumulated for query {state.query_id}") # Create response response = lg.QuerySampleResponse( @@ -285,7 +290,8 @@ async def _send_first_token_complete(self, state: StreamingQueryState): # Report to LoadGen lg.FirstTokenComplete([response]) - logger.debug(f"Sent FirstTokenComplete for query {state.query_id}: 1 token") + logger.debug( + f"Sent FirstTokenComplete for query {state.query_id}: 1 token") except Exception as e: logger.error( @@ -312,7 +318,8 @@ async def _send_final_response(self, state: StreamingQueryState): if state.accumulated_tokens and len(state.accumulated_tokens) > 1: remaining_tokens = state.accumulated_tokens[1:] - token_array = np.ascontiguousarray(remaining_tokens, dtype=np.int32) + token_array = np.ascontiguousarray( + remaining_tokens, dtype=np.int32) else: token_array = np.array([], dtype=np.int32) @@ -355,7 +362,8 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: # Update progress bar total dynamically as queries arrive if self.progress_bar is not None: with self.progress_lock: - self.progress_bar.total = (self.progress_bar.total or 0) + len(query_samples) + self.progress_bar.total = ( + self.progress_bar.total or 0) + len(query_samples) self.progress_bar.refresh() for qs in query_samples: @@ -395,13 +403,13 @@ def stop(self) -> None: tasks_to_cancel = [] with self.active_tasks_lock: tasks_to_cancel = list(self.active_tasks) - + if tasks_to_cancel: logger.info(f"Cancelling {len(tasks_to_cancel)} active tasks") for task in tasks_to_cancel: if not task.done(): task.cancel() - + # Clear pending queries from queue pending_count = 0 try: @@ -410,7 +418,7 @@ def stop(self) -> None: pending_count += 1 except queue.Empty: pass - + if pending_count > 0: logger.info(f"Cleared {pending_count} pending queries from queue") @@ -419,7 +427,8 @@ def stop(self) -> None: for i, worker in enumerate(self.workers): worker.join(timeout=5) if worker.is_alive(): - logger.warning(f"Worker {i+1} did not terminate gracefully") + logger.warning( + f"Worker {i+1} did not terminate gracefully") pbar.update(1) # Stop event loop diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index fea2d59392..d119b2d771 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -45,21 +45,21 @@ def load_generation_config(config_path: str) -> Dict[str, Any]: """Load generation configuration from JSON file. 
- + Args: config_path: Path to generation_config.json - + Returns: Dictionary with generation parameters """ logger.info(f"Loading generation config from {config_path}") - + with open(config_path, 'r') as f: config = json.load(f) - + # Filter out comment fields (starting with _) gen_params = {k: v for k, v in config.items() if not k.startswith('_')} - + return gen_params @@ -145,7 +145,7 @@ def create_argument_parser() -> argparse.ArgumentParser: default="generation_config.json", help="Path to generation configuration JSON file" ) - + parser.add_argument( "--max-new-tokens", type=int, @@ -211,7 +211,8 @@ def configure_loadgen( # Load configurations if files exist # conf_type: 2 = mlperf.conf, 1 = user.conf - # LoadGen tracks config calls and only allows one user.conf for official submissions + # LoadGen tracks config calls and only allows one user.conf for official + # submissions if mlperf_conf and Path(mlperf_conf).exists(): logger.debug(f"Loading MLPerf config from {mlperf_conf}") settings.FromConfig(mlperf_conf, model_name, scenario.capitalize(), 2) @@ -325,20 +326,21 @@ def do_cleanup(): # Load generation configuration logger.info("Loading generation configuration...") gen_config = load_generation_config(args.generation_config) - + # Extract generation parameters with defaults # CLI override takes precedence over config file if args.max_new_tokens is not None: max_tokens = args.max_new_tokens - logger.info(f"Using max_new_tokens from CLI override: {max_tokens}") + logger.info( + f"Using max_new_tokens from CLI override: {max_tokens}") else: max_tokens = gen_config.get('max_new_tokens', 10240) logger.info(f"Using max_new_tokens from config: {max_tokens}") - + temperature = gen_config.get('temperature', 1.0) top_k = gen_config.get('top_k', -1) top_p = gen_config.get('top_p', 1.0) - + logger.info("Generation parameters:") logger.info(f" max_new_tokens: {max_tokens}") logger.info(f" temperature: {temperature}") From a41f882a14dc6eb3a40be79f283af727ed7beffd Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 21 Nov 2025 20:09:19 +0000 Subject: [PATCH 127/165] refactor mode -> scenario --- language/gpt-oss/README.md | 4 ++-- language/gpt-oss/run_mlperf.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 6c52ccd74f..593721aebd 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -39,7 +39,7 @@ Then, run a benchmark script that uses the client to send/recv requests. 
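For instance, a performance run in the offline scenario against a local SGLang endpoint might be launched as follows (the dataset path and server URL are illustrative):

```bash
python3 run_mlperf.py \
    --scenario offline \
    --input-file data/accuracy_eval_tokenized.pkl \
    --backend sglang \
    --server-url http://localhost:30000 \
    --output-dir mlperf_logs
```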
### Run the inference ```bash python3 run_mlperf.py --help -usage: run_mlperf.py [-h] [--mode {offline,server}] --input-file INPUT_FILE [--max-samples MAX_SAMPLES] [--mlperf-conf MLPERF_CONF] +usage: run_mlperf.py [-h] [--scenario {offline,server}] --input-file INPUT_FILE [--max-samples MAX_SAMPLES] [--mlperf-conf MLPERF_CONF] [--user-conf USER_CONF] [--accuracy] [--output-dir OUTPUT_DIR] [--backend {sglang}] [--server-url SERVER_URL] [--generation-config GENERATION_CONFIG] [--max-new-tokens MAX_NEW_TOKENS] [--num-workers NUM_WORKERS] [--max-concurrency MAX_CONCURRENCY] @@ -48,7 +48,7 @@ Run MLPerf inference benchmarks for gpt-oss options: -h, --help show this help message and exit - --mode {offline,server} + --scenario {offline,server} MLPerf scenario mode --input-file INPUT_FILE Path to tokenized dataset (pickle file) diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index d119b2d771..86e588f2c0 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -6,13 +6,13 @@ Usage: # Offline scenario (performance) - python run_mlperf.py --mode offline --input-file data/accuracy_eval_tokenized.pkl + python run_mlperf.py --scenario offline --input-file data/accuracy_eval_tokenized.pkl # Server scenario (performance) - python run_mlperf.py --mode server --input-file data/accuracy_eval_tokenized.pkl + python run_mlperf.py --scenario server --input-file data/accuracy_eval_tokenized.pkl # Accuracy mode - python run_mlperf.py --mode offline --accuracy --input-file data/accuracy_eval_tokenized.pkl + python run_mlperf.py --scenario offline --accuracy --input-file data/accuracy_eval_tokenized.pkl """ import argparse @@ -71,11 +71,11 @@ def create_argument_parser() -> argparse.ArgumentParser: # Scenario selection parser.add_argument( - "--mode", + "--scenario", type=str, default="offline", choices=["offline", "server"], - help="MLPerf scenario mode" + help="MLPerf scenario (offline or server)" ) # Dataset @@ -296,7 +296,7 @@ def do_cleanup(): try: # Create output directories output_dir = Path(args.output_dir) - log_dir = output_dir / args.mode / \ + log_dir = output_dir / args.scenario / \ ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) @@ -304,7 +304,7 @@ def do_cleanup(): logger.info("MLPerf Inference Benchmark Runner for GPT-OSS") logger.info("=" * 80) logger.info(f"Backend: {args.backend}") - logger.info(f"Mode: {args.mode}") + logger.info(f"Scenario: {args.scenario}") logger.info(f"Accuracy: {args.accuracy}") logger.info(f"Input file: {args.input_file}") logger.info(f"Output directory: {log_dir}") @@ -370,7 +370,7 @@ def do_cleanup(): # - Server: Incremented as queries arrive pbar = tqdm( total=0, # Will be updated dynamically by SUT - desc=f"MLPerf {args.mode}", + desc=f"MLPerf {args.scenario}", unit="query", leave=True, position=0, @@ -381,8 +381,8 @@ def do_cleanup(): ) # Create SUT with progress bar - logger.debug(f"Creating {args.mode} SUT...") - if args.mode == "offline": + logger.debug(f"Creating {args.scenario} SUT...") + if args.scenario == "offline": sut = OfflineSUT( backend=backend, dataset=prompts, @@ -419,7 +419,7 @@ def do_cleanup(): # Configure LoadGen settings = configure_loadgen( - scenario=args.mode, + scenario=args.scenario, accuracy_mode=args.accuracy, mlperf_conf=args.mlperf_conf, user_conf=args.user_conf, From 81f6ca539622103aab46824d4e7bcb6765adabbe Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 26 Nov 2025 07:23:17 +0000 Subject: [PATCH 128/165] add eval_perf script --- 
language/gpt-oss/eval_mlperf_performance.py | 575 ++++++++++++++++++ language/gpt-oss/run_infer_trtllm.py | 642 ++++++++++++++++++++ 2 files changed, 1217 insertions(+) create mode 100755 language/gpt-oss/eval_mlperf_performance.py create mode 100644 language/gpt-oss/run_infer_trtllm.py diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss/eval_mlperf_performance.py new file mode 100755 index 0000000000..46b67c6991 --- /dev/null +++ b/language/gpt-oss/eval_mlperf_performance.py @@ -0,0 +1,575 @@ +#!/usr/bin/env python3 +"""Evaluate MLPerf performance logs and analyze output token lengths. + +This script reads MLPerf accuracy logs (mlperf_log_accuracy.json) and +detokenizes the hex-encoded token IDs to produce human-readable text output. +Optionally includes input prompts and reference data from a pickle file, +and generates histogram plots for token length analysis. + +Usage: + # Basic usage (outputs only) + python eval_mlperf_performance.py \ + --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \ + --output-file detokenized_outputs.json \ + --tokenizer openai/gpt-oss-120b + + # With reference data (includes inputs and metadata) + python eval_mlperf_performance.py \ + --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \ + --output-file detokenized_outputs.json \ + --reference-data data/accuracy_eval_tokenized_filtered.pkl \ + --tokenizer openai/gpt-oss-120b + + # With histogram plots (enables plotting when --plot-dir is specified) + python eval_mlperf_performance.py \ + --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \ + --output-file detokenized_outputs.json \ + --reference-data data/accuracy_eval_tokenized_filtered.pkl \ + --plot-dir plots + +The output JSON format (with reference data): + [ + { + "qsl_idx": 0, + "token_ids": [1, 2, 3, ...], + "text": "detokenized response text", + "num_tokens": 150, + "dataset": "gpqa", + "input_prompt": "Question: ...", + "input_token_ids": [...], + "num_input_tokens": 1024, + "ground_truth": "Answer" + }, + ... + ] +""" + +import argparse +import json +import logging +import pickle +import sys +from pathlib import Path +from typing import List, Dict, Any, Optional + +import pandas as pd +import matplotlib +matplotlib.use('Agg') # Non-interactive backend for server environments +import matplotlib.pyplot as plt +from transformers import AutoTokenizer +from tqdm import tqdm + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Parse MLPerf accuracy JSON and detokenize responses" + ) + + parser.add_argument( + "--mlperf-log", + type=str, + required=True, + help="Path to mlperf_log_accuracy.json file" + ) + + parser.add_argument( + "--output-file", + type=str, + required=True, + help="Path to output JSON file with detokenized responses" + ) + + parser.add_argument( + "--reference-data", + type=str, + default=None, + help="Path to reference pickle file (DataFrame with prompts, dataset, etc.) 
- optional" + ) + + parser.add_argument( + "--tokenizer", + type=str, + default="openai/gpt-oss-120b", + help="Tokenizer to use for detokenization (default: openai/gpt-oss-120b)" + ) + + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print the output JSON with indentation" + ) + + parser.add_argument( + "--plot-dir", + type=str, + default=None, + help="Directory to save histogram plots (enables plotting if specified)" + ) + + return parser.parse_args() + + +def decode_hex_to_tokens(hex_string: str) -> List[int]: + """Decode hex-encoded byte array to list of token IDs. + + MLPerf stores token IDs as hex-encoded bytes where each token is a 4-byte + little-endian integer. + + Args: + hex_string: Hex-encoded string from MLPerf log + + Returns: + List of token IDs + """ + # Remove any whitespace + hex_string = hex_string.strip() + + # Convert hex string to bytes + byte_data = bytes.fromhex(hex_string) + + # Each token is stored as 4 bytes (int32) in little-endian format + token_ids = [] + for i in range(0, len(byte_data), 4): + if i + 4 <= len(byte_data): + # Unpack 4 bytes as little-endian int32 + token_id = int.from_bytes(byte_data[i:i+4], byteorder='little', signed=True) + token_ids.append(token_id) + + return token_ids + + +def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: + """Parse MLPerf accuracy log file. + + Handles multiple formats: + - JSON array: [{"qsl_idx": 0, ...}, ...] + - JSONL: one JSON object per line + - Concatenated JSON: multiple JSON objects on same line + + Args: + log_path: Path to mlperf_log_accuracy.json + + Returns: + List of entries with qsl_idx and hex-encoded data + """ + logger.info(f"Reading MLPerf log: {log_path}") + + entries = [] + + # First try to load as a single JSON array + try: + with open(log_path, 'r') as f: + log_data = json.load(f) + if isinstance(log_data, list): + logger.info(f"Loaded {len(log_data)} entries as JSON array") + return log_data + except json.JSONDecodeError: + pass # Not a valid JSON array, try line-by-line parsing + + # Parse line by line (JSONL or concatenated JSON) + decoder = json.JSONDecoder() + with open(log_path, 'r') as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + + # Try to parse as single JSON object first + try: + entry = json.loads(line) + entries.append(entry) + except json.JSONDecodeError: + # Line might have multiple concatenated JSON objects + # Extract them one by one using raw_decode + remaining = line + parsed_count = 0 + while remaining: + remaining = remaining.lstrip() + if not remaining: + break + try: + obj, end_idx = decoder.raw_decode(remaining) + entries.append(obj) + remaining = remaining[end_idx:] + parsed_count += 1 + except json.JSONDecodeError as e: + if parsed_count == 0: + logger.warning(f"Line {line_num}: Could not parse JSON: {e}") + break + + logger.info(f"Loaded {len(entries)} entries from MLPerf log") + return entries + + +def plot_histograms( + results: List[Dict[str, Any]], + output_dir: str, + has_reference: bool = False +) -> None: + """Generate histogram plots for output token lengths and differences. 
+ + Args: + results: List of parsed results with token lengths + output_dir: Directory to save plots + has_reference: Whether reference data is available for difference plots + """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + logger.info(f"Generating histogram plots in {output_dir}...") + + # Extract output token lengths + output_lengths = [r['num_tokens'] for r in results] + + # Plot 1: Output Sequence Length (OSL) Histogram + plt.figure(figsize=(12, 6)) + plt.hist(output_lengths, bins=50, edgecolor='black', alpha=0.7, color='steelblue') + plt.xlabel('Output Token Length (OSL)', fontsize=12) + plt.ylabel('Frequency', fontsize=12) + plt.title(f'Distribution of Output Token Lengths\n(n={len(output_lengths)}, mean={sum(output_lengths)/len(output_lengths):.1f}, median={sorted(output_lengths)[len(output_lengths)//2]})', fontsize=14) + plt.grid(axis='y', alpha=0.3) + + # Add statistics box + stats_text = f'Min: {min(output_lengths)}\nMax: {max(output_lengths)}\nStd: {(sum((x - sum(output_lengths)/len(output_lengths))**2 for x in output_lengths) / len(output_lengths))**0.5:.1f}' + plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes, + fontsize=10, verticalalignment='top', horizontalalignment='right', + bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5)) + + osl_plot_path = output_path / 'output_token_length_histogram.png' + plt.tight_layout() + plt.savefig(osl_plot_path, dpi=300, bbox_inches='tight') + plt.close() + logger.info(f"✓ Saved OSL histogram: {osl_plot_path}") + + # Plot 2: Token Length Difference Histogram (if reference data available) + if has_reference: + results_with_diff = [r for r in results if 'output_token_len_diff' in r] + if results_with_diff: + differences = [r['output_token_len_diff'] for r in results_with_diff] + + plt.figure(figsize=(12, 6)) + plt.hist(differences, bins=50, edgecolor='black', alpha=0.7, color='coral') + plt.xlabel('Token Length Difference (Actual - Reference)', fontsize=12) + plt.ylabel('Frequency', fontsize=12) + + mean_diff = sum(differences) / len(differences) + median_diff = sorted(differences)[len(differences)//2] + plt.title(f'Distribution of Output Token Length Differences\n(n={len(differences)}, mean={mean_diff:.1f}, median={median_diff})', fontsize=14) + plt.grid(axis='y', alpha=0.3) + plt.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero difference') + + # Add statistics box + longer = sum(1 for d in differences if d > 0) + shorter = sum(1 for d in differences if d < 0) + exact = sum(1 for d in differences if d == 0) + stats_text = f'Min: {min(differences)}\nMax: {max(differences)}\nStd: {(sum((x - mean_diff)**2 for x in differences) / len(differences))**0.5:.1f}\n\nLonger: {longer} ({longer/len(differences)*100:.1f}%)\nShorter: {shorter} ({shorter/len(differences)*100:.1f}%)\nExact: {exact} ({exact/len(differences)*100:.1f}%)' + plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes, + fontsize=9, verticalalignment='top', horizontalalignment='right', + bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5)) + + plt.legend() + + diff_plot_path = output_path / 'token_length_difference_histogram.png' + plt.tight_layout() + plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight') + plt.close() + logger.info(f"✓ Saved difference histogram: {diff_plot_path}") + + # Plot 3: Combined comparison (side by side) + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) + + # Reference vs Actual + ref_lengths = [r['ref_num_output_tokens'] for r in results_with_diff] + 
actual_lengths = [r['actual_num_output_tokens'] for r in results_with_diff] + + ax1.hist([ref_lengths, actual_lengths], bins=50, label=['Reference', 'Actual'], + alpha=0.6, edgecolor='black', color=['steelblue', 'coral']) + ax1.set_xlabel('Output Token Length', fontsize=12) + ax1.set_ylabel('Frequency', fontsize=12) + ax1.set_title(f'Reference vs Actual Output Token Lengths\n(n={len(results_with_diff)})', fontsize=13) + ax1.legend() + ax1.grid(axis='y', alpha=0.3) + + # Scatter plot: Reference vs Actual + ax2.scatter(ref_lengths, actual_lengths, alpha=0.4, s=10, color='purple') + ax2.plot([min(ref_lengths), max(ref_lengths)], [min(ref_lengths), max(ref_lengths)], + 'r--', linewidth=2, label='y=x (perfect match)') + ax2.set_xlabel('Reference Token Length', fontsize=12) + ax2.set_ylabel('Actual Token Length', fontsize=12) + ax2.set_title('Reference vs Actual Token Lengths (Scatter)', fontsize=13) + ax2.legend() + ax2.grid(alpha=0.3) + + comparison_plot_path = output_path / 'token_length_comparison.png' + plt.tight_layout() + plt.savefig(comparison_plot_path, dpi=300, bbox_inches='tight') + plt.close() + logger.info(f"✓ Saved comparison plot: {comparison_plot_path}") + else: + logger.warning("No samples with token length differences found") + + logger.info(f"✓ All plots saved to {output_dir}/") + + +def detokenize_responses( + entries: List[Dict[str, Any]], + tokenizer: Any, + reference_df: Optional[pd.DataFrame] = None +) -> List[Dict[str, Any]]: + """Detokenize responses from MLPerf log entries. + + When reference data is provided, input_prompt is generated by detokenizing + input token IDs from the reference data (checks: tok_input, input_token_ids, + input_tokens, tokenized_input). This shows exactly what was sent to the model + (after tokenization), not the original text prompt. 
+ + Args: + entries: List of MLPerf log entries with hex-encoded token IDs + tokenizer: HuggingFace tokenizer instance + reference_df: Optional reference DataFrame with input prompts and metadata + + Returns: + List of dictionaries with qsl_idx, token_ids, and detokenized text + """ + logger.info("Detokenizing responses...") + + results = [] + for entry in tqdm(entries, desc="Detokenizing", unit="response"): + qsl_idx = entry.get("qsl_idx") + hex_data = entry.get("data", "") + + # Decode hex to token IDs + try: + token_ids = decode_hex_to_tokens(hex_data) + except Exception as e: + logger.error(f"Error decoding tokens for qsl_idx={qsl_idx}: {e}") + token_ids = [] + + # Detokenize to text + try: + text = tokenizer.decode(token_ids, skip_special_tokens=True) + except Exception as e: + logger.error(f"Error detokenizing qsl_idx={qsl_idx}: {e}") + text = "" + + # Build result record + result = { + "qsl_idx": qsl_idx, + "token_ids": token_ids, + "text": text, + "num_tokens": len(token_ids) + } + + # Add reference data if available + if reference_df is not None and qsl_idx < len(reference_df): + ref_row = reference_df.iloc[qsl_idx] + + # Add common fields from reference data + if 'dataset' in ref_row: + result['dataset'] = ref_row['dataset'] + + # Get input token IDs and detokenize to see what was actually sent to the model + # Check multiple possible field names for input tokens + input_token_ids = None + for field in ['tok_input', 'input_token_ids', 'input_tokens', 'tokenized_input']: + if field in ref_row: + input_token_ids = ref_row[field] + break + + if input_token_ids is not None: + result['input_token_ids'] = input_token_ids + if isinstance(input_token_ids, list): + result['num_input_tokens'] = len(input_token_ids) + # Detokenize input tokens to show what was actually sent to the model + try: + result['input_prompt'] = tokenizer.decode(input_token_ids, skip_special_tokens=False) + except Exception as e: + logger.warning(f"Error detokenizing input tokens for qsl_idx={qsl_idx}: {e}") + result['input_prompt'] = None + else: + result['num_input_tokens'] = None + result['input_prompt'] = None + else: + # Fallback to raw prompt field if input token IDs not available + if 'prompt' in ref_row: + result['input_prompt'] = ref_row['prompt'] + elif 'input_text' in ref_row: + result['input_prompt'] = ref_row['input_text'] + elif 'text' in ref_row: + result['input_prompt'] = ref_row['text'] + + if 'ground_truth' in ref_row: + result['ground_truth'] = ref_row['ground_truth'] + + # Compute output token length difference + # Check for reference output token length in various possible field names + ref_output_len = None + for field in ['output_token_ids', 'target_token_ids', 'output_tokens', 'expected_output_token_ids']: + if field in ref_row: + ref_tokens = ref_row[field] + if isinstance(ref_tokens, list): + ref_output_len = len(ref_tokens) + result['ref_output_token_ids'] = ref_tokens + break + elif isinstance(ref_tokens, (int, float)) and not pd.isna(ref_tokens): + ref_output_len = int(ref_tokens) + break + + # Also check for direct length field + if ref_output_len is None: + for field in ['output_len', 'output_length', 'num_output_tokens', 'target_len']: + if field in ref_row and not pd.isna(ref_row[field]): + ref_output_len = int(ref_row[field]) + break + + if ref_output_len is not None: + actual_output_len = len(token_ids) + result['ref_num_output_tokens'] = ref_output_len + result['actual_num_output_tokens'] = actual_output_len + result['output_token_len_diff'] = actual_output_len - ref_output_len + 
result['output_token_len_ratio'] = actual_output_len / ref_output_len if ref_output_len > 0 else None + + # Add any other columns that might be useful + for col in ['question_id', 'difficulty', 'subject', 'category']: + if col in ref_row: + result[col] = ref_row[col] + + results.append(result) + + return results + + +def main(): + """Main function.""" + args = parse_args() + + # Validate input file exists + log_path = Path(args.mlperf_log) + if not log_path.exists(): + logger.error(f"MLPerf log file not found: {args.mlperf_log}") + sys.exit(1) + + logger.info("=" * 80) + logger.info("MLPerf Accuracy Log Parser") + logger.info("=" * 80) + logger.info(f"Input log: {args.mlperf_log}") + logger.info(f"Output file: {args.output_file}") + logger.info(f"Reference data: {args.reference_data if args.reference_data else 'None (outputs only)'}") + logger.info(f"Tokenizer: {args.tokenizer}") + logger.info("=" * 80) + + # Load reference data if provided + reference_df = None + if args.reference_data: + logger.info(f"Loading reference data from {args.reference_data}") + try: + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + logger.info(f"✓ Reference data loaded: {reference_df.shape}") + logger.info(f" Columns: {list(reference_df.columns)}") + except Exception as e: + logger.error(f"Failed to load reference data: {e}") + sys.exit(1) + + # Load tokenizer + logger.info(f"Loading tokenizer: {args.tokenizer}") + try: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + logger.info("✓ Tokenizer loaded successfully") + except Exception as e: + logger.error(f"Failed to load tokenizer: {e}") + sys.exit(1) + + # Parse MLPerf log + try: + entries = parse_mlperf_log(args.mlperf_log) + except Exception as e: + logger.error(f"Failed to parse MLPerf log: {e}") + sys.exit(1) + + if not entries: + logger.error("No entries found in MLPerf log") + sys.exit(1) + + # Detokenize responses + try: + results = detokenize_responses(entries, tokenizer, reference_df) + except Exception as e: + logger.error(f"Failed to detokenize responses: {e}") + sys.exit(1) + + # Write output JSON + logger.info(f"Writing detokenized outputs to: {args.output_file}") + output_path = Path(args.output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + if args.pretty: + json.dump(results, f, indent=2, ensure_ascii=False) + else: + json.dump(results, f, ensure_ascii=False) + + logger.info("=" * 80) + logger.info("✓ Parsing completed successfully") + logger.info("=" * 80) + logger.info(f"Total responses parsed: {len(results)}") + + # Print statistics + total_tokens = sum(r["num_tokens"] for r in results) + avg_tokens = total_tokens / len(results) if results else 0 + logger.info(f"Total output tokens: {total_tokens:,}") + logger.info(f"Average tokens per response: {avg_tokens:.1f}") + + # Print token length difference statistics if reference data was provided + if reference_df is not None: + results_with_diff = [r for r in results if 'output_token_len_diff' in r] + if results_with_diff: + diffs = [r['output_token_len_diff'] for r in results_with_diff] + ratios = [r['output_token_len_ratio'] for r in results_with_diff if r['output_token_len_ratio'] is not None] + + logger.info(f"\nOutput Token Length Analysis ({len(results_with_diff)} samples with reference):") + logger.info(f" Mean difference (actual - ref): {sum(diffs) / len(diffs):.2f} tokens") + logger.info(f" Min difference: {min(diffs)} tokens") + logger.info(f" Max difference: {max(diffs)} tokens") + if 
ratios: + logger.info(f" Mean ratio (actual / ref): {sum(ratios) / len(ratios):.3f}x") + + # Count samples that are longer/shorter + longer = sum(1 for d in diffs if d > 0) + shorter = sum(1 for d in diffs if d < 0) + exact = sum(1 for d in diffs if d == 0) + logger.info(f" Longer than reference: {longer} ({longer/len(diffs)*100:.1f}%)") + logger.info(f" Shorter than reference: {shorter} ({shorter/len(diffs)*100:.1f}%)") + logger.info(f" Exact match: {exact} ({exact/len(diffs)*100:.1f}%)") + + logger.info("=" * 80) + + # Show sample output + if results: + logger.info("Sample output (first entry):") + sample = results[0] + logger.info(f" qsl_idx: {sample['qsl_idx']}") + logger.info(f" num_tokens: {sample['num_tokens']}") + logger.info(f" text preview: {sample['text'][:200]}...") + logger.info("=" * 80) + + # Generate histogram plots if plot directory is specified + if args.plot_dir: + logger.info("\n" + "=" * 80) + logger.info("Generating Histogram Plots") + logger.info("=" * 80) + plot_histograms(results, args.plot_dir, has_reference=(reference_df is not None)) + logger.info("=" * 80) + + +if __name__ == "__main__": + main() + diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py new file mode 100644 index 0000000000..adac3c6271 --- /dev/null +++ b/language/gpt-oss/run_infer_trtllm.py @@ -0,0 +1,642 @@ +#!/usr/bin/env python3 +""" +Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint. +Supports round-robin load balancing across multiple server endpoints. + +Usage: + python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] + +Arguments: + --input-tokens Path to pickle file containing data with text_input column from harmony-tokens.py + --server-url TensorRT-LLM server URL(s) - comma-separated for round-robin (e.g., "localhost:8000,localhost:8001") + --max-samples Maximum number of samples to process (default: all) + --max-tokens Maximum tokens to generate per request (default: 100) + --max-concurrency Maximum number of concurrent requests (default: 256) + --output Output pickle file for responses (optional) + --pass-k Number of inference passes per sample for pass@k strategy (default: 1) + +Examples: + # Single server + python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000 + + # Multiple servers with round-robin + python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002 +""" + +import asyncio +import argparse +import time +import logging +from typing import List, Dict, Any, Tuple +import pandas as pd +from tqdm import tqdm +from transformers import AutoTokenizer +import httpx +from openai import AsyncOpenAI + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Suppress verbose HTTP logs from httpx and openai +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("openai").setLevel(logging.WARNING) + +# Initialize tokenizer +MODEL_NAME = "openai/gpt-oss-120b" +tokenizer = None + + +def get_tokenizer(): + """Get or initialize the tokenizer.""" + global tokenizer + if tokenizer is None: + logger.info(f"Loading tokenizer for {MODEL_NAME}...") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + logger.info("Tokenizer loaded successfully") + return tokenizer + + +class TRTLLMClient: + """Client for TensorRT-LLM server using OpenAI-compatible endpoint with round-robin support.""" + + def __init__(self, + 
server_urls: List[str] = None, + temperature: float = 0.001, + top_k: int = 1, + top_p: float = 1.0, + max_concurrency: int = 256, + timeout: int = 1200): + # Support multiple server URLs for round-robin load balancing + if server_urls is None: + server_urls = ["localhost:8000"] + self.server_urls = server_urls + self.num_servers = len(server_urls) + self.current_server_index = 0 + + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + self.max_concurrency = max_concurrency + self.timeout = timeout + self.model_name = MODEL_NAME + + # Initialize async OpenAI clients (one per server) + self.http_clients = [] + self.async_clients = [] + self.concurrency_semaphore = None + + logger.info( + f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") + + async def initialize(self): + """Initialize OpenAI clients for all servers.""" + # Create semaphore for concurrency control + self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) + + # Create HTTP and OpenAI clients for each server + for server_url in self.server_urls: + # Setup HTTP client with proper connection limits for high + # concurrency + http_client = httpx.AsyncClient( + timeout=httpx.Timeout(self.timeout), + limits=httpx.Limits( + max_keepalive_connections=self.max_concurrency * 2, + max_connections=self.max_concurrency * 2, + ), + http2=True + ) + + # Setup OpenAI client with the configured HTTP client + async_client = AsyncOpenAI( + api_key='dummy', # TensorRT-LLM server doesn't require real API key + base_url=f"http://{server_url}/v1/", + timeout=self.timeout, + max_retries=10, + http_client=http_client, + ) + + self.http_clients.append(http_client) + self.async_clients.append(async_client) + + logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)") + + def _get_next_client(self) -> AsyncOpenAI: + """Get the next client using round-robin selection.""" + client = self.async_clients[self.current_server_index] + self.current_server_index = ( + self.current_server_index + 1) % self.num_servers + return client + + async def send_request( + self, prompt: str, max_tokens: int = 100, + sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: + """Send a single request to the TensorRT-LLM server using round-robin. 
+ + Args: + prompt: Text prompt to send + max_tokens: Maximum tokens to generate + sample_id: Sample identifier + pass_num: Pass number for pass@k strategy + + Returns: + Tuple of (sample_id, pass_num, response, latency) + """ + # Prepare generation parameters using OpenAI completions format (as per + # TensorRT-LLM docs) + extra_body = { + # TensorRT-LLM specific parameters + "min_tokens": 1, + } + + # Only include top_k if it's not 0 (so it can default to None on server + # side) + if self.top_k != 0: + extra_body["top_k"] = self.top_k + + gen_params = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": max_tokens, + "temperature": self.temperature, + "top_p": self.top_p, + "stream": False, + "extra_body": extra_body, + } + + try: + # Track latency: time from request sent to response received + start_time = time.time() + + # Select client using round-robin + client = self._get_next_client() + + # Use semaphore for concurrency control + async with self.concurrency_semaphore: + completion = await client.completions.create(**gen_params) + + end_time = time.time() + latency = end_time - start_time + + # Extract response text from completions format + response_text = completion.choices[0].text + + # Tokenize the response to get output_ids (similar to SGLang + # format) + tokenizer = get_tokenizer() + output_ids = tokenizer.encode( + response_text, add_special_tokens=False) + + # Format response similar to SGLang format for compatibility + response = { + "output_ids": output_ids, + "text": response_text, + "meta_info": { + "completion_tokens": len(output_ids), + } + } + + return sample_id, pass_num, response, latency + + except Exception as e: + logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") + return sample_id, pass_num, {"error": str(e)}, None + + async def shutdown(self): + """Clean up resources for all clients.""" + for http_client in self.http_clients: + if http_client: + await http_client.aclose() + + +def load_tokenized_data(data_file: str) -> pd.DataFrame: + """Load data from pickle file produced by harmony-tokens.py.""" + logger.info(f"Loading data from {data_file}") + + # Load DataFrame from pickle + df = pd.read_pickle(data_file) + logger.info(f"Loaded DataFrame with shape: {df.shape}") + + # Check if text_input column exists and has valid data + if 'text_input' in df.columns: + # Check for any None values in text_input + failed_mask = df['text_input'].isna() + failed_count = failed_mask.sum() + + if failed_count > 0: + failed_indices = df[failed_mask].index.unique() + error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}" + logger.error(error_msg) + raise AssertionError(error_msg) + + # Check first sample + first_text = df.iloc[0]['text_input'] + if isinstance(first_text, str): + logger.info( + f"First sample text length: {len(first_text)} characters") + else: + logger.warning( + "text_input column exists but first sample is not a string") + + logger.info(f"All {len(df)} samples have valid text_input") + else: + logger.error("No 'text_input' column found in DataFrame") + raise ValueError("DataFrame must contain 'text_input' column") + + return df + + +async def send_requests_async( + tokenized_df: pd.DataFrame, server_urls: List[str], + max_tokens: int = 100, max_concurrency: int = 256, + temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, + timeout: int = 1200, pass_k: int = 1): + """Send all requests to TensorRT-LLM server(s) asynchronously with round-robin load balancing. 
+ + Args: + server_urls: List of server URLs for round-robin load balancing + pass_k: Number of inference passes per sample for pass@k strategy + + Returns: + tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency + """ + num_samples = len(tokenized_df) + total_requests = num_samples * pass_k + logger.info( + f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...") + + # Initialize client with multiple servers for round-robin + client = TRTLLMClient( + server_urls=server_urls, + temperature=temperature, + top_k=top_k, + top_p=top_p, + max_concurrency=max_concurrency, + timeout=timeout + ) + await client.initialize() + + # Prepare all tasks - create pass_k requests per sample + tasks = [] + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + task = client.send_request( + row['text_input'], + max_tokens=max_tokens, + sample_id=idx, + pass_num=pass_num + ) + tasks.append(task) + + start_time = time.time() + + # Execute all tasks concurrently with progress bar + results = [] + for coro in tqdm( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Sending requests", + unit="request"): + result = await coro + results.append(result) + + # Group results by sample_id and pass_num + responses_by_pass = {} + latencies_by_pass = {} + for sample_id, pass_num, response, latency in results: + responses_by_pass[(sample_id, pass_num)] = response + latencies_by_pass[(sample_id, pass_num)] = latency + + total_time = time.time() - start_time + logger.info( + f"Completed {total_requests} requests in {total_time:.2f} seconds") + logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") + + # Log latency statistics + valid_latencies = [ + lat for lat in latencies_by_pass.values() if lat is not None] + if valid_latencies: + avg_latency = sum(valid_latencies) / len(valid_latencies) + min_latency = min(valid_latencies) + max_latency = max(valid_latencies) + logger.info( + f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") + + # Shutdown client + await client.shutdown() + + return responses_by_pass, latencies_by_pass + + +def extract_response_ids( + responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: + """Extract response output_ids from TensorRT-LLM responses for all passes. 
+ + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + tokenized_df: DataFrame with samples + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to output_ids list + """ + logger.info("Extracting response output_ids...") + + response_ids_by_pass = {} + total_responses = len(tokenized_df) * pass_k + + with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: + for idx, row in tokenized_df.iterrows(): + for pass_num in range(pass_k): + response = responses_by_pass.get((idx, pass_num), {}) + response_id = [] + if "error" not in response and "output_ids" in response: + try: + # TensorRT-LLM returns the generated token IDs in the + # 'output_ids' field + response_id = response["output_ids"] + except Exception as e: + logger.warning( + f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") + response_ids_by_pass[(idx, pass_num)] = response_id + pbar.update(1) + + logger.info("Response output_ids extraction complete") + return response_ids_by_pass + + +def detokenize_output_ids( + response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: + """Detokenize output_ids back to text using AutoTokenizer for all passes. + + Args: + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + pass_k: Number of passes per sample + + Returns: + Dict mapping (sample_id, pass_num) to detokenized text + """ + logger.info("Detokenizing output_ids to text...") + + tokenizer = get_tokenizer() + detokenized_texts_by_pass = {} + + for (sample_id, pass_num), token_ids in tqdm( + response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): + try: + # Detokenize the token IDs back to text + text = tokenizer.decode(token_ids, skip_special_tokens=True) + detokenized_texts_by_pass[(sample_id, pass_num)] = text + except Exception as e: + logger.warning( + f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") + detokenized_texts_by_pass[(sample_id, pass_num)] = "" + + logger.info("Output detokenization complete") + return detokenized_texts_by_pass + + +def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], + response_ids_by_pass: Dict[tuple, List[int]], + detokenized_texts_by_pass: Dict[tuple, str], + latencies_by_pass: Dict[tuple, float], + tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: + """Save all responses to DataFrame and optionally to pickle file. + + Args: + responses_by_pass: Dict mapping (sample_id, pass_num) to response + response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids + detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text + latencies_by_pass: Dict mapping (sample_id, pass_num) to latency + tokenized_df: Original DataFrame with samples + pass_k: Number of passes per sample + output_file: Optional output pickle file + + Returns: + DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) + """ + logger.info("Processing responses and updating DataFrame...") + + # Work with the original DataFrame + result_df = tokenized_df.copy() + + # Create columns for each pass with _0, _1, _2, ... 
suffixes + for pass_num in range(pass_k): + # Lists to store data for this pass + model_outputs = [] + tok_model_outputs = [] + tok_model_output_lens = [] + infer_times = [] + + for idx in tokenized_df.index: + key = (idx, pass_num) + detokenized_text = detokenized_texts_by_pass.get(key, "") + response_ids = response_ids_by_pass.get(key, []) + latency = latencies_by_pass.get(key, None) + + model_outputs.append(detokenized_text) + tok_model_outputs.append(response_ids) + tok_model_output_lens.append(len(response_ids)) + infer_times.append(latency) + + # Add columns with suffixes + result_df[f'model_output_{pass_num}'] = model_outputs + result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs + result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens + result_df[f'infer_time_{pass_num}'] = infer_times + + # Calculate output token lengths for logging + all_output_token_lengths = [] + for idx in tokenized_df.index: + for pass_num in range(pass_k): + key = (idx, pass_num) + response = responses_by_pass.get(key, {}) + response_ids = response_ids_by_pass.get(key, []) + try: + output_token_length = response.get( + "meta_info", {}).get( + "completion_tokens", len(response_ids)) + all_output_token_lengths.append(output_token_length) + except Exception as e: + logger.warning( + f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") + all_output_token_lengths.append(len(response_ids)) + + logger.info(f"Updated DataFrame with shape: {result_df.shape}") + new_columns = [ + f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] + logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") + if all_output_token_lengths: + logger.info( + f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") + + # Save to pickle file if output_file is provided + if output_file: + logger.info(f"Saving responses to {output_file}...") + result_df.to_pickle(output_file) + logger.info(f"Responses saved to {output_file}") + + return result_df + + +async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[str], + max_samples: int = None, max_tokens: int = 100, + max_concurrency: int = 256, output_file: str = None, + temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, + timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: + """Main processing function that handles requests and response extraction. 
+ + Args: + server_urls: List of server URLs for round-robin load balancing + pass_k: Number of inference passes per sample for pass@k strategy + """ + + # Step 1: Limit samples if specified + if max_samples is not None: + tokenized_df = tokenized_df.head(max_samples) + logger.info(f"Limited to first {max_samples} samples") + + # Step 2: Send all requests asynchronously (k passes per sample) + responses_by_pass, latencies_by_pass = await send_requests_async( + tokenized_df, + server_urls, + max_tokens, + max_concurrency, + temperature, + top_k, + top_p, + timeout, + pass_k) + + # Step 3: Extract response output_ids for all passes + response_ids_by_pass = extract_response_ids( + responses_by_pass, tokenized_df, pass_k) + + # Step 4: Detokenize output_ids to text for model_output for all passes + detokenized_texts_by_pass = detokenize_output_ids( + response_ids_by_pass, pass_k) + + # Step 5: Save all results and return DataFrame + result_df = save_responses( + responses_by_pass, + response_ids_by_pass, + detokenized_texts_by_pass, + latencies_by_pass, + tokenized_df, + pass_k, + output_file) + + return result_df + + +def main(): + parser = argparse.ArgumentParser( + description="Send text prompts to TensorRT-LLM server via OpenAI completions endpoint") + parser.add_argument("--input-tokens", required=True, + help="Path to pickle file containing data with text_input column from harmony-tokens.py") + parser.add_argument("--server-url", default="localhost:8000", + help="TensorRT-LLM server URL(s) - comma-separated for round-robin load balancing (default: localhost:8000)") + parser.add_argument("--max-samples", type=int, default=None, + help="Maximum number of samples to process (default: all)") + parser.add_argument("--max-tokens", type=int, default=100, + help="Maximum tokens to generate per request") + parser.add_argument("--max-concurrency", type=int, default=256, + help="Maximum number of concurrent requests (default: 256)") + parser.add_argument("--output", default=None, + help="Output pickle file for responses (optional)") + parser.add_argument("--pass-k", type=int, default=1, + help="Number of inference passes per sample for pass@k strategy (default: 1)") + parser.add_argument("--temperature", type=float, default=0.001, + help="Temperature for sampling (default: 0.001)") + parser.add_argument("--top-k", type=int, default=1, + help="Top-k for sampling (default: 1)") + parser.add_argument("--top-p", type=float, default=1.0, + help="Top-p for sampling (default: 1.0)") + parser.add_argument("--timeout", type=int, default=1200, + help="Timeout for requests (default: 1200)") + + args = parser.parse_args() + + # Parse comma-separated server URLs + server_urls = [url.strip() for url in args.server_url.split(',')] + logger.info( + f"Configured {len(server_urls)} server(s) for round-robin load balancing") + + # Test connection + async def test_connection(): + logger.info(f"Testing server connection(s)...") + client = TRTLLMClient( + server_urls=server_urls, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + max_concurrency=1, + timeout=args.timeout + ) + await client.initialize() + + try: + _, _, test_response, _ = await client.send_request( + prompt="Test", max_tokens=5, sample_id=0, pass_num=0) + if "error" in test_response: + logger.error( + f"Server connection failed: {test_response['error']}") + logger.error( + "Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") + return False + logger.info("Server connection successful") + return True + 
finally: + await client.shutdown() + + # Run connection test + if not asyncio.run(test_connection()): + return + + # Load pre-tokenized data + tokenized_df = load_tokenized_data(args.input_tokens) + + # Process requests and get result DataFrame + result_df = asyncio.run(process_requests_async( + tokenized_df, server_urls, + max_samples=args.max_samples, + max_tokens=args.max_tokens, + max_concurrency=args.max_concurrency, + output_file=args.output, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + timeout=args.timeout, + pass_k=args.pass_k)) + + # Print summary + logger.info(f"\nProcessing completed:") + logger.info(f" - Total samples processed: {len(result_df)}") + logger.info(f" - Number of passes per sample: {args.pass_k}") + logger.info( + f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") + + # Calculate average output length across all passes + if args.pass_k == 1: + avg_output_len = result_df['tok_model_output_len_0'].mean() + logger.info(f" - Average output token length: {avg_output_len:.1f}") + else: + all_output_lens = [] + for i in range(args.pass_k): + all_output_lens.extend( + result_df[f'tok_model_output_len_{i}'].tolist()) + avg_output_len = sum(all_output_lens) / \ + len(all_output_lens) if all_output_lens else 0 + logger.info( + f" - Average output token length (across all passes): {avg_output_len:.1f}") + + if args.output: + logger.info(f" - Results saved to: {args.output}") + else: + logger.info(" - Results returned as DataFrame (not saved to file)") + + +if __name__ == "__main__": + main() From f780189019a8d42f83232c986c700dad13e9e140 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 26 Nov 2025 07:24:02 +0000 Subject: [PATCH 129/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_mlperf_performance.py | 287 ++++++++++++-------- 1 file changed, 171 insertions(+), 116 deletions(-) diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss/eval_mlperf_performance.py index 46b67c6991..5ef2b90afe 100755 --- a/language/gpt-oss/eval_mlperf_performance.py +++ b/language/gpt-oss/eval_mlperf_performance.py @@ -19,7 +19,7 @@ --output-file detokenized_outputs.json \ --reference-data data/accuracy_eval_tokenized_filtered.pkl \ --tokenizer openai/gpt-oss-120b - + # With histogram plots (enables plotting when --plot-dir is specified) python eval_mlperf_performance.py \ --mlperf-log mlperf_logs/offline/accuracy/mlperf_log_accuracy.json \ @@ -44,6 +44,9 @@ ] """ +from tqdm import tqdm +from transformers import AutoTokenizer +import matplotlib.pyplot as plt import argparse import json import logging @@ -55,9 +58,6 @@ import pandas as pd import matplotlib matplotlib.use('Agg') # Non-interactive backend for server environments -import matplotlib.pyplot as plt -from transformers import AutoTokenizer -from tqdm import tqdm # Configure logging logging.basicConfig( @@ -72,98 +72,99 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Parse MLPerf accuracy JSON and detokenize responses" ) - + parser.add_argument( "--mlperf-log", type=str, required=True, help="Path to mlperf_log_accuracy.json file" ) - + parser.add_argument( "--output-file", type=str, required=True, help="Path to output JSON file with detokenized responses" ) - + parser.add_argument( "--reference-data", type=str, default=None, help="Path to reference pickle file (DataFrame with prompts, dataset, etc.) 
- optional" ) - + parser.add_argument( "--tokenizer", type=str, default="openai/gpt-oss-120b", help="Tokenizer to use for detokenization (default: openai/gpt-oss-120b)" ) - + parser.add_argument( "--pretty", action="store_true", help="Pretty-print the output JSON with indentation" ) - + parser.add_argument( "--plot-dir", type=str, default=None, help="Directory to save histogram plots (enables plotting if specified)" ) - + return parser.parse_args() def decode_hex_to_tokens(hex_string: str) -> List[int]: """Decode hex-encoded byte array to list of token IDs. - + MLPerf stores token IDs as hex-encoded bytes where each token is a 4-byte little-endian integer. - + Args: hex_string: Hex-encoded string from MLPerf log - + Returns: List of token IDs """ # Remove any whitespace hex_string = hex_string.strip() - + # Convert hex string to bytes byte_data = bytes.fromhex(hex_string) - + # Each token is stored as 4 bytes (int32) in little-endian format token_ids = [] for i in range(0, len(byte_data), 4): if i + 4 <= len(byte_data): # Unpack 4 bytes as little-endian int32 - token_id = int.from_bytes(byte_data[i:i+4], byteorder='little', signed=True) + token_id = int.from_bytes( + byte_data[i:i + 4], byteorder='little', signed=True) token_ids.append(token_id) - + return token_ids def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: """Parse MLPerf accuracy log file. - + Handles multiple formats: - JSON array: [{"qsl_idx": 0, ...}, ...] - JSONL: one JSON object per line - Concatenated JSON: multiple JSON objects on same line - + Args: log_path: Path to mlperf_log_accuracy.json - + Returns: List of entries with qsl_idx and hex-encoded data """ logger.info(f"Reading MLPerf log: {log_path}") - + entries = [] - + # First try to load as a single JSON array try: with open(log_path, 'r') as f: @@ -173,7 +174,7 @@ def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: return log_data except json.JSONDecodeError: pass # Not a valid JSON array, try line-by-line parsing - + # Parse line by line (JSONL or concatenated JSON) decoder = json.JSONDecoder() with open(log_path, 'r') as f: @@ -181,7 +182,7 @@ def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: line = line.strip() if not line: continue - + # Try to parse as single JSON object first try: entry = json.loads(line) @@ -202,9 +203,10 @@ def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: parsed_count += 1 except json.JSONDecodeError as e: if parsed_count == 0: - logger.warning(f"Line {line_num}: Could not parse JSON: {e}") + logger.warning( + f"Line {line_num}: Could not parse JSON: {e}") break - + logger.info(f"Loaded {len(entries)} entries from MLPerf log") return entries @@ -215,7 +217,7 @@ def plot_histograms( has_reference: bool = False ) -> None: """Generate histogram plots for output token lengths and differences. 
- + Args: results: List of parsed results with token lengths output_dir: Directory to save plots @@ -223,49 +225,72 @@ def plot_histograms( """ output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) - + logger.info(f"Generating histogram plots in {output_dir}...") - + # Extract output token lengths output_lengths = [r['num_tokens'] for r in results] - + # Plot 1: Output Sequence Length (OSL) Histogram plt.figure(figsize=(12, 6)) - plt.hist(output_lengths, bins=50, edgecolor='black', alpha=0.7, color='steelblue') + plt.hist( + output_lengths, + bins=50, + edgecolor='black', + alpha=0.7, + color='steelblue') plt.xlabel('Output Token Length (OSL)', fontsize=12) plt.ylabel('Frequency', fontsize=12) - plt.title(f'Distribution of Output Token Lengths\n(n={len(output_lengths)}, mean={sum(output_lengths)/len(output_lengths):.1f}, median={sorted(output_lengths)[len(output_lengths)//2]})', fontsize=14) + plt.title( + f'Distribution of Output Token Lengths\n(n={len(output_lengths)}, mean={sum(output_lengths)/len(output_lengths):.1f}, median={sorted(output_lengths)[len(output_lengths)//2]})', + fontsize=14) plt.grid(axis='y', alpha=0.3) - + # Add statistics box stats_text = f'Min: {min(output_lengths)}\nMax: {max(output_lengths)}\nStd: {(sum((x - sum(output_lengths)/len(output_lengths))**2 for x in output_lengths) / len(output_lengths))**0.5:.1f}' plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top', horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5)) - + osl_plot_path = output_path / 'output_token_length_histogram.png' plt.tight_layout() plt.savefig(osl_plot_path, dpi=300, bbox_inches='tight') plt.close() logger.info(f"✓ Saved OSL histogram: {osl_plot_path}") - + # Plot 2: Token Length Difference Histogram (if reference data available) if has_reference: - results_with_diff = [r for r in results if 'output_token_len_diff' in r] + results_with_diff = [ + r for r in results if 'output_token_len_diff' in r] if results_with_diff: - differences = [r['output_token_len_diff'] for r in results_with_diff] - + differences = [r['output_token_len_diff'] + for r in results_with_diff] + plt.figure(figsize=(12, 6)) - plt.hist(differences, bins=50, edgecolor='black', alpha=0.7, color='coral') - plt.xlabel('Token Length Difference (Actual - Reference)', fontsize=12) + plt.hist( + differences, + bins=50, + edgecolor='black', + alpha=0.7, + color='coral') + plt.xlabel( + 'Token Length Difference (Actual - Reference)', + fontsize=12) plt.ylabel('Frequency', fontsize=12) - + mean_diff = sum(differences) / len(differences) - median_diff = sorted(differences)[len(differences)//2] - plt.title(f'Distribution of Output Token Length Differences\n(n={len(differences)}, mean={mean_diff:.1f}, median={median_diff})', fontsize=14) + median_diff = sorted(differences)[len(differences) // 2] + plt.title( + f'Distribution of Output Token Length Differences\n(n={len(differences)}, mean={mean_diff:.1f}, median={median_diff})', + fontsize=14) plt.grid(axis='y', alpha=0.3) - plt.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero difference') - + plt.axvline( + x=0, + color='red', + linestyle='--', + linewidth=2, + label='Zero difference') + # Add statistics box longer = sum(1 for d in differences if d > 0) shorter = sum(1 for d in differences if d < 0) @@ -274,40 +299,51 @@ def plot_histograms( plt.text(0.98, 0.97, stats_text, transform=plt.gca().transAxes, fontsize=9, verticalalignment='top', 
horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5)) - + plt.legend() - + diff_plot_path = output_path / 'token_length_difference_histogram.png' plt.tight_layout() plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight') plt.close() logger.info(f"✓ Saved difference histogram: {diff_plot_path}") - + # Plot 3: Combined comparison (side by side) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) - + # Reference vs Actual - ref_lengths = [r['ref_num_output_tokens'] for r in results_with_diff] - actual_lengths = [r['actual_num_output_tokens'] for r in results_with_diff] - + ref_lengths = [r['ref_num_output_tokens'] + for r in results_with_diff] + actual_lengths = [r['actual_num_output_tokens'] + for r in results_with_diff] + ax1.hist([ref_lengths, actual_lengths], bins=50, label=['Reference', 'Actual'], - alpha=0.6, edgecolor='black', color=['steelblue', 'coral']) + alpha=0.6, edgecolor='black', color=['steelblue', 'coral']) ax1.set_xlabel('Output Token Length', fontsize=12) ax1.set_ylabel('Frequency', fontsize=12) - ax1.set_title(f'Reference vs Actual Output Token Lengths\n(n={len(results_with_diff)})', fontsize=13) + ax1.set_title( + f'Reference vs Actual Output Token Lengths\n(n={len(results_with_diff)})', + fontsize=13) ax1.legend() ax1.grid(axis='y', alpha=0.3) - + # Scatter plot: Reference vs Actual - ax2.scatter(ref_lengths, actual_lengths, alpha=0.4, s=10, color='purple') + ax2.scatter( + ref_lengths, + actual_lengths, + alpha=0.4, + s=10, + color='purple') ax2.plot([min(ref_lengths), max(ref_lengths)], [min(ref_lengths), max(ref_lengths)], - 'r--', linewidth=2, label='y=x (perfect match)') + 'r--', linewidth=2, label='y=x (perfect match)') ax2.set_xlabel('Reference Token Length', fontsize=12) ax2.set_ylabel('Actual Token Length', fontsize=12) - ax2.set_title('Reference vs Actual Token Lengths (Scatter)', fontsize=13) + ax2.set_title( + 'Reference vs Actual Token Lengths (Scatter)', + fontsize=13) ax2.legend() ax2.grid(alpha=0.3) - + comparison_plot_path = output_path / 'token_length_comparison.png' plt.tight_layout() plt.savefig(comparison_plot_path, dpi=300, bbox_inches='tight') @@ -315,7 +351,7 @@ def plot_histograms( logger.info(f"✓ Saved comparison plot: {comparison_plot_path}") else: logger.warning("No samples with token length differences found") - + logger.info(f"✓ All plots saved to {output_dir}/") @@ -325,41 +361,41 @@ def detokenize_responses( reference_df: Optional[pd.DataFrame] = None ) -> List[Dict[str, Any]]: """Detokenize responses from MLPerf log entries. - + When reference data is provided, input_prompt is generated by detokenizing input token IDs from the reference data (checks: tok_input, input_token_ids, input_tokens, tokenized_input). This shows exactly what was sent to the model (after tokenization), not the original text prompt. 
- + Args: entries: List of MLPerf log entries with hex-encoded token IDs tokenizer: HuggingFace tokenizer instance reference_df: Optional reference DataFrame with input prompts and metadata - + Returns: List of dictionaries with qsl_idx, token_ids, and detokenized text """ logger.info("Detokenizing responses...") - + results = [] for entry in tqdm(entries, desc="Detokenizing", unit="response"): qsl_idx = entry.get("qsl_idx") hex_data = entry.get("data", "") - + # Decode hex to token IDs try: token_ids = decode_hex_to_tokens(hex_data) except Exception as e: logger.error(f"Error decoding tokens for qsl_idx={qsl_idx}: {e}") token_ids = [] - + # Detokenize to text try: text = tokenizer.decode(token_ids, skip_special_tokens=True) except Exception as e: logger.error(f"Error detokenizing qsl_idx={qsl_idx}: {e}") text = "" - + # Build result record result = { "qsl_idx": qsl_idx, @@ -367,32 +403,36 @@ def detokenize_responses( "text": text, "num_tokens": len(token_ids) } - + # Add reference data if available if reference_df is not None and qsl_idx < len(reference_df): ref_row = reference_df.iloc[qsl_idx] - + # Add common fields from reference data if 'dataset' in ref_row: result['dataset'] = ref_row['dataset'] - + # Get input token IDs and detokenize to see what was actually sent to the model # Check multiple possible field names for input tokens input_token_ids = None - for field in ['tok_input', 'input_token_ids', 'input_tokens', 'tokenized_input']: + for field in ['tok_input', 'input_token_ids', + 'input_tokens', 'tokenized_input']: if field in ref_row: input_token_ids = ref_row[field] break - + if input_token_ids is not None: result['input_token_ids'] = input_token_ids if isinstance(input_token_ids, list): result['num_input_tokens'] = len(input_token_ids) - # Detokenize input tokens to show what was actually sent to the model + # Detokenize input tokens to show what was actually sent to + # the model try: - result['input_prompt'] = tokenizer.decode(input_token_ids, skip_special_tokens=False) + result['input_prompt'] = tokenizer.decode( + input_token_ids, skip_special_tokens=False) except Exception as e: - logger.warning(f"Error detokenizing input tokens for qsl_idx={qsl_idx}: {e}") + logger.warning( + f"Error detokenizing input tokens for qsl_idx={qsl_idx}: {e}") result['input_prompt'] = None else: result['num_input_tokens'] = None @@ -405,14 +445,16 @@ def detokenize_responses( result['input_prompt'] = ref_row['input_text'] elif 'text' in ref_row: result['input_prompt'] = ref_row['text'] - + if 'ground_truth' in ref_row: result['ground_truth'] = ref_row['ground_truth'] - + # Compute output token length difference - # Check for reference output token length in various possible field names + # Check for reference output token length in various possible field + # names ref_output_len = None - for field in ['output_token_ids', 'target_token_ids', 'output_tokens', 'expected_output_token_ids']: + for field in ['output_token_ids', 'target_token_ids', + 'output_tokens', 'expected_output_token_ids']: if field in ref_row: ref_tokens = ref_row[field] if isinstance(ref_tokens, list): @@ -422,50 +464,54 @@ def detokenize_responses( elif isinstance(ref_tokens, (int, float)) and not pd.isna(ref_tokens): ref_output_len = int(ref_tokens) break - + # Also check for direct length field if ref_output_len is None: - for field in ['output_len', 'output_length', 'num_output_tokens', 'target_len']: + for field in ['output_len', 'output_length', + 'num_output_tokens', 'target_len']: if field in ref_row and not 
pd.isna(ref_row[field]): ref_output_len = int(ref_row[field]) break - + if ref_output_len is not None: actual_output_len = len(token_ids) result['ref_num_output_tokens'] = ref_output_len result['actual_num_output_tokens'] = actual_output_len - result['output_token_len_diff'] = actual_output_len - ref_output_len - result['output_token_len_ratio'] = actual_output_len / ref_output_len if ref_output_len > 0 else None - + result['output_token_len_diff'] = actual_output_len - \ + ref_output_len + result['output_token_len_ratio'] = actual_output_len / \ + ref_output_len if ref_output_len > 0 else None + # Add any other columns that might be useful for col in ['question_id', 'difficulty', 'subject', 'category']: if col in ref_row: result[col] = ref_row[col] - + results.append(result) - + return results def main(): """Main function.""" args = parse_args() - + # Validate input file exists log_path = Path(args.mlperf_log) if not log_path.exists(): logger.error(f"MLPerf log file not found: {args.mlperf_log}") sys.exit(1) - + logger.info("=" * 80) logger.info("MLPerf Accuracy Log Parser") logger.info("=" * 80) logger.info(f"Input log: {args.mlperf_log}") logger.info(f"Output file: {args.output_file}") - logger.info(f"Reference data: {args.reference_data if args.reference_data else 'None (outputs only)'}") + logger.info( + f"Reference data: {args.reference_data if args.reference_data else 'None (outputs only)'}") logger.info(f"Tokenizer: {args.tokenizer}") logger.info("=" * 80) - + # Load reference data if provided reference_df = None if args.reference_data: @@ -478,7 +524,7 @@ def main(): except Exception as e: logger.error(f"Failed to load reference data: {e}") sys.exit(1) - + # Load tokenizer logger.info(f"Loading tokenizer: {args.tokenizer}") try: @@ -487,71 +533,79 @@ def main(): except Exception as e: logger.error(f"Failed to load tokenizer: {e}") sys.exit(1) - + # Parse MLPerf log try: entries = parse_mlperf_log(args.mlperf_log) except Exception as e: logger.error(f"Failed to parse MLPerf log: {e}") sys.exit(1) - + if not entries: logger.error("No entries found in MLPerf log") sys.exit(1) - + # Detokenize responses try: results = detokenize_responses(entries, tokenizer, reference_df) except Exception as e: logger.error(f"Failed to detokenize responses: {e}") sys.exit(1) - + # Write output JSON logger.info(f"Writing detokenized outputs to: {args.output_file}") output_path = Path(args.output_file) output_path.parent.mkdir(parents=True, exist_ok=True) - + with open(output_path, 'w') as f: if args.pretty: json.dump(results, f, indent=2, ensure_ascii=False) else: json.dump(results, f, ensure_ascii=False) - + logger.info("=" * 80) logger.info("✓ Parsing completed successfully") logger.info("=" * 80) logger.info(f"Total responses parsed: {len(results)}") - + # Print statistics total_tokens = sum(r["num_tokens"] for r in results) avg_tokens = total_tokens / len(results) if results else 0 logger.info(f"Total output tokens: {total_tokens:,}") logger.info(f"Average tokens per response: {avg_tokens:.1f}") - + # Print token length difference statistics if reference data was provided if reference_df is not None: - results_with_diff = [r for r in results if 'output_token_len_diff' in r] + results_with_diff = [ + r for r in results if 'output_token_len_diff' in r] if results_with_diff: diffs = [r['output_token_len_diff'] for r in results_with_diff] - ratios = [r['output_token_len_ratio'] for r in results_with_diff if r['output_token_len_ratio'] is not None] - - logger.info(f"\nOutput Token Length Analysis 
({len(results_with_diff)} samples with reference):") - logger.info(f" Mean difference (actual - ref): {sum(diffs) / len(diffs):.2f} tokens") + ratios = [r['output_token_len_ratio'] + for r in results_with_diff if r['output_token_len_ratio'] is not None] + + logger.info( + f"\nOutput Token Length Analysis ({len(results_with_diff)} samples with reference):") + logger.info( + f" Mean difference (actual - ref): {sum(diffs) / len(diffs):.2f} tokens") logger.info(f" Min difference: {min(diffs)} tokens") logger.info(f" Max difference: {max(diffs)} tokens") if ratios: - logger.info(f" Mean ratio (actual / ref): {sum(ratios) / len(ratios):.3f}x") - + logger.info( + f" Mean ratio (actual / ref): {sum(ratios) / len(ratios):.3f}x") + # Count samples that are longer/shorter longer = sum(1 for d in diffs if d > 0) shorter = sum(1 for d in diffs if d < 0) exact = sum(1 for d in diffs if d == 0) - logger.info(f" Longer than reference: {longer} ({longer/len(diffs)*100:.1f}%)") - logger.info(f" Shorter than reference: {shorter} ({shorter/len(diffs)*100:.1f}%)") - logger.info(f" Exact match: {exact} ({exact/len(diffs)*100:.1f}%)") - + logger.info( + f" Longer than reference: {longer} ({longer/len(diffs)*100:.1f}%)") + logger.info( + f" Shorter than reference: {shorter} ({shorter/len(diffs)*100:.1f}%)") + logger.info( + f" Exact match: {exact} ({exact/len(diffs)*100:.1f}%)") + logger.info("=" * 80) - + # Show sample output if results: logger.info("Sample output (first entry):") @@ -560,16 +614,17 @@ def main(): logger.info(f" num_tokens: {sample['num_tokens']}") logger.info(f" text preview: {sample['text'][:200]}...") logger.info("=" * 80) - + # Generate histogram plots if plot directory is specified if args.plot_dir: logger.info("\n" + "=" * 80) logger.info("Generating Histogram Plots") logger.info("=" * 80) - plot_histograms(results, args.plot_dir, has_reference=(reference_df is not None)) + plot_histograms( + results, args.plot_dir, has_reference=( + reference_df is not None)) logger.info("=" * 80) if __name__ == "__main__": main() - From 7f47e5ed384a99314d177d8c2345a1dd61569d23 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 1 Dec 2025 10:34:24 -0800 Subject: [PATCH 130/165] add pass@k to acc eval --- language/gpt-oss/eval_mlperf_accuracy.py | 195 ++++++++++++++++++----- language/gpt-oss/mlperf/offline_sut.py | 6 +- language/gpt-oss/mlperf/user.conf | 10 +- language/gpt-oss/run_mlperf.py | 9 +- 4 files changed, 174 insertions(+), 46 deletions(-) diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index eeadcde15d..bc17253a54 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -43,21 +43,33 @@ logger = logging.getLogger(__name__) -def load_mlperf_log(log_path: str) -> List[Dict[str, Any]]: +def load_mlperf_log(log_path: str) -> Tuple[List[Dict[str, Any]], int]: """Load MLPerf accuracy JSON log. 
Args: log_path: Path to mlperf_log_accuracy.json Returns: - List of log entries with seq_id, qsl_idx, data (hex), token_count + Tuple of (log_data, num_repeats) + - log_data: List of log entries with seq_id, qsl_idx, repeat_idx, data (hex), token_count + - num_repeats: Number of repeats per sample (pass@k value) """ logger.info(f"Loading MLPerf log from {log_path}") with open(log_path, 'r') as f: log_data = json.load(f) + # Detect number of repeats from repeat_idx field + max_repeat_idx = 0 + for entry in log_data: + repeat_idx = entry.get('repeat_idx', 0) + max_repeat_idx = max(max_repeat_idx, repeat_idx) + + num_repeats = max_repeat_idx + 1 + logger.info(f"Loaded {len(log_data)} log entries") - return log_data + logger.info(f"Detected repeats_per_sample = {num_repeats} (pass@{num_repeats} format)") + + return log_data, num_repeats def decode_hex_to_tokens(hex_data: str) -> List[int]: @@ -121,10 +133,11 @@ def process_livecodebench_batch( work_items = [] entry_metadata = [] # Store (entry, qsl_idx, ref_row, token_ids, model_output) - logger.info(f"Parsing {len(entries)} {dataset_name} samples...") - for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="sample"): + logger.info(f"Parsing {len(entries)} {dataset_name} entries...") + for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] + repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] ref_row = reference_df.iloc[qsl_idx] @@ -140,6 +153,7 @@ def process_livecodebench_batch( entry_metadata.append({ 'entry': entry, 'qsl_idx': qsl_idx, + 'repeat_idx': repeat_idx, 'ref_row': ref_row, 'token_ids': token_ids, 'model_output': model_output, @@ -188,6 +202,7 @@ def process_livecodebench_batch( for idx, metadata in enumerate(entry_metadata): entry = metadata['entry'] qsl_idx = metadata['qsl_idx'] + repeat_idx = metadata['repeat_idx'] token_ids = metadata['token_ids'] model_output = metadata['model_output'] extracted_code = metadata['extracted_code'] @@ -204,6 +219,7 @@ def process_livecodebench_batch( result = { 'seq_id': entry['seq_id'], 'qsl_idx': qsl_idx, + 'repeat_idx': repeat_idx, 'dataset': dataset_name, 'is_correct': is_correct, 'extracted_answer': str(extracted_code)[:200] if extracted_code is not None else None, @@ -218,6 +234,7 @@ def process_livecodebench_batch( if args.save_outputs: output_record = { 'qsl_idx': qsl_idx, + 'repeat_idx': repeat_idx, 'seq_id': entry['seq_id'], 'dataset': dataset_name, 'ground_truth': ground_truth, @@ -327,7 +344,7 @@ def main(): logging.getLogger().setLevel(logging.DEBUG) # Load MLPerf log - mlperf_log = load_mlperf_log(args.mlperf_log) + mlperf_log, num_repeats = load_mlperf_log(args.mlperf_log) # Load reference data logger.info(f"Loading reference data from {args.reference_data}") @@ -336,6 +353,7 @@ def main(): logger.info(f"Reference data shape: {reference_df.shape}") logger.info(f"Reference columns: {list(reference_df.columns)}") + logger.info(f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") # Log unique datasets in reference data if 'dataset' in reference_df.columns: @@ -369,12 +387,17 @@ def main(): logger.info(f"Loading tokenizer: {args.tokenizer}") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - # Group MLPerf log entries by dataset + # Group MLPerf log entries by dataset and track repeats logger.info("Grouping MLPerf log entries by dataset...") dataset_entries = defaultdict(list) + # Track unique (qsl_idx, repeat_idx) pairs to verify coverage + 
qsl_repeat_pairs = set() + for entry in mlperf_log: qsl_idx = entry['qsl_idx'] + repeat_idx = entry.get('repeat_idx', 0) + if qsl_idx >= len(reference_df): logger.warning(f"qsl_idx {qsl_idx} out of range (max: {len(reference_df)-1})") continue @@ -382,10 +405,16 @@ def main(): ref_row = reference_df.iloc[qsl_idx] dataset_name = validate_dataset_name(ref_row['dataset']) dataset_entries[dataset_name].append(entry) + qsl_repeat_pairs.add((qsl_idx, repeat_idx)) + + # Count unique qsl_idx values + unique_qsl_indices = set(pair[0] for pair in qsl_repeat_pairs) logger.info(f"Grouped entries by dataset:") for ds_name, entries in sorted(dataset_entries.items()): - logger.info(f" {ds_name}: {len(entries)} samples") + logger.info(f" {ds_name}: {len(entries)} entries") + logger.info(f"Unique samples (qsl_idx): {len(unique_qsl_indices)}") + logger.info(f"Total entries (samples × repeats): {len(qsl_repeat_pairs)}") # Pre-load LiveCodeBench benchmark if needed lcb_executor = None @@ -408,7 +437,13 @@ def main(): logger.info("\nProcessing MLPerf log entries by dataset...") results = [] - dataset_stats = defaultdict(lambda: {"correct": 0, "total": 0}) + # Track stats for each repeat and aggregated + dataset_stats = defaultdict(lambda: { + "per_repeat": {i: {"correct": 0, "total": 0} for i in range(num_repeats)}, + "aggregated": {"correct": 0, "total": 0} # pass@k: at least one correct + }) + # Track results per (qsl_idx, repeat_idx) for aggregation + qsl_results = defaultdict(lambda: {i: None for i in range(num_repeats)}) outputs_data = [] # For saving detokenized outputs try: @@ -431,16 +466,25 @@ def main(): if args.save_outputs: outputs_data.extend(outputs_batch) - # Update stats + # Update per-repeat stats and track for aggregation for res in results_batch: - dataset_stats[dataset_name]["total"] += 1 - if res['is_correct']: - dataset_stats[dataset_name]["correct"] += 1 + repeat_idx = res['repeat_idx'] + qsl_idx = res['qsl_idx'] + is_correct = res['is_correct'] + + # Track result for aggregation + qsl_results[(dataset_name, qsl_idx)][repeat_idx] = is_correct + + # Update per-repeat stats + dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 + if is_correct: + dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 else: # Sequential evaluation for non-LCB datasets - for entry in tqdm(entries, desc=f"Evaluating {dataset_name}", unit="sample"): + for entry in tqdm(entries, desc=f"Evaluating {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] + repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] ref_row = reference_df.iloc[qsl_idx] @@ -456,7 +500,7 @@ def main(): model_output, ground_truth, dataset_name ) except Exception as e: - logger.warning(f"Evaluation error for qsl_idx={qsl_idx}, dataset={dataset_name}: {e}") + logger.warning(f"Evaluation error for qsl_idx={qsl_idx}, repeat={repeat_idx}, dataset={dataset_name}: {e}") is_correct = False extracted = None eval_details = f"Evaluation error: {e}" @@ -465,6 +509,7 @@ def main(): result = { 'seq_id': seq_id, 'qsl_idx': qsl_idx, + 'repeat_idx': repeat_idx, 'dataset': dataset_name, 'is_correct': is_correct, 'extracted_answer': str(extracted) if extracted is not None else None, @@ -479,6 +524,7 @@ def main(): if args.save_outputs: output_record = { 'qsl_idx': qsl_idx, + 'repeat_idx': repeat_idx, 'seq_id': seq_id, 'dataset': dataset_name, 'ground_truth': ground_truth, @@ -490,10 +536,13 @@ def main(): } outputs_data.append(output_record) - # Update stats - 
dataset_stats[dataset_name]["total"] += 1 + # Track result for this (qsl_idx, repeat_idx) + qsl_results[(dataset_name, qsl_idx)][repeat_idx] = is_correct + + # Update per-repeat stats + dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 if is_correct: - dataset_stats[dataset_name]["correct"] += 1 + dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 finally: # Clean up LiveCodeBench executor @@ -502,25 +551,66 @@ def main(): lcb_executor.shutdown(wait=True) os.environ.pop('TQDM_DISABLE', None) - # Calculate overall stats - total_correct = sum(stats["correct"] for stats in dataset_stats.values()) - total_samples = sum(stats["total"] for stats in dataset_stats.values()) + # Aggregate results across repeats (pass@k logic) + logger.info("\nAggregating results across repeats...") + for (dataset_name, qsl_idx), repeat_results in qsl_results.items(): + # Check if ANY repeat is correct (pass@k) + is_pass_k_correct = any( + result for result in repeat_results.values() if result is not None and result + ) + + # Update aggregated stats + dataset_stats[dataset_name]["aggregated"]["total"] += 1 + if is_pass_k_correct: + dataset_stats[dataset_name]["aggregated"]["correct"] += 1 + + # Calculate overall stats (aggregated pass@k if num_repeats > 1, else per-repeat[0]) + if num_repeats > 1: + total_correct = sum(stats["aggregated"]["correct"] for stats in dataset_stats.values()) + total_samples = sum(stats["aggregated"]["total"] for stats in dataset_stats.values()) + else: + total_correct = sum(stats["per_repeat"][0]["correct"] for stats in dataset_stats.values()) + total_samples = sum(stats["per_repeat"][0]["total"] for stats in dataset_stats.values()) + overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 # Print results print("\n" + "=" * 80) print("MLPerf Accuracy Evaluation Results") print("=" * 80) - print(f"Total samples: {total_samples}") - print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print(f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") + print(f"Total unique samples: {total_samples}") + print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})" if num_repeats > 1 + else f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) - print("\nPer-Dataset Breakdown:") - print("-" * 80) - for dataset_name in sorted(dataset_stats.keys()): - stats = dataset_stats[dataset_name] - accuracy = (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0 - print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + if num_repeats > 1: + print("\nPer-Dataset pass@k Results (aggregated):") + print("-" * 80) + for dataset_name in sorted(dataset_stats.keys()): + stats = dataset_stats[dataset_name]["aggregated"] + if stats["total"] > 0: + accuracy = (stats["correct"] / stats["total"] * 100) + print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + + print("\n" + "=" * 80) + print("Per-Dataset, Per-Repeat Breakdown:") + print("-" * 80) + for dataset_name in sorted(dataset_stats.keys()): + print(f"\n{dataset_name}:") + for repeat_idx in range(num_repeats): + stats = dataset_stats[dataset_name]["per_repeat"][repeat_idx] + if stats["total"] > 0: + accuracy = (stats["correct"] / stats["total"] * 100) + print(f" Repeat {repeat_idx}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + else: + 
print("\nPer-Dataset Breakdown:") + print("-" * 80) + for dataset_name in sorted(dataset_stats.keys()): + stats = dataset_stats[dataset_name]["per_repeat"][0] + if stats["total"] > 0: + accuracy = (stats["correct"] / stats["total"] * 100) + print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") print("=" * 80) @@ -528,8 +618,8 @@ def main(): if args.save_outputs: logger.info(f"Saving detokenized outputs to {args.save_outputs}...") - # Sort by qsl_idx for ordered output - outputs_data_sorted = sorted(outputs_data, key=lambda x: x['qsl_idx']) + # Sort by (qsl_idx, repeat_idx) for ordered output + outputs_data_sorted = sorted(outputs_data, key=lambda x: (x['qsl_idx'], x.get('repeat_idx', 0))) # Convert to DataFrame for easier inspection outputs_df = pd.DataFrame(outputs_data_sorted) @@ -545,19 +635,46 @@ def main(): # Save detailed results if requested if args.output_file: + # Build per-dataset stats + per_dataset_stats = {} + for dataset_name, stats in dataset_stats.items(): + if num_repeats > 1: + # Aggregated pass@k stats + agg_stats = stats["aggregated"] + per_dataset_stats[dataset_name] = { + "pass_k": num_repeats, + "aggregated": { + "correct": agg_stats["correct"], + "total": agg_stats["total"], + "accuracy": (agg_stats["correct"] / agg_stats["total"] * 100) if agg_stats["total"] > 0 else 0.0 + }, + "per_repeat": {} + } + # Add per-repeat stats + for repeat_idx in range(num_repeats): + repeat_stats = stats["per_repeat"][repeat_idx] + if repeat_stats["total"] > 0: + per_dataset_stats[dataset_name]["per_repeat"][repeat_idx] = { + "correct": repeat_stats["correct"], + "total": repeat_stats["total"], + "accuracy": (repeat_stats["correct"] / repeat_stats["total"] * 100) + } + else: + # Single-pass stats + single_stats = stats["per_repeat"][0] + per_dataset_stats[dataset_name] = { + "correct": single_stats["correct"], + "total": single_stats["total"], + "accuracy": (single_stats["correct"] / single_stats["total"] * 100) if single_stats["total"] > 0 else 0.0 + } + output_data = { "summary": { + "num_repeats": num_repeats, "total_samples": total_samples, "total_correct": total_correct, "overall_accuracy": overall_accuracy, - "per_dataset": { - dataset: { - "correct": stats["correct"], - "total": stats["total"], - "accuracy": (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0 - } - for dataset, stats in dataset_stats.items() - } + "per_dataset": per_dataset_stats }, "detailed_results": results if args.verbose else None } diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index 53f436ea4c..fe3348d65c 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -66,9 +66,11 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """ logger.info(f"Received {len(query_samples)} queries") - # Update progress bar total (Offline gets all queries at once) + # Update progress bar total by accumulating (for repeats_per_sample > 1) + # LoadGen may call issue_queries multiple times for repeated sampling if self.progress_bar is not None: - self.progress_bar.total = len(query_samples) + self.progress_bar.total = ( + self.progress_bar.total or 0) + len(query_samples) self.progress_bar.refresh() # Store queries for batch processing diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf index a3b21cad1e..dc6baf3d19 100644 --- a/language/gpt-oss/mlperf/user.conf +++ b/language/gpt-oss/mlperf/user.conf @@ -1,13 +1,15 @@ 
gpt-oss-120b.Offline.target_qps = 10.0 gpt-oss-120b.Offline.min_duration = 60000 -gpt-oss-120b.Offline.min_query_count = 8036 -# gpt-oss-120b.Offline.min_query_count = 1724 +# gpt-oss-120b.Offline.performance_sample_count = 8036 +gpt-oss-120b.Offline.performance_sample_count = 1724 gpt-oss-120b.Server.target_qps = 10 gpt-oss-120b.Server.min_duration = 60000 -gpt-oss-120b.Server.min_query_count = 8036 -# gpt-oss-120b.Server.min_query_count = 1724 +# gpt-oss-120b.Server.performance_sample_count = 8036 +gpt-oss-120b.Server.performance_sample_count = 1724 gpt-oss-120b.Server.target_latency = 0 gpt-oss-120b.Server.ttft_latency = 2000 gpt-oss-120b.Server.tpot_latency = 20 + +gpt-oss-120b.*.repeats_per_sample = 5 diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index 86e588f2c0..9280999e13 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -169,6 +169,13 @@ def create_argument_parser() -> argparse.ArgumentParser: help="Maximum concurrent requests to backend (SGLang handles batching internally)" ) + parser.add_argument( + "--timeout", + type=int, + default=1200, + help="Timeout for HTTP requests in seconds (default: 1200)" + ) + return parser @@ -355,7 +362,7 @@ def do_cleanup(): pool_size = int(args.max_concurrency * 1.1) # 10% safety margin backend = SGLangBackend( server_url=args.server_url, - timeout=1200, + timeout=args.timeout, max_pool_size=pool_size ) else: From d3a7b5859ba349cc191037e371e812c653a762fe Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 1 Dec 2025 10:35:47 -0800 Subject: [PATCH 131/165] add repeats_per_sample option to loadgen --- loadgen/issue_query_controller.cc | 5 +++-- loadgen/issue_query_controller.h | 4 +++- loadgen/loadgen.cc | 32 ++++++++++++++++++++++++++----- loadgen/logging.cc | 16 +++++++++------- loadgen/logging.h | 3 ++- loadgen/test_settings.h | 9 +++++++++ loadgen/test_settings_internal.cc | 4 +++- loadgen/test_settings_internal.h | 2 ++ 8 files changed, 58 insertions(+), 17 deletions(-) diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 4c5ca66f0c..376ed98b8b 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -30,7 +30,8 @@ namespace loadgen { QueryMetadata::QueryMetadata( const std::vector& query_sample_indices, std::chrono::nanoseconds scheduled_delta, - ResponseDelegate* response_delegate, SequenceGen* sequence_gen) + ResponseDelegate* response_delegate, SequenceGen* sequence_gen, + uint64_t repeat_index) : scheduled_delta(scheduled_delta), response_delegate(response_delegate), sequence_id(sequence_gen->NextQueryId()), @@ -38,7 +39,7 @@ QueryMetadata::QueryMetadata( samples_.reserve(query_sample_indices.size()); for (QuerySampleIndex qsi : query_sample_indices) { samples_.push_back({this, sequence_gen->NextSampleId(), qsi, - sequence_gen->NextAccLogRng()}); + sequence_gen->NextAccLogRng(), repeat_index}); } query_to_send.reserve(query_sample_indices.size()); for (auto& s : samples_) { diff --git a/loadgen/issue_query_controller.h b/loadgen/issue_query_controller.h index 5668c574ed..2cdb4e0a92 100644 --- a/loadgen/issue_query_controller.h +++ b/loadgen/issue_query_controller.h @@ -83,6 +83,7 @@ struct SampleMetadata { uint64_t sequence_id; QuerySampleIndex sample_index; double accuracy_log_val; + uint64_t repeat_index; // Index for repeated sampling (0 to k-1) }; /// \brief Maintains data and timing info for a query and all its samples. 
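Note on the repeat_index plumbing above: with repeats_per_sample = k, every loaded sample ends up in mlperf_log_accuracy.json k times, distinguished only by repeat_idx (0 to k-1). A minimal sanity check along those lines — a hypothetical helper, not part of this patch series — mirroring the (qsl_idx, repeat_idx) coverage tracking that eval_mlperf_accuracy.py performs:

    import json
    from collections import defaultdict

    def check_repeat_coverage(log_path: str) -> int:
        """Return the detected repeats_per_sample, raising if any repeat is missing.

        Assumes the accuracy-log fields introduced in this series: each entry
        carries qsl_idx and repeat_idx (repeat_idx is treated as 0 for older logs).
        """
        with open(log_path) as f:
            entries = json.load(f)

        # Collect which repeat indices were logged for each sample.
        repeats_seen = defaultdict(set)
        for entry in entries:
            repeats_seen[entry["qsl_idx"]].add(entry.get("repeat_idx", 0))

        # k is one past the largest repeat index observed anywhere in the log.
        k = 1 + max((max(seen) for seen in repeats_seen.values()), default=0)
        for qsl_idx, seen in repeats_seen.items():
            missing = set(range(k)) - seen
            if missing:
                raise ValueError(f"qsl_idx {qsl_idx} missing repeats {sorted(missing)}")
        return k
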
@@ -90,7 +91,8 @@ class QueryMetadata { public: QueryMetadata(const std::vector& query_sample_indices, std::chrono::nanoseconds scheduled_delta, - ResponseDelegate* response_delegate, SequenceGen* sequence_gen); + ResponseDelegate* response_delegate, SequenceGen* sequence_gen, + uint64_t repeat_index = 0); QueryMetadata(QueryMetadata&& src); void NotifyOneSampleCompleted(PerfClock::time_point timestamp); diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 42b2140de2..0a5d2585ab 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -121,7 +121,8 @@ struct ResponseDelegateDetailed : public ResponseDelegate { if (sample_data_copy) { log.LogAccuracy(sample->sequence_id, sample->sample_index, - LogBinaryAsHexString{sample_data_copy}, n_tokens); + LogBinaryAsHexString{sample_data_copy}, n_tokens, + sample->repeat_index); delete sample_data_copy; } @@ -263,6 +264,12 @@ std::vector GenerateQueries( auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); }); + // In PerformanceOnly mode, repeats_per_sample must be 1 + if (mode != TestMode::AccuracyOnly) { + assert(settings.repeats_per_sample == 1 && + "repeats_per_sample must be 1 in PerformanceOnly mode"); + } + auto& loaded_samples = loaded_sample_set.set; // Generate 2x more samples than we think we'll need given the expected @@ -287,6 +294,11 @@ std::vector GenerateQueries( // For MultiStream, loaded samples is properly padded. // For Offline, we create a 'remainder' query at the end of this function. min_queries = loaded_samples.size() / samples_per_query; + + // For repeated sampling, multiply min_queries by repeats + if (mode == TestMode::AccuracyOnly) { + min_queries *= settings.repeats_per_sample; + } } std::vector queries; @@ -395,9 +407,15 @@ std::vector GenerateQueries( : sample_distribution(sample_rng)]; } } - queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); - prev_timestamp = timestamp; - timestamp += schedule_distribution(schedule_rng); + + // Handle repeated sampling: create repeats_per_sample queries for the same sample(s) + // In PerformanceOnly mode, this is always 1 (single query per sample) + for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { + queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, k); + prev_timestamp = timestamp; + timestamp += schedule_distribution(schedule_rng); + } + // In equal_issue mode, the min_queries will be bumped up by a multiple of // the dataset size if the test time has not met the threshold. if (enable_equal_issue && (queries.size() >= min_queries) && @@ -417,7 +435,11 @@ std::vector GenerateQueries( for (auto& s : samples) { s = loaded_samples[sample_distribution(sample_rng)]; } - queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); + + // Handle repeated sampling for remainder query as well + for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { + queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, k); + } } } diff --git a/loadgen/logging.cc b/loadgen/logging.cc index d7e83e54b9..d9d4883be7 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -280,22 +280,24 @@ void AsyncLog::StopTrace() { void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, const LogBinaryAsHexString& response, - int64_t n_tokens = 0) { + int64_t n_tokens, uint64_t repeat_index) { std::unique_lock lock(log_mutex_); if (!accuracy_out_) { return; } *accuracy_out_ << (accuracy_needs_comma_ ? 
",\n{ " : "\n{ "); + if (!use_tokens_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, + "repeat_idx", repeat_index, "data", response); } else if (!needs_first_token_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response, "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, + "repeat_idx", repeat_index, "data", response, "token_count", n_tokens); } else { const size_t i = seq_id - latencies_first_sample_sequence_id_; - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response, "token_data", token_records_[i], "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, + "repeat_idx", repeat_index, "data", response, "token_data", + token_records_[i], "token_count", n_tokens); } *accuracy_out_ << " }"; diff --git a/loadgen/logging.h b/loadgen/logging.h index 8f1a398e9d..2c770438d6 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -213,7 +213,8 @@ class AsyncLog { void SetCurrentPidTid(uint64_t pid, uint64_t tid); void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, - const LogBinaryAsHexString& response, int64_t n_tokens); + const LogBinaryAsHexString& response, int64_t n_tokens, + uint64_t repeat_index = 0); void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response); template diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 584d073bb8..5470277e2c 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -271,6 +271,15 @@ struct TestSettings { /// \brief Infer token latencies bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; + + /// \brief Enable repeated sampling in accuracy mode + /// \details When enabled, each sample is issued + /// repeats_per_sample times and multiple responses are collected. + /// This is used for code generation benchmarks like gpt-oss where multiple + /// solutions are generated and evaluated (pass@k metric). + /// Default is 1 (single sample). Set to k (e.g., 5 for pass@5) to enable. + /// Must be 1 for PerformanceOnly mode. + uint64_t repeats_per_sample = 1; /**@}*/ }; diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 3f2cd88473..425d4db833 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -54,7 +54,8 @@ TestSettingsInternal::TestSettingsInternal( server_ttft_latency(requested.server_ttft_latency), server_tpot_latency(requested.server_tpot_latency), infer_token_latencies(requested.infer_token_latencies), - token_latency_scaling_factor(requested.token_latency_scaling_factor) { + token_latency_scaling_factor(requested.token_latency_scaling_factor), + repeats_per_sample(requested.repeats_per_sample) { // Target QPS, target latency, and max_async_queries. 
switch (requested.scenario) { case TestScenario::SingleStream: @@ -779,6 +780,7 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr); lookupkv(model, scenario, "performance_sample_count_override", &performance_sample_count_override, nullptr); + lookupkv(model, scenario, "repeats_per_sample", &repeats_per_sample, nullptr); lookupkv(model, "SingleStream", "target_latency", nullptr, &single_stream_expected_latency_ns, 1000 * 1000); lookupkv(model, "MultiStream", "target_latency", nullptr, diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index ab2773bd18..44a430d869 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -88,6 +88,8 @@ struct TestSettingsInternal { bool infer_token_latencies = false; int64_t token_latency_scaling_factor; + + uint64_t repeats_per_sample; }; /// \brief A namespace of collections of FindPeakPerformance helper functions, From 50051f20eb2ebe571e1f072706a2fe1617abcf6d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 1 Dec 2025 18:46:28 +0000 Subject: [PATCH 132/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_mlperf_accuracy.py | 294 +++++++++++++---------- language/gpt-oss/mlperf/offline_sut.py | 20 +- loadgen/issue_query_controller.cc | 12 +- loadgen/loadgen.cc | 19 +- loadgen/logging.cc | 17 +- loadgen/test_settings.h | 2 +- loadgen/test_settings_internal.h | 2 +- 7 files changed, 204 insertions(+), 162 deletions(-) diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index bc17253a54..34bbbc02fd 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -12,6 +12,10 @@ --output-file accuracy_results.json """ +from eval_accuracy import ( + get_evaluator, validate_dataset_name, validate_text_input, DATASET_EVALUATORS, + evaluate_livecodebench_worker, load_lcb_benchmark +) import argparse import json import logging @@ -31,10 +35,6 @@ # Import evaluation functions from the existing script import sys sys.path.insert(0, str(Path(__file__).parent)) -from eval_accuracy import ( - get_evaluator, validate_dataset_name, validate_text_input, DATASET_EVALUATORS, - evaluate_livecodebench_worker, load_lcb_benchmark -) logging.basicConfig( level=logging.INFO, @@ -45,10 +45,10 @@ def load_mlperf_log(log_path: str) -> Tuple[List[Dict[str, Any]], int]: """Load MLPerf accuracy JSON log. - + Args: log_path: Path to mlperf_log_accuracy.json - + Returns: Tuple of (log_data, num_repeats) - log_data: List of log entries with seq_id, qsl_idx, repeat_idx, data (hex), token_count @@ -57,49 +57,50 @@ def load_mlperf_log(log_path: str) -> Tuple[List[Dict[str, Any]], int]: logger.info(f"Loading MLPerf log from {log_path}") with open(log_path, 'r') as f: log_data = json.load(f) - + # Detect number of repeats from repeat_idx field max_repeat_idx = 0 for entry in log_data: repeat_idx = entry.get('repeat_idx', 0) max_repeat_idx = max(max_repeat_idx, repeat_idx) - + num_repeats = max_repeat_idx + 1 - + logger.info(f"Loaded {len(log_data)} log entries") - logger.info(f"Detected repeats_per_sample = {num_repeats} (pass@{num_repeats} format)") - + logger.info( + f"Detected repeats_per_sample = {num_repeats} (pass@{num_repeats} format)") + return log_data, num_repeats def decode_hex_to_tokens(hex_data: str) -> List[int]: """Decode hex string to list of token IDs (int32). 
- + MLPerf stores token IDs as hex-encoded int32 array. - + Args: hex_data: Hex string like "450D0300..." - + Returns: List of token IDs """ # Convert hex string to bytes data_bytes = bytes.fromhex(hex_data) - + # Unpack as int32 array (little-endian) num_tokens = len(data_bytes) // 4 token_ids = struct.unpack(f'<{num_tokens}i', data_bytes) - + return list(token_ids) def detokenize(token_ids: List[int], tokenizer) -> str: """Convert token IDs to text. - + Args: token_ids: List of integer token IDs tokenizer: HuggingFace tokenizer - + Returns: Decoded text string """ @@ -116,7 +117,7 @@ def process_livecodebench_batch( args ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """Process a batch of LiveCodeBench entries in parallel. - + Args: entries: List of MLPerf log entries for this dataset reference_df: Reference DataFrame @@ -125,31 +126,32 @@ def process_livecodebench_batch( lcb_executor: ProcessPoolExecutor for parallel evaluation dataset_name: Dataset name args: Command line arguments - + Returns: Tuple of (results_list, outputs_list) """ # First pass: decode and parse all entries work_items = [] - entry_metadata = [] # Store (entry, qsl_idx, ref_row, token_ids, model_output) - + # Store (entry, qsl_idx, ref_row, token_ids, model_output) + entry_metadata = [] + logger.info(f"Parsing {len(entries)} {dataset_name} entries...") for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] - + ref_row = reference_df.iloc[qsl_idx] ground_truth = ref_row.get('ground_truth', None) - + # Decode tokens to text token_ids = decode_hex_to_tokens(hex_data) model_output = detokenize(token_ids, tokenizer) - + # Parse code from model output extracted_code = evaluator['parse'](model_output) - + entry_metadata.append({ 'entry': entry, 'qsl_idx': qsl_idx, @@ -160,44 +162,49 @@ def process_livecodebench_batch( 'extracted_code': extracted_code, 'ground_truth': ground_truth }) - + # Add to work queue if code was extracted if extracted_code is not None and not pd.isna(ground_truth): work_items.append((extracted_code, ground_truth)) else: work_items.append(None) # Placeholder for skipped items - + # Second pass: batch evaluate code in parallel - logger.info(f"Evaluating {len([w for w in work_items if w is not None])} {dataset_name} code samples with parallel workers...") - + logger.info( + f"Evaluating {len([w for w in work_items if w is not None])} {dataset_name} code samples with parallel workers...") + results_list = [] outputs_list = [] - + # Submit all work items future_to_idx = {} for idx, work_item in enumerate(work_items): if work_item is not None: - future = lcb_executor.submit(evaluate_livecodebench_worker, work_item) + future = lcb_executor.submit( + evaluate_livecodebench_worker, work_item) future_to_idx[future] = idx - + # Collect results with progress bar eval_results = [None] * len(work_items) - + for future in tqdm(as_completed(future_to_idx.keys(), timeout=1200), - total=len(future_to_idx), - desc=f"Evaluating {dataset_name}", - unit="sample"): + total=len(future_to_idx), + desc=f"Evaluating {dataset_name}", + unit="sample"): idx = future_to_idx[future] try: - question_id, is_correct, detailed_reason = future.result(timeout=80) + question_id, is_correct, detailed_reason = future.result( + timeout=80) eval_results[idx] = (is_correct, detailed_reason) except TimeoutError: - logger.warning(f"Timeout evaluating sample {idx}: Test execution exceeded 80s timeout") - 
eval_results[idx] = (False, "Timeout: Test execution exceeded time limit") + logger.warning( + f"Timeout evaluating sample {idx}: Test execution exceeded 80s timeout") + eval_results[idx] = ( + False, "Timeout: Test execution exceeded time limit") except Exception as e: logger.error(f"Error evaluating sample {idx}: {e}") eval_results[idx] = (False, f"Error: {e}") - + # Third pass: compile final results for idx, metadata in enumerate(entry_metadata): entry = metadata['entry'] @@ -207,14 +214,14 @@ def process_livecodebench_batch( model_output = metadata['model_output'] extracted_code = metadata['extracted_code'] ground_truth = metadata['ground_truth'] - + # Get evaluation result if extracted_code is None or pd.isna(ground_truth): is_correct = False eval_details = "No code extracted from model output" if extracted_code is None else "No ground truth available" else: is_correct, eval_details = eval_results[idx] - + # Record result result = { 'seq_id': entry['seq_id'], @@ -229,7 +236,7 @@ def process_livecodebench_batch( 'model_output_preview': model_output[:200] if args.verbose else None } results_list.append(result) - + # Store output data if requested if args.save_outputs: output_record = { @@ -245,7 +252,7 @@ def process_livecodebench_batch( 'evaluation_details': eval_details } outputs_list.append(output_record) - + return results_list, outputs_list @@ -255,17 +262,17 @@ def evaluate_single_entry( dataset_name: str ) -> Tuple[bool, Any, str]: """Evaluate a single model output. - + Args: model_output: Generated text from model ground_truth: Expected answer dataset_name: Dataset name (e.g., 'gpqa', 'math500') - + Returns: Tuple of (is_correct, extracted_answer, evaluation_details) """ evaluator = get_evaluator(dataset_name) - + # Parse answer from model output extracted = evaluator['parse'](model_output) @@ -288,7 +295,7 @@ def evaluate_single_entry( logger.warning(f"Error evaluating: {e}") else: evaluation_details = "No ground truth available" - + return is_correct, extracted, evaluation_details @@ -337,37 +344,40 @@ def main(): action="store_true", help="Verbose logging" ) - + args = parser.parse_args() - + if args.verbose: logging.getLogger().setLevel(logging.DEBUG) - + # Load MLPerf log mlperf_log, num_repeats = load_mlperf_log(args.mlperf_log) - + # Load reference data logger.info(f"Loading reference data from {args.reference_data}") with open(args.reference_data, 'rb') as f: reference_df = pickle.load(f) - + logger.info(f"Reference data shape: {reference_df.shape}") logger.info(f"Reference columns: {list(reference_df.columns)}") - logger.info(f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") - + logger.info( + f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") + # Log unique datasets in reference data if 'dataset' in reference_df.columns: unique_datasets = reference_df['dataset'].unique() dataset_counts = reference_df['dataset'].value_counts() - logger.info(f"Unique datasets in reference data ({len(unique_datasets)} total):") + logger.info( + f"Unique datasets in reference data ({len(unique_datasets)} total):") for ds in sorted(unique_datasets): logger.info(f" '{ds}' ({dataset_counts[ds]} samples)") - + logger.info("\nSample rows from reference data:") for idx in [0, 1, 2]: if idx < len(reference_df): - logger.info(f" Row {idx}: dataset='{reference_df.iloc[idx]['dataset']}'") - + logger.info( + f" Row {idx}: dataset='{reference_df.iloc[idx]['dataset']}'") + # Show how each will be mapped to 
evaluators logger.info("\nExpected Dataset → Evaluator mapping:") for ds in sorted(unique_datasets): @@ -379,83 +389,90 @@ def main(): if key in ds_lower: matched_key = key break - logger.info(f" '{ds}' (normalized: '{ds_lower}') → '{matched_key}'") + logger.info( + f" '{ds}' (normalized: '{ds_lower}') → '{matched_key}'") except Exception as e: logger.warning(f" '{ds}' → ERROR: {e}") - + # Load tokenizer logger.info(f"Loading tokenizer: {args.tokenizer}") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - + # Group MLPerf log entries by dataset and track repeats logger.info("Grouping MLPerf log entries by dataset...") dataset_entries = defaultdict(list) - + # Track unique (qsl_idx, repeat_idx) pairs to verify coverage qsl_repeat_pairs = set() - + for entry in mlperf_log: qsl_idx = entry['qsl_idx'] repeat_idx = entry.get('repeat_idx', 0) - + if qsl_idx >= len(reference_df): - logger.warning(f"qsl_idx {qsl_idx} out of range (max: {len(reference_df)-1})") + logger.warning( + f"qsl_idx {qsl_idx} out of range (max: {len(reference_df)-1})") continue - + ref_row = reference_df.iloc[qsl_idx] dataset_name = validate_dataset_name(ref_row['dataset']) dataset_entries[dataset_name].append(entry) qsl_repeat_pairs.add((qsl_idx, repeat_idx)) - + # Count unique qsl_idx values unique_qsl_indices = set(pair[0] for pair in qsl_repeat_pairs) - + logger.info(f"Grouped entries by dataset:") for ds_name, entries in sorted(dataset_entries.items()): logger.info(f" {ds_name}: {len(entries)} entries") logger.info(f"Unique samples (qsl_idx): {len(unique_qsl_indices)}") logger.info(f"Total entries (samples × repeats): {len(qsl_repeat_pairs)}") - + # Pre-load LiveCodeBench benchmark if needed lcb_executor = None if any('livecodebench' in ds for ds in dataset_entries.keys()): try: - logger.info("Pre-loading LiveCodeBench benchmark for parallel evaluation...") + logger.info( + "Pre-loading LiveCodeBench benchmark for parallel evaluation...") os.environ['TQDM_DISABLE'] = '1' # Disable tqdm in workers _ = load_lcb_benchmark() logger.info("LiveCodeBench benchmark loaded successfully") - + # Create shared ProcessPoolExecutor for all LCB evaluations - max_workers = min(multiprocessing.cpu_count(), args.num_lcb_workers) + max_workers = min( + multiprocessing.cpu_count(), + args.num_lcb_workers) lcb_executor = ProcessPoolExecutor(max_workers=max_workers) - logger.info(f"Created ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") + logger.info( + f"Created ProcessPoolExecutor with {max_workers} workers for LiveCodeBench") except Exception as e: logger.warning(f"Failed to pre-load LiveCodeBench benchmark: {e}") logger.warning("LiveCodeBench evaluation may be slower") - + # Process each dataset separately with its own progress bar logger.info("\nProcessing MLPerf log entries by dataset...") - + results = [] # Track stats for each repeat and aggregated dataset_stats = defaultdict(lambda: { "per_repeat": {i: {"correct": 0, "total": 0} for i in range(num_repeats)}, - "aggregated": {"correct": 0, "total": 0} # pass@k: at least one correct + # pass@k: at least one correct + "aggregated": {"correct": 0, "total": 0} }) # Track results per (qsl_idx, repeat_idx) for aggregation qsl_results = defaultdict(lambda: {i: None for i in range(num_repeats)}) outputs_data = [] # For saving detokenized outputs - + try: for dataset_name in sorted(dataset_entries.keys()): entries = dataset_entries[dataset_name] logger.info(f"\n{'=' * 80}") logger.info(f"Processing {dataset_name}: {len(entries)} samples") logger.info(f"{'=' * 
80}") - + evaluator = get_evaluator(dataset_name) is_livecodebench = 'livecodebench' in dataset_name.lower() - + if is_livecodebench and lcb_executor is not None: # Batched LiveCodeBench evaluation results_batch, outputs_batch = process_livecodebench_batch( @@ -465,46 +482,49 @@ def main(): results.extend(results_batch) if args.save_outputs: outputs_data.extend(outputs_batch) - + # Update per-repeat stats and track for aggregation for res in results_batch: repeat_idx = res['repeat_idx'] qsl_idx = res['qsl_idx'] is_correct = res['is_correct'] - + # Track result for aggregation - qsl_results[(dataset_name, qsl_idx)][repeat_idx] = is_correct - + qsl_results[(dataset_name, qsl_idx) + ][repeat_idx] = is_correct + # Update per-repeat stats dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 if is_correct: dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 else: # Sequential evaluation for non-LCB datasets - for entry in tqdm(entries, desc=f"Evaluating {dataset_name}", unit="entry"): + for entry in tqdm( + entries, desc=f"Evaluating {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] - + ref_row = reference_df.iloc[qsl_idx] ground_truth = ref_row.get('ground_truth', None) - + # Decode tokens to text token_ids = decode_hex_to_tokens(hex_data) model_output = detokenize(token_ids, tokenizer) - + # Evaluate try: is_correct, extracted, eval_details = evaluate_single_entry( model_output, ground_truth, dataset_name ) except Exception as e: - logger.warning(f"Evaluation error for qsl_idx={qsl_idx}, repeat={repeat_idx}, dataset={dataset_name}: {e}") + logger.warning( + f"Evaluation error for qsl_idx={qsl_idx}, repeat={repeat_idx}, dataset={dataset_name}: {e}") is_correct = False extracted = None eval_details = f"Evaluation error: {e}" - + # Record result result = { 'seq_id': seq_id, @@ -519,7 +539,7 @@ def main(): 'model_output_preview': model_output[:200] if args.verbose else None } results.append(result) - + # Store output data for pickle export if args.save_outputs: output_record = { @@ -535,22 +555,23 @@ def main(): 'evaluation_details': eval_details } outputs_data.append(output_record) - + # Track result for this (qsl_idx, repeat_idx) - qsl_results[(dataset_name, qsl_idx)][repeat_idx] = is_correct - + qsl_results[(dataset_name, qsl_idx) + ][repeat_idx] = is_correct + # Update per-repeat stats dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 if is_correct: dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 - + finally: # Clean up LiveCodeBench executor if lcb_executor is not None: logger.info("Shutting down LiveCodeBench ProcessPoolExecutor") lcb_executor.shutdown(wait=True) os.environ.pop('TQDM_DISABLE', None) - + # Aggregate results across repeats (pass@k logic) logger.info("\nAggregating results across repeats...") for (dataset_name, qsl_idx), repeat_results in qsl_results.items(): @@ -558,32 +579,41 @@ def main(): is_pass_k_correct = any( result for result in repeat_results.values() if result is not None and result ) - + # Update aggregated stats dataset_stats[dataset_name]["aggregated"]["total"] += 1 if is_pass_k_correct: dataset_stats[dataset_name]["aggregated"]["correct"] += 1 - - # Calculate overall stats (aggregated pass@k if num_repeats > 1, else per-repeat[0]) + + # Calculate overall stats (aggregated pass@k if num_repeats > 1, else + # per-repeat[0]) if num_repeats > 1: - total_correct = sum(stats["aggregated"]["correct"] 
for stats in dataset_stats.values()) - total_samples = sum(stats["aggregated"]["total"] for stats in dataset_stats.values()) + total_correct = sum(stats["aggregated"]["correct"] + for stats in dataset_stats.values()) + total_samples = sum(stats["aggregated"]["total"] + for stats in dataset_stats.values()) else: - total_correct = sum(stats["per_repeat"][0]["correct"] for stats in dataset_stats.values()) - total_samples = sum(stats["per_repeat"][0]["total"] for stats in dataset_stats.values()) - - overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 - + total_correct = sum(stats["per_repeat"][0]["correct"] + for stats in dataset_stats.values()) + total_samples = sum(stats["per_repeat"][0]["total"] + for stats in dataset_stats.values()) + + overall_accuracy = ( + total_correct / + total_samples * + 100) if total_samples > 0 else 0.0 + # Print results print("\n" + "=" * 80) print("MLPerf Accuracy Evaluation Results") print("=" * 80) - print(f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") + print(f"Evaluation mode: pass@{num_repeats}" if num_repeats > + 1 else "Evaluation mode: single-pass") print(f"Total unique samples: {total_samples}") - print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})" if num_repeats > 1 + print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})" if num_repeats > 1 else f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) - + if num_repeats > 1: print("\nPer-Dataset pass@k Results (aggregated):") print("-" * 80) @@ -591,8 +621,9 @@ def main(): stats = dataset_stats[dataset_name]["aggregated"] if stats["total"] > 0: accuracy = (stats["correct"] / stats["total"] * 100) - print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") - + print( + f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + print("\n" + "=" * 80) print("Per-Dataset, Per-Repeat Breakdown:") print("-" * 80) @@ -602,7 +633,8 @@ def main(): stats = dataset_stats[dataset_name]["per_repeat"][repeat_idx] if stats["total"] > 0: accuracy = (stats["correct"] / stats["total"] * 100) - print(f" Repeat {repeat_idx}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + print( + f" Repeat {repeat_idx}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") else: print("\nPer-Dataset Breakdown:") print("-" * 80) @@ -610,29 +642,34 @@ def main(): stats = dataset_stats[dataset_name]["per_repeat"][0] if stats["total"] > 0: accuracy = (stats["correct"] / stats["total"] * 100) - print(f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") - + print( + f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + print("=" * 80) - + # Save detokenized outputs to pickle if requested if args.save_outputs: logger.info(f"Saving detokenized outputs to {args.save_outputs}...") - + # Sort by (qsl_idx, repeat_idx) for ordered output - outputs_data_sorted = sorted(outputs_data, key=lambda x: (x['qsl_idx'], x.get('repeat_idx', 0))) - + outputs_data_sorted = sorted( + outputs_data, key=lambda x: ( + x['qsl_idx'], x.get( + 'repeat_idx', 0))) + # Convert to DataFrame for easier inspection outputs_df = pd.DataFrame(outputs_data_sorted) - + output_path = Path(args.save_outputs) output_path.parent.mkdir(parents=True, exist_ok=True) - + with open(output_path, 'wb') as f: 
pickle.dump(outputs_df, f) - - logger.info(f"Saved {len(outputs_df)} detokenized outputs (ordered by qsl_idx) to: {output_path}") + + logger.info( + f"Saved {len(outputs_df)} detokenized outputs (ordered by qsl_idx) to: {output_path}") logger.info(f"Columns: {list(outputs_df.columns)}") - + # Save detailed results if requested if args.output_file: # Build per-dataset stats @@ -678,18 +715,17 @@ def main(): }, "detailed_results": results if args.verbose else None } - + output_path = Path(args.output_file) output_path.parent.mkdir(parents=True, exist_ok=True) - + with open(output_path, 'w') as f: json.dump(output_data, f, indent=2) - + logger.info(f"Results saved to: {output_path}") - + logger.info("Evaluation complete!") if __name__ == "__main__": main() - diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss/mlperf/offline_sut.py index fe3348d65c..7c4e8e9ee4 100644 --- a/language/gpt-oss/mlperf/offline_sut.py +++ b/language/gpt-oss/mlperf/offline_sut.py @@ -95,9 +95,10 @@ def process_single_query(query_sample): """Process a single query (backend batches automatically via continuous batching).""" # Check if we should stop (e.g., KeyboardInterrupt) if self.should_stop.is_set(): - logger.info(f"Skipping query {query_sample.id} due to shutdown") + logger.info( + f"Skipping query {query_sample.id} due to shutdown") return None, None, None - + query_id = query_sample.id input_ids = self.dataset[query_sample.index] @@ -128,23 +129,26 @@ def process_single_query(query_sample): # Process results as they complete completed_count = 0 cancelled_count = 0 - + for future in as_completed(futures): # Check if shutdown was requested if self.should_stop.is_set(): - logger.info("Shutdown requested, cancelling remaining futures...") + logger.info( + "Shutdown requested, cancelling remaining futures...") for f in futures: f.cancel() - cancelled_count = sum(1 for f in futures if f.cancelled()) - logger.info(f"Cancelled {cancelled_count} pending futures") + cancelled_count = sum( + 1 for f in futures if f.cancelled()) + logger.info( + f"Cancelled {cancelled_count} pending futures") break try: query_id, query_sample, response = future.result() - + # Skip if query was cancelled/skipped if query_id is None: continue - + output_ids = response.get("output_ids", []) # Store results diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 376ed98b8b..832be62ba8 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -460,8 +460,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Too many outstanding queries." - << " issued " << queries_issued_total << " outstanding " + << " Ending early: Too many outstanding queries." << " issued " + << queries_issued_total << " outstanding " << queries_outstanding; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else @@ -500,8 +500,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max query count reached." - << " query_count " << queries_issued; + << " Ending early: Max query count reached." 
<< " query_count " + << queries_issued; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), @@ -520,8 +520,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max test duration reached." - << " duration_ns " << duration.count(); + << " Ending early: Max test duration reached." << " duration_ns " + << duration.count(); MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 0a5d2585ab..a828d350c8 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -294,7 +294,7 @@ std::vector GenerateQueries( // For MultiStream, loaded samples is properly padded. // For Offline, we create a 'remainder' query at the end of this function. min_queries = loaded_samples.size() / samples_per_query; - + // For repeated sampling, multiply min_queries by repeats if (mode == TestMode::AccuracyOnly) { min_queries *= settings.repeats_per_sample; @@ -407,15 +407,17 @@ std::vector GenerateQueries( : sample_distribution(sample_rng)]; } } - - // Handle repeated sampling: create repeats_per_sample queries for the same sample(s) - // In PerformanceOnly mode, this is always 1 (single query per sample) + + // Handle repeated sampling: create repeats_per_sample queries for the same + // sample(s) In PerformanceOnly mode, this is always 1 (single query per + // sample) for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { - queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, k); + queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, + k); prev_timestamp = timestamp; timestamp += schedule_distribution(schedule_rng); } - + // In equal_issue mode, the min_queries will be bumped up by a multiple of // the dataset size if the test time has not met the threshold. if (enable_equal_issue && (queries.size() >= min_queries) && @@ -435,10 +437,11 @@ std::vector GenerateQueries( for (auto& s : samples) { s = loaded_samples[sample_distribution(sample_rng)]; } - + // Handle repeated sampling for remainder query as well for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { - queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, k); + queries.emplace_back(samples, timestamp, response_delegate, + sequence_gen, k); } } } diff --git a/loadgen/logging.cc b/loadgen/logging.cc index d9d4883be7..f7aa1271a8 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -288,16 +288,16 @@ void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, *accuracy_out_ << (accuracy_needs_comma_ ? 
",\n{ " : "\n{ "); if (!use_tokens_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, - "repeat_idx", repeat_index, "data", response); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", + repeat_index, "data", response); } else if (!needs_first_token_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, - "repeat_idx", repeat_index, "data", response, "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", + repeat_index, "data", response, "token_count", n_tokens); } else { const size_t i = seq_id - latencies_first_sample_sequence_id_; - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, - "repeat_idx", repeat_index, "data", response, "token_data", - token_records_[i], "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", + repeat_index, "data", response, "token_data", token_records_[i], + "token_count", n_tokens); } *accuracy_out_ << " }"; @@ -814,8 +814,7 @@ void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) { if (max_entry_vector_size > kTlsLogReservedEntryCount) { #if USE_NEW_LOGGING_FORMAT std::stringstream msg; - msg << "Logging allocation detected:" - << " tid: " << tls_logger->Tid() + msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid() << " reserved_entries: " << kTlsLogReservedEntryCount << " max_entries: " << max_entry_vector_size; MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str()); diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 5470277e2c..5d49d3a012 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -271,7 +271,7 @@ struct TestSettings { /// \brief Infer token latencies bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; - + /// \brief Enable repeated sampling in accuracy mode /// \details When enabled, each sample is issued /// repeats_per_sample times and multiple responses are collected. 
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 44a430d869..3aad54fa8e 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -88,7 +88,7 @@ struct TestSettingsInternal { bool infer_token_latencies = false; int64_t token_latency_scaling_factor; - + uint64_t repeats_per_sample; }; From bee73b2e95d24583af3e73db97e256829c48ab45 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 2 Dec 2025 09:34:18 -0800 Subject: [PATCH 133/165] fix harmonize tokens -> text --- language/gpt-oss/eval_accuracy.py | 8 +--- .../gpt-oss/preprocess/harmonize_inputs.py | 38 ++++++------------- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 2aa2aa0eae..c8308813a2 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -80,12 +80,8 @@ def validate_dataframe(df: pd.DataFrame) -> None: raise ValueError("Missing required column: 'dataset'") # Check for tok_model_output_len (either single or with suffixes) - has_tok_len = False - if pass_k == 1: - has_tok_len = 'tok_model_output_len' in df.columns - else: - has_tok_len = all( - f'tok_model_output_len_{i}' in df.columns for i in range(pass_k)) + has_tok_len = all( + f'tok_model_output_len_{i}' in df.columns for i in range(pass_k)) if not has_tok_len: raise ValueError("Missing required tok_model_output_len column(s)") diff --git a/language/gpt-oss/preprocess/harmonize_inputs.py b/language/gpt-oss/preprocess/harmonize_inputs.py index 510ec81195..9f9d16293c 100644 --- a/language/gpt-oss/preprocess/harmonize_inputs.py +++ b/language/gpt-oss/preprocess/harmonize_inputs.py @@ -12,6 +12,7 @@ import pandas as pd from multiprocessing import Pool, cpu_count from tqdm import tqdm +from transformers import AutoTokenizer from openai_harmony import ( load_harmony_encoding, HarmonyEncodingName, @@ -482,6 +483,11 @@ def process_row(args): successful_count = 0 error_count = 0 + # Load tokenizer for decoding tokens to text + print("Loading tokenizer...") + tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-120b", trust_remote_code=True) + print("Tokenizer loaded successfully") + # Initialize columns for harmony tokenized input df['tok_input'] = None df['tok_input_len'] = None @@ -500,34 +506,14 @@ def process_row(args): index, convo, tokens, dataset_name = result successful_count += 1 - # Convert conversation to string format - conversation_parts = [] - for message in convo.messages: - # Get role from message.author.role - role = message.author.role.value if hasattr( - message.author.role, 'value') else str( - message.author.role) - - # Get content from message.content (which is a list) - content_parts = [] - for content_item in message.content: - if hasattr(content_item, 'text'): - content_parts.append(content_item.text) - else: - content_parts.append(str(content_item)) - content = ' '.join(content_parts) - - # Format as "Role: content" - conversation_parts.append(f"{role}: {content}") - - conversation_string = '\n'.join(conversation_parts) + # Decode tokens to text using the tokenizer + tokens_list = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens) + text_input = tokenizer.decode(tokens_list, skip_special_tokens=False) # Update the original DataFrame with successful data - df.at[index, 'tok_input'] = tokens.tolist() if hasattr( - tokens, 'tolist') else list(tokens) - df.at[index, 'tok_input_len'] = len( - tokens) if hasattr(tokens, '__len__') else 0 - df.at[index, 
'text_input'] = conversation_string + df.at[index, 'tok_input'] = tokens_list + df.at[index, 'tok_input_len'] = len(tokens_list) + df.at[index, 'text_input'] = text_input # Verify input and output have identical column lists input_columns = list(pd.read_pickle(args.data_file).columns) From 5039fd6d01583ad402b8078aeaa07f22187f0234 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 2 Dec 2025 17:34:49 +0000 Subject: [PATCH 134/165] [Automated Commit] Format Codebase --- language/gpt-oss/preprocess/harmonize_inputs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/preprocess/harmonize_inputs.py b/language/gpt-oss/preprocess/harmonize_inputs.py index 9f9d16293c..07830b34c7 100644 --- a/language/gpt-oss/preprocess/harmonize_inputs.py +++ b/language/gpt-oss/preprocess/harmonize_inputs.py @@ -485,7 +485,8 @@ def process_row(args): # Load tokenizer for decoding tokens to text print("Loading tokenizer...") - tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-120b", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + "openai/gpt-oss-120b", trust_remote_code=True) print("Tokenizer loaded successfully") # Initialize columns for harmony tokenized input @@ -508,7 +509,8 @@ def process_row(args): # Decode tokens to text using the tokenizer tokens_list = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens) - text_input = tokenizer.decode(tokens_list, skip_special_tokens=False) + text_input = tokenizer.decode( + tokens_list, skip_special_tokens=False) # Update the original DataFrame with successful data df.at[index, 'tok_input'] = tokens_list From db4d2900652403012a3d43ccdb04fa2ba0e78e49 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 2 Dec 2025 09:38:27 -0800 Subject: [PATCH 135/165] remove file --- language/gpt-oss/run_infer_trtllm.py | 642 --------------------------- 1 file changed, 642 deletions(-) delete mode 100644 language/gpt-oss/run_infer_trtllm.py diff --git a/language/gpt-oss/run_infer_trtllm.py b/language/gpt-oss/run_infer_trtllm.py deleted file mode 100644 index adac3c6271..0000000000 --- a/language/gpt-oss/run_infer_trtllm.py +++ /dev/null @@ -1,642 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to send text prompts to TensorRT-LLM server via OpenAI completions endpoint. -Supports round-robin load balancing across multiple server endpoints. 
- -Usage: - python run_infer_trtllm.py --input-tokens tokenized_data.pkl [options] - -Arguments: - --input-tokens Path to pickle file containing data with text_input column from harmony-tokens.py - --server-url TensorRT-LLM server URL(s) - comma-separated for round-robin (e.g., "localhost:8000,localhost:8001") - --max-samples Maximum number of samples to process (default: all) - --max-tokens Maximum tokens to generate per request (default: 100) - --max-concurrency Maximum number of concurrent requests (default: 256) - --output Output pickle file for responses (optional) - --pass-k Number of inference passes per sample for pass@k strategy (default: 1) - -Examples: - # Single server - python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000 - - # Multiple servers with round-robin - python run_infer_trtllm.py --input-tokens data.pkl --server-url localhost:8000,localhost:8001,localhost:8002 -""" - -import asyncio -import argparse -import time -import logging -from typing import List, Dict, Any, Tuple -import pandas as pd -from tqdm import tqdm -from transformers import AutoTokenizer -import httpx -from openai import AsyncOpenAI - -# Set up logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -# Suppress verbose HTTP logs from httpx and openai -logging.getLogger("httpx").setLevel(logging.WARNING) -logging.getLogger("openai").setLevel(logging.WARNING) - -# Initialize tokenizer -MODEL_NAME = "openai/gpt-oss-120b" -tokenizer = None - - -def get_tokenizer(): - """Get or initialize the tokenizer.""" - global tokenizer - if tokenizer is None: - logger.info(f"Loading tokenizer for {MODEL_NAME}...") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - logger.info("Tokenizer loaded successfully") - return tokenizer - - -class TRTLLMClient: - """Client for TensorRT-LLM server using OpenAI-compatible endpoint with round-robin support.""" - - def __init__(self, - server_urls: List[str] = None, - temperature: float = 0.001, - top_k: int = 1, - top_p: float = 1.0, - max_concurrency: int = 256, - timeout: int = 1200): - # Support multiple server URLs for round-robin load balancing - if server_urls is None: - server_urls = ["localhost:8000"] - self.server_urls = server_urls - self.num_servers = len(server_urls) - self.current_server_index = 0 - - self.temperature = temperature - self.top_k = top_k - self.top_p = top_p - self.max_concurrency = max_concurrency - self.timeout = timeout - self.model_name = MODEL_NAME - - # Initialize async OpenAI clients (one per server) - self.http_clients = [] - self.async_clients = [] - self.concurrency_semaphore = None - - logger.info( - f"Initialized client with {self.num_servers} server(s): {', '.join(self.server_urls)}") - - async def initialize(self): - """Initialize OpenAI clients for all servers.""" - # Create semaphore for concurrency control - self.concurrency_semaphore = asyncio.Semaphore(self.max_concurrency) - - # Create HTTP and OpenAI clients for each server - for server_url in self.server_urls: - # Setup HTTP client with proper connection limits for high - # concurrency - http_client = httpx.AsyncClient( - timeout=httpx.Timeout(self.timeout), - limits=httpx.Limits( - max_keepalive_connections=self.max_concurrency * 2, - max_connections=self.max_concurrency * 2, - ), - http2=True - ) - - # Setup OpenAI client with the configured HTTP client - async_client = AsyncOpenAI( - api_key='dummy', # TensorRT-LLM server doesn't require real API key - 
base_url=f"http://{server_url}/v1/", - timeout=self.timeout, - max_retries=10, - http_client=http_client, - ) - - self.http_clients.append(http_client) - self.async_clients.append(async_client) - - logger.info(f"Initialized {len(self.async_clients)} OpenAI client(s)") - - def _get_next_client(self) -> AsyncOpenAI: - """Get the next client using round-robin selection.""" - client = self.async_clients[self.current_server_index] - self.current_server_index = ( - self.current_server_index + 1) % self.num_servers - return client - - async def send_request( - self, prompt: str, max_tokens: int = 100, - sample_id: int = 0, pass_num: int = 0) -> Tuple[int, int, Dict[str, Any], float]: - """Send a single request to the TensorRT-LLM server using round-robin. - - Args: - prompt: Text prompt to send - max_tokens: Maximum tokens to generate - sample_id: Sample identifier - pass_num: Pass number for pass@k strategy - - Returns: - Tuple of (sample_id, pass_num, response, latency) - """ - # Prepare generation parameters using OpenAI completions format (as per - # TensorRT-LLM docs) - extra_body = { - # TensorRT-LLM specific parameters - "min_tokens": 1, - } - - # Only include top_k if it's not 0 (so it can default to None on server - # side) - if self.top_k != 0: - extra_body["top_k"] = self.top_k - - gen_params = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": self.temperature, - "top_p": self.top_p, - "stream": False, - "extra_body": extra_body, - } - - try: - # Track latency: time from request sent to response received - start_time = time.time() - - # Select client using round-robin - client = self._get_next_client() - - # Use semaphore for concurrency control - async with self.concurrency_semaphore: - completion = await client.completions.create(**gen_params) - - end_time = time.time() - latency = end_time - start_time - - # Extract response text from completions format - response_text = completion.choices[0].text - - # Tokenize the response to get output_ids (similar to SGLang - # format) - tokenizer = get_tokenizer() - output_ids = tokenizer.encode( - response_text, add_special_tokens=False) - - # Format response similar to SGLang format for compatibility - response = { - "output_ids": output_ids, - "text": response_text, - "meta_info": { - "completion_tokens": len(output_ids), - } - } - - return sample_id, pass_num, response, latency - - except Exception as e: - logger.error(f"Request {sample_id} (pass {pass_num}) failed: {e}") - return sample_id, pass_num, {"error": str(e)}, None - - async def shutdown(self): - """Clean up resources for all clients.""" - for http_client in self.http_clients: - if http_client: - await http_client.aclose() - - -def load_tokenized_data(data_file: str) -> pd.DataFrame: - """Load data from pickle file produced by harmony-tokens.py.""" - logger.info(f"Loading data from {data_file}") - - # Load DataFrame from pickle - df = pd.read_pickle(data_file) - logger.info(f"Loaded DataFrame with shape: {df.shape}") - - # Check if text_input column exists and has valid data - if 'text_input' in df.columns: - # Check for any None values in text_input - failed_mask = df['text_input'].isna() - failed_count = failed_mask.sum() - - if failed_count > 0: - failed_indices = df[failed_mask].index.unique() - error_msg = f"Found {failed_count} samples with missing text_input at indices: {failed_indices.tolist()}" - logger.error(error_msg) - raise AssertionError(error_msg) - - # Check first sample - first_text = df.iloc[0]['text_input'] - if 
isinstance(first_text, str): - logger.info( - f"First sample text length: {len(first_text)} characters") - else: - logger.warning( - "text_input column exists but first sample is not a string") - - logger.info(f"All {len(df)} samples have valid text_input") - else: - logger.error("No 'text_input' column found in DataFrame") - raise ValueError("DataFrame must contain 'text_input' column") - - return df - - -async def send_requests_async( - tokenized_df: pd.DataFrame, server_urls: List[str], - max_tokens: int = 100, max_concurrency: int = 256, - temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, - timeout: int = 1200, pass_k: int = 1): - """Send all requests to TensorRT-LLM server(s) asynchronously with round-robin load balancing. - - Args: - server_urls: List of server URLs for round-robin load balancing - pass_k: Number of inference passes per sample for pass@k strategy - - Returns: - tuple: (responses_by_pass, latencies_by_pass) - Dict mapping (sample_id, pass_num) to response/latency - """ - num_samples = len(tokenized_df) - total_requests = num_samples * pass_k - logger.info( - f"Sending {total_requests} requests ({num_samples} samples × {pass_k} passes) with {max_concurrency} concurrent workers...") - - # Initialize client with multiple servers for round-robin - client = TRTLLMClient( - server_urls=server_urls, - temperature=temperature, - top_k=top_k, - top_p=top_p, - max_concurrency=max_concurrency, - timeout=timeout - ) - await client.initialize() - - # Prepare all tasks - create pass_k requests per sample - tasks = [] - for idx, row in tokenized_df.iterrows(): - for pass_num in range(pass_k): - task = client.send_request( - row['text_input'], - max_tokens=max_tokens, - sample_id=idx, - pass_num=pass_num - ) - tasks.append(task) - - start_time = time.time() - - # Execute all tasks concurrently with progress bar - results = [] - for coro in tqdm( - asyncio.as_completed(tasks), - total=len(tasks), - desc="Sending requests", - unit="request"): - result = await coro - results.append(result) - - # Group results by sample_id and pass_num - responses_by_pass = {} - latencies_by_pass = {} - for sample_id, pass_num, response, latency in results: - responses_by_pass[(sample_id, pass_num)] = response - latencies_by_pass[(sample_id, pass_num)] = latency - - total_time = time.time() - start_time - logger.info( - f"Completed {total_requests} requests in {total_time:.2f} seconds") - logger.info(f"Average rate: {total_requests/total_time:.2f} requests/sec") - - # Log latency statistics - valid_latencies = [ - lat for lat in latencies_by_pass.values() if lat is not None] - if valid_latencies: - avg_latency = sum(valid_latencies) / len(valid_latencies) - min_latency = min(valid_latencies) - max_latency = max(valid_latencies) - logger.info( - f"Latency stats - Avg: {avg_latency:.3f}s, Min: {min_latency:.3f}s, Max: {max_latency:.3f}s") - - # Shutdown client - await client.shutdown() - - return responses_by_pass, latencies_by_pass - - -def extract_response_ids( - responses_by_pass: Dict[tuple, Dict[str, Any]], tokenized_df: pd.DataFrame, pass_k: int) -> Dict[tuple, List[int]]: - """Extract response output_ids from TensorRT-LLM responses for all passes. 
- - Args: - responses_by_pass: Dict mapping (sample_id, pass_num) to response - tokenized_df: DataFrame with samples - pass_k: Number of passes per sample - - Returns: - Dict mapping (sample_id, pass_num) to output_ids list - """ - logger.info("Extracting response output_ids...") - - response_ids_by_pass = {} - total_responses = len(tokenized_df) * pass_k - - with tqdm(total=total_responses, desc="Extracting responses", unit="response") as pbar: - for idx, row in tokenized_df.iterrows(): - for pass_num in range(pass_k): - response = responses_by_pass.get((idx, pass_num), {}) - response_id = [] - if "error" not in response and "output_ids" in response: - try: - # TensorRT-LLM returns the generated token IDs in the - # 'output_ids' field - response_id = response["output_ids"] - except Exception as e: - logger.warning( - f"Failed to extract response for sample {idx}, pass {pass_num}: {e}") - response_ids_by_pass[(idx, pass_num)] = response_id - pbar.update(1) - - logger.info("Response output_ids extraction complete") - return response_ids_by_pass - - -def detokenize_output_ids( - response_ids_by_pass: Dict[tuple, List[int]], pass_k: int) -> Dict[tuple, str]: - """Detokenize output_ids back to text using AutoTokenizer for all passes. - - Args: - response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids - pass_k: Number of passes per sample - - Returns: - Dict mapping (sample_id, pass_num) to detokenized text - """ - logger.info("Detokenizing output_ids to text...") - - tokenizer = get_tokenizer() - detokenized_texts_by_pass = {} - - for (sample_id, pass_num), token_ids in tqdm( - response_ids_by_pass.items(), desc="Detokenizing outputs", unit="output"): - try: - # Detokenize the token IDs back to text - text = tokenizer.decode(token_ids, skip_special_tokens=True) - detokenized_texts_by_pass[(sample_id, pass_num)] = text - except Exception as e: - logger.warning( - f"Failed to detokenize output for sample {sample_id}, pass {pass_num}: {e}") - detokenized_texts_by_pass[(sample_id, pass_num)] = "" - - logger.info("Output detokenization complete") - return detokenized_texts_by_pass - - -def save_responses(responses_by_pass: Dict[tuple, Dict[str, Any]], - response_ids_by_pass: Dict[tuple, List[int]], - detokenized_texts_by_pass: Dict[tuple, str], - latencies_by_pass: Dict[tuple, float], - tokenized_df: pd.DataFrame, pass_k: int, output_file: str = None) -> pd.DataFrame: - """Save all responses to DataFrame and optionally to pickle file. - - Args: - responses_by_pass: Dict mapping (sample_id, pass_num) to response - response_ids_by_pass: Dict mapping (sample_id, pass_num) to output_ids - detokenized_texts_by_pass: Dict mapping (sample_id, pass_num) to text - latencies_by_pass: Dict mapping (sample_id, pass_num) to latency - tokenized_df: Original DataFrame with samples - pass_k: Number of passes per sample - output_file: Optional output pickle file - - Returns: - DataFrame with columns for each pass (e.g., model_output_0, model_output_1, ...) - """ - logger.info("Processing responses and updating DataFrame...") - - # Work with the original DataFrame - result_df = tokenized_df.copy() - - # Create columns for each pass with _0, _1, _2, ... 
suffixes - for pass_num in range(pass_k): - # Lists to store data for this pass - model_outputs = [] - tok_model_outputs = [] - tok_model_output_lens = [] - infer_times = [] - - for idx in tokenized_df.index: - key = (idx, pass_num) - detokenized_text = detokenized_texts_by_pass.get(key, "") - response_ids = response_ids_by_pass.get(key, []) - latency = latencies_by_pass.get(key, None) - - model_outputs.append(detokenized_text) - tok_model_outputs.append(response_ids) - tok_model_output_lens.append(len(response_ids)) - infer_times.append(latency) - - # Add columns with suffixes - result_df[f'model_output_{pass_num}'] = model_outputs - result_df[f'tok_model_output_{pass_num}'] = tok_model_outputs - result_df[f'tok_model_output_len_{pass_num}'] = tok_model_output_lens - result_df[f'infer_time_{pass_num}'] = infer_times - - # Calculate output token lengths for logging - all_output_token_lengths = [] - for idx in tokenized_df.index: - for pass_num in range(pass_k): - key = (idx, pass_num) - response = responses_by_pass.get(key, {}) - response_ids = response_ids_by_pass.get(key, []) - try: - output_token_length = response.get( - "meta_info", {}).get( - "completion_tokens", len(response_ids)) - all_output_token_lengths.append(output_token_length) - except Exception as e: - logger.warning( - f"Failed to calculate output tokens for sample {idx}, pass {pass_num}: {e}") - all_output_token_lengths.append(len(response_ids)) - - logger.info(f"Updated DataFrame with shape: {result_df.shape}") - new_columns = [ - f'model_output_{i}, tok_model_output_{i}, tok_model_output_len_{i}, infer_time_{i}' for i in range(pass_k)] - logger.info(f"Added columns for {pass_k} passes: {', '.join(new_columns)}") - if all_output_token_lengths: - logger.info( - f"Average output token length: {sum(all_output_token_lengths)/len(all_output_token_lengths):.1f}") - - # Save to pickle file if output_file is provided - if output_file: - logger.info(f"Saving responses to {output_file}...") - result_df.to_pickle(output_file) - logger.info(f"Responses saved to {output_file}") - - return result_df - - -async def process_requests_async(tokenized_df: pd.DataFrame, server_urls: List[str], - max_samples: int = None, max_tokens: int = 100, - max_concurrency: int = 256, output_file: str = None, - temperature: float = 0.001, top_k: int = 1, top_p: float = 1.0, - timeout: int = 1200, pass_k: int = 1) -> pd.DataFrame: - """Main processing function that handles requests and response extraction. 
- - Args: - server_urls: List of server URLs for round-robin load balancing - pass_k: Number of inference passes per sample for pass@k strategy - """ - - # Step 1: Limit samples if specified - if max_samples is not None: - tokenized_df = tokenized_df.head(max_samples) - logger.info(f"Limited to first {max_samples} samples") - - # Step 2: Send all requests asynchronously (k passes per sample) - responses_by_pass, latencies_by_pass = await send_requests_async( - tokenized_df, - server_urls, - max_tokens, - max_concurrency, - temperature, - top_k, - top_p, - timeout, - pass_k) - - # Step 3: Extract response output_ids for all passes - response_ids_by_pass = extract_response_ids( - responses_by_pass, tokenized_df, pass_k) - - # Step 4: Detokenize output_ids to text for model_output for all passes - detokenized_texts_by_pass = detokenize_output_ids( - response_ids_by_pass, pass_k) - - # Step 5: Save all results and return DataFrame - result_df = save_responses( - responses_by_pass, - response_ids_by_pass, - detokenized_texts_by_pass, - latencies_by_pass, - tokenized_df, - pass_k, - output_file) - - return result_df - - -def main(): - parser = argparse.ArgumentParser( - description="Send text prompts to TensorRT-LLM server via OpenAI completions endpoint") - parser.add_argument("--input-tokens", required=True, - help="Path to pickle file containing data with text_input column from harmony-tokens.py") - parser.add_argument("--server-url", default="localhost:8000", - help="TensorRT-LLM server URL(s) - comma-separated for round-robin load balancing (default: localhost:8000)") - parser.add_argument("--max-samples", type=int, default=None, - help="Maximum number of samples to process (default: all)") - parser.add_argument("--max-tokens", type=int, default=100, - help="Maximum tokens to generate per request") - parser.add_argument("--max-concurrency", type=int, default=256, - help="Maximum number of concurrent requests (default: 256)") - parser.add_argument("--output", default=None, - help="Output pickle file for responses (optional)") - parser.add_argument("--pass-k", type=int, default=1, - help="Number of inference passes per sample for pass@k strategy (default: 1)") - parser.add_argument("--temperature", type=float, default=0.001, - help="Temperature for sampling (default: 0.001)") - parser.add_argument("--top-k", type=int, default=1, - help="Top-k for sampling (default: 1)") - parser.add_argument("--top-p", type=float, default=1.0, - help="Top-p for sampling (default: 1.0)") - parser.add_argument("--timeout", type=int, default=1200, - help="Timeout for requests (default: 1200)") - - args = parser.parse_args() - - # Parse comma-separated server URLs - server_urls = [url.strip() for url in args.server_url.split(',')] - logger.info( - f"Configured {len(server_urls)} server(s) for round-robin load balancing") - - # Test connection - async def test_connection(): - logger.info(f"Testing server connection(s)...") - client = TRTLLMClient( - server_urls=server_urls, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - max_concurrency=1, - timeout=args.timeout - ) - await client.initialize() - - try: - _, _, test_response, _ = await client.send_request( - prompt="Test", max_tokens=5, sample_id=0, pass_num=0) - if "error" in test_response: - logger.error( - f"Server connection failed: {test_response['error']}") - logger.error( - "Make sure your TensorRT-LLM server(s) are running with OpenAI endpoint enabled.") - return False - logger.info("Server connection successful") - return True - 
finally: - await client.shutdown() - - # Run connection test - if not asyncio.run(test_connection()): - return - - # Load pre-tokenized data - tokenized_df = load_tokenized_data(args.input_tokens) - - # Process requests and get result DataFrame - result_df = asyncio.run(process_requests_async( - tokenized_df, server_urls, - max_samples=args.max_samples, - max_tokens=args.max_tokens, - max_concurrency=args.max_concurrency, - output_file=args.output, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - timeout=args.timeout, - pass_k=args.pass_k)) - - # Print summary - logger.info(f"\nProcessing completed:") - logger.info(f" - Total samples processed: {len(result_df)}") - logger.info(f" - Number of passes per sample: {args.pass_k}") - logger.info( - f" - Average input token length: {result_df['tok_input_len'].mean():.1f}") - - # Calculate average output length across all passes - if args.pass_k == 1: - avg_output_len = result_df['tok_model_output_len_0'].mean() - logger.info(f" - Average output token length: {avg_output_len:.1f}") - else: - all_output_lens = [] - for i in range(args.pass_k): - all_output_lens.extend( - result_df[f'tok_model_output_len_{i}'].tolist()) - avg_output_len = sum(all_output_lens) / \ - len(all_output_lens) if all_output_lens else 0 - logger.info( - f" - Average output token length (across all passes): {avg_output_len:.1f}") - - if args.output: - logger.info(f" - Results saved to: {args.output}") - else: - logger.info(" - Results returned as DataFrame (not saved to file)") - - -if __name__ == "__main__": - main() From dbb0fd9c99d77f2880aae6c9f1cf734e32483fa4 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 3 Dec 2025 11:23:30 +0000 Subject: [PATCH 136/165] fix prompt of summarization --- language/gpt-oss/preprocess/harmonize_inputs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/language/gpt-oss/preprocess/harmonize_inputs.py b/language/gpt-oss/preprocess/harmonize_inputs.py index 07830b34c7..8ec1223f93 100644 --- a/language/gpt-oss/preprocess/harmonize_inputs.py +++ b/language/gpt-oss/preprocess/harmonize_inputs.py @@ -316,9 +316,9 @@ def create_arxiv_summarization_prompt( """ instructions = ( "You are an expert at reading and summarizing academic research papers. " - "Your task is to provide clear, concise, and accurate summaries of research papers. " - "Focus on the key contributions, methodology, results, and implications. " - "Structure your summary to be accessible while maintaining technical accuracy." + "Your task is to provide clear and comprehensive summaries. " + "Cover the research purpose, methods, main findings, and significance. " + "Be thorough but avoid unnecessary details. Ignore citations and references." 
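        # The adjacent string literals above are concatenated by Python into a
        # single instructions string; the trailing space inside each of the first
        # three literals keeps the sentences separated.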
) messages = _create_base_messages(reasoning_effort, instructions) From 57c6dae1d69a8d17f36bfd98ff1a1d54746240ff Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 3 Dec 2025 03:36:24 -0800 Subject: [PATCH 137/165] move stuff to sglang --- language/gpt-oss/{ => sglang}/eval_accuracy.py | 0 language/gpt-oss/{archive => sglang}/run_infer.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename language/gpt-oss/{ => sglang}/eval_accuracy.py (100%) rename language/gpt-oss/{archive => sglang}/run_infer.py (100%) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/sglang/eval_accuracy.py similarity index 100% rename from language/gpt-oss/eval_accuracy.py rename to language/gpt-oss/sglang/eval_accuracy.py diff --git a/language/gpt-oss/archive/run_infer.py b/language/gpt-oss/sglang/run_infer.py similarity index 100% rename from language/gpt-oss/archive/run_infer.py rename to language/gpt-oss/sglang/run_infer.py From da35468afc4f77125a69297b411d78d3bf000375 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 3 Dec 2025 12:21:44 +0000 Subject: [PATCH 138/165] allow use of parquet --- language/gpt-oss/README.md | 56 +++++++++++++--- language/gpt-oss/eval_mlperf_accuracy.py | 73 +++++++++++++++++++-- language/gpt-oss/eval_mlperf_performance.py | 21 +++++- language/gpt-oss/run_mlperf.py | 2 +- language/gpt-oss/utils/tokenization.py | 31 +++++++-- 5 files changed, 158 insertions(+), 25 deletions(-) diff --git a/language/gpt-oss/README.md b/language/gpt-oss/README.md index 593721aebd..49e438e3fa 100644 --- a/language/gpt-oss/README.md +++ b/language/gpt-oss/README.md @@ -6,13 +6,8 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) * Dataset: Please request access at [this link](https://drive.google.com/drive/folders/1DCfEXHqe69okrqKbSyV-8VUw413JqpPY?usp=drive_link) - **this is a tentative dataset** -Verify the dataset contents by computing the `sha1sum`: -```bash -$ sha1sum gptoss-*-eval.pkl -35228fcf5581b916e70920748baf2c016ea2c06b gptoss-acc-eval.pkl -ddec911ad479fc4b30ef1c050c9dea63134c090e gptoss-perf-eval.pkl +Datasets are now provided in **Parquet format** (recommended) for better performance and smaller file size (50% smaller than pickle). Pickle format is still supported for backward compatibility. -``` ## Environment setup Work on reference implementation is done using the sglang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. @@ -37,6 +32,32 @@ The script uses `python3 -m sglang.launch_server` tp instantiate the model, with Then, run a benchmark script that uses the client to send/recv requests. ### Run the inference + +**Note:** All scripts now support both Parquet (`.parquet`) and Pickle (`.pkl`) formats for dataset files. 
Parquet is recommended as it offers: +- 50% smaller file size +- Faster loading times +- Cross-language compatibility +- Type-safe schema preservation + +Example usage: +```bash +# first, install loadgen +pip install $(git rev-parse --show-toplevel)/loadgen + +# Using Parquet format (recommended) +python3 run_mlperf.py \ + --scenario offline \ + --input-file /path/to/dataset.parquet \ + --accuracy + +# Using Pickle format (backward compatible) +python3 run_mlperf.py \ + --scenario offline \ + --input-file /path/to/dataset.pkl \ + --accuracy +``` + +Full command-line options: ```bash python3 run_mlperf.py --help usage: run_mlperf.py [-h] [--scenario {offline,server}] --input-file INPUT_FILE [--max-samples MAX_SAMPLES] [--mlperf-conf MLPERF_CONF] @@ -51,7 +72,7 @@ options: --scenario {offline,server} MLPerf scenario mode --input-file INPUT_FILE - Path to tokenized dataset (pickle file) + Path to tokenized dataset (parquet or pickle file) --max-samples MAX_SAMPLES Maximum number of samples to use (None for all) --mlperf-conf MLPERF_CONF @@ -76,7 +97,24 @@ options: ``` ### Evaluate the accuracy -Run `run_mlperf.py` with `--accuracy`, and then use the generated `mlperf_log_accuracy.json` to evaluate the accuracy of the run. Usage is as below. +Run `run_mlperf.py` with `--accuracy`, and then use the generated `mlperf_log_accuracy.json` to evaluate the accuracy of the run. + +Example usage: +```bash +# Using Parquet format (recommended) +python3 eval_mlperf_accuracy.py \ + --mlperf-log mlperf_results/offline/accuracy/mlperf_log_accuracy.json \ + --reference-data /path/to/acc_eval_inputs.parquet \ + --tokenizer openai/gpt-oss-120b + +# Using Pickle format (backward compatible) +python3 eval_mlperf_accuracy.py \ + --mlperf-log mlperf_results/offline/accuracy/mlperf_log_accuracy.json \ + --reference-data /path/to/acc_eval_inputs.pkl \ + --tokenizer openai/gpt-oss-120b +``` + +Full command-line options: ```bash python3 eval_mlperf_accuracy.py --help usage: eval_mlperf_accuracy.py [-h] --mlperf-log MLPERF_LOG --reference-data REFERENCE_DATA [--tokenizer TOKENIZER] [--output-file OUTPUT_FILE] @@ -89,7 +127,7 @@ options: --mlperf-log MLPERF_LOG Path to mlperf_log_accuracy.json --reference-data REFERENCE_DATA - Path to reference pickle file (DataFrame with dataset, ground_truth, etc.) + Path to reference parquet or pickle file (DataFrame with dataset, ground_truth, etc.) 
--tokenizer TOKENIZER HuggingFace tokenizer name or path --output-file OUTPUT_FILE diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index 34bbbc02fd..f28e68f483 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -28,6 +28,7 @@ from collections import defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError +import numpy as np import pandas as pd from transformers import AutoTokenizer from tqdm import tqdm @@ -313,7 +314,7 @@ def main(): "--reference-data", type=str, required=True, - help="Path to reference pickle file (DataFrame with dataset, ground_truth, etc.)" + help="Path to reference parquet or pickle file (DataFrame with dataset, ground_truth, etc.)" ) parser.add_argument( "--tokenizer", @@ -355,8 +356,30 @@ def main(): # Load reference data logger.info(f"Loading reference data from {args.reference_data}") - with open(args.reference_data, 'rb') as f: - reference_df = pickle.load(f) + if args.reference_data.endswith('.parquet'): + reference_df = pd.read_parquet(args.reference_data) + logger.info("Loaded reference data from Parquet file") + elif args.reference_data.endswith('.pkl') or args.reference_data.endswith('.pickle'): + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + logger.info("Loaded reference data from Pickle file") + else: + # Try parquet first, then pickle + try: + reference_df = pd.read_parquet(args.reference_data) + logger.info("Auto-detected Parquet format") + except Exception: + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + logger.info("Auto-detected Pickle format") + + # Convert numpy arrays to native Python types for JSON serialization + for col in reference_df.columns: + # Check if column contains numpy arrays + if reference_df[col].dtype == object: + reference_df[col] = reference_df[col].apply( + lambda x: x.tolist() if isinstance(x, np.ndarray) else x + ) logger.info(f"Reference data shape: {reference_df.shape}") logger.info(f"Reference columns: {list(reference_df.columns)}") @@ -457,7 +480,9 @@ def main(): dataset_stats = defaultdict(lambda: { "per_repeat": {i: {"correct": 0, "total": 0} for i in range(num_repeats)}, # pass@k: at least one correct - "aggregated": {"correct": 0, "total": 0} + "aggregated": {"correct": 0, "total": 0}, + # pass@1 with k repeats: average correctness across repeats + "averaged": {"correct_sum": 0, "total": 0} }) # Track results per (qsl_idx, repeat_idx) for aggregation qsl_results = defaultdict(lambda: {i: None for i in range(num_repeats)}) @@ -580,11 +605,16 @@ def main(): result for result in repeat_results.values() if result is not None and result ) - # Update aggregated stats + # Update aggregated stats (pass@k) dataset_stats[dataset_name]["aggregated"]["total"] += 1 if is_pass_k_correct: dataset_stats[dataset_name]["aggregated"]["correct"] += 1 + # Update averaged stats (pass@1 with k repeats) + correct_count = sum(1 for result in repeat_results.values() if result is not None and result) + dataset_stats[dataset_name]["averaged"]["correct_sum"] += correct_count + dataset_stats[dataset_name]["averaged"]["total"] += 1 + # Calculate overall stats (aggregated pass@k if num_repeats > 1, else # per-repeat[0]) if num_repeats > 1: @@ -592,6 +622,15 @@ def main(): for stats in dataset_stats.values()) total_samples = sum(stats["aggregated"]["total"] for stats in dataset_stats.values()) + + # Calculate overall pass@1 with k repeats accuracy + 
total_averaged_correct_sum = sum(stats["averaged"]["correct_sum"] + for stats in dataset_stats.values()) + total_averaged_samples = sum(stats["averaged"]["total"] + for stats in dataset_stats.values()) + overall_averaged_accuracy = ( + total_averaged_correct_sum / (total_averaged_samples * num_repeats) * 100 + ) if total_averaged_samples > 0 else 0.0 else: total_correct = sum(stats["per_repeat"][0]["correct"] for stats in dataset_stats.values()) @@ -610,8 +649,11 @@ def main(): print(f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") print(f"Total unique samples: {total_samples}") - print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})" if num_repeats > 1 - else f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + if num_repeats > 1: + print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print(f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}%") + else: + print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) if num_repeats > 1: @@ -624,6 +666,16 @@ def main(): print( f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + print("\nPer-Dataset pass@1 with k repeats Results:") + print("-" * 80) + for dataset_name in sorted(dataset_stats.keys()): + stats = dataset_stats[dataset_name]["averaged"] + if stats["total"] > 0: + accuracy = (stats["correct_sum"] / (stats["total"] * num_repeats) * 100) + avg_correct = stats["correct_sum"] / stats["total"] + print( + f"{dataset_name:20s}: {accuracy:6.2f}% (avg {avg_correct:.2f}/{num_repeats} correct per sample)") + print("\n" + "=" * 80) print("Per-Dataset, Per-Repeat Breakdown:") print("-" * 80) @@ -678,6 +730,7 @@ def main(): if num_repeats > 1: # Aggregated pass@k stats agg_stats = stats["aggregated"] + avg_stats = stats["averaged"] per_dataset_stats[dataset_name] = { "pass_k": num_repeats, "aggregated": { @@ -685,6 +738,11 @@ def main(): "total": agg_stats["total"], "accuracy": (agg_stats["correct"] / agg_stats["total"] * 100) if agg_stats["total"] > 0 else 0.0 }, + "averaged": { + "correct_sum": avg_stats["correct_sum"], + "total": avg_stats["total"], + "accuracy": (avg_stats["correct_sum"] / (avg_stats["total"] * num_repeats) * 100) if avg_stats["total"] > 0 else 0.0 + }, "per_repeat": {} } # Add per-repeat stats @@ -711,6 +769,7 @@ def main(): "total_samples": total_samples, "total_correct": total_correct, "overall_accuracy": overall_accuracy, + "overall_averaged_accuracy": overall_averaged_accuracy if num_repeats > 1 else None, "per_dataset": per_dataset_stats }, "detailed_results": results if args.verbose else None diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss/eval_mlperf_performance.py index 5ef2b90afe..beffb1a07f 100755 --- a/language/gpt-oss/eval_mlperf_performance.py +++ b/language/gpt-oss/eval_mlperf_performance.py @@ -91,7 +91,7 @@ def parse_args() -> argparse.Namespace: "--reference-data", type=str, default=None, - help="Path to reference pickle file (DataFrame with prompts, dataset, etc.) - optional" + help="Path to reference parquet or pickle file (DataFrame with prompts, dataset, etc.) 
- optional" ) parser.add_argument( @@ -517,8 +517,23 @@ def main(): if args.reference_data: logger.info(f"Loading reference data from {args.reference_data}") try: - with open(args.reference_data, 'rb') as f: - reference_df = pickle.load(f) + if args.reference_data.endswith('.parquet'): + reference_df = pd.read_parquet(args.reference_data) + logger.info("Loaded reference data from Parquet file") + elif args.reference_data.endswith('.pkl') or args.reference_data.endswith('.pickle'): + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + logger.info("Loaded reference data from Pickle file") + else: + # Try parquet first, then pickle + try: + reference_df = pd.read_parquet(args.reference_data) + logger.info("Auto-detected Parquet format") + except Exception: + with open(args.reference_data, 'rb') as f: + reference_df = pickle.load(f) + logger.info("Auto-detected Pickle format") + logger.info(f"✓ Reference data loaded: {reference_df.shape}") logger.info(f" Columns: {list(reference_df.columns)}") except Exception as e: diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss/run_mlperf.py index 9280999e13..2edfaa37f5 100755 --- a/language/gpt-oss/run_mlperf.py +++ b/language/gpt-oss/run_mlperf.py @@ -83,7 +83,7 @@ def create_argument_parser() -> argparse.ArgumentParser: "--input-file", type=str, required=True, - help="Path to tokenized dataset (pickle file)" + help="Path to tokenized dataset (parquet or pickle file)" ) parser.add_argument( diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py index a64d77a5f9..4b786b11df 100644 --- a/language/gpt-oss/utils/tokenization.py +++ b/language/gpt-oss/utils/tokenization.py @@ -3,6 +3,7 @@ import logging from typing import List, Dict, Any, Optional +import numpy as np import pandas as pd from transformers import AutoTokenizer @@ -76,10 +77,10 @@ def load_tokenized_dataset( dataset_path: str, max_samples: Optional[int] = None ) -> Dict[str, Any]: - """Load a tokenized dataset from pickle file. + """Load a tokenized dataset from parquet or pickle file. 
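    The file format is inferred from the extension: .parquet is read with
    pandas.read_parquet, .pkl/.pickle with pandas.read_pickle, and any other
    extension falls back to trying Parquet first and then Pickle. Object columns
    holding numpy arrays are converted to plain Python lists after loading.
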
Args: - dataset_path: Path to the pickle file containing tokenized data + dataset_path: Path to the parquet or pickle file containing tokenized data max_samples: Maximum number of samples to load (None for all) Returns: @@ -90,9 +91,29 @@ def load_tokenized_dataset( """ logger.info(f"Loading tokenized dataset from {dataset_path}") - # Load DataFrame from pickle - df = pd.read_pickle(dataset_path) - logger.info(f"Loaded DataFrame with shape: {df.shape}") + # Load DataFrame based on file extension + if dataset_path.endswith('.parquet'): + df = pd.read_parquet(dataset_path) + logger.info(f"Loaded Parquet DataFrame with shape: {df.shape}") + elif dataset_path.endswith('.pkl') or dataset_path.endswith('.pickle'): + df = pd.read_pickle(dataset_path) + logger.info(f"Loaded Pickle DataFrame with shape: {df.shape}") + else: + # Try to auto-detect based on file content + try: + df = pd.read_parquet(dataset_path) + logger.info(f"Auto-detected Parquet format, loaded DataFrame with shape: {df.shape}") + except Exception: + df = pd.read_pickle(dataset_path) + logger.info(f"Auto-detected Pickle format, loaded DataFrame with shape: {df.shape}") + + # Convert numpy arrays to native Python types for JSON serialization + for col in df.columns: + # Check if column contains numpy arrays + if df[col].dtype == object: + df[col] = df[col].apply( + lambda x: x.tolist() if isinstance(x, np.ndarray) else x + ) # Limit samples if specified if max_samples is not None: From 72cd475a3c272484815ec89c046af2a96a5526f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 3 Dec 2025 12:22:30 +0000 Subject: [PATCH 139/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_mlperf_accuracy.py | 22 +++++++++++++-------- language/gpt-oss/eval_mlperf_performance.py | 2 +- language/gpt-oss/utils/tokenization.py | 6 ++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index f28e68f483..e3654118ad 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -611,7 +611,8 @@ def main(): dataset_stats[dataset_name]["aggregated"]["correct"] += 1 # Update averaged stats (pass@1 with k repeats) - correct_count = sum(1 for result in repeat_results.values() if result is not None and result) + correct_count = sum( + 1 for result in repeat_results.values() if result is not None and result) dataset_stats[dataset_name]["averaged"]["correct_sum"] += correct_count dataset_stats[dataset_name]["averaged"]["total"] += 1 @@ -622,14 +623,15 @@ def main(): for stats in dataset_stats.values()) total_samples = sum(stats["aggregated"]["total"] for stats in dataset_stats.values()) - + # Calculate overall pass@1 with k repeats accuracy total_averaged_correct_sum = sum(stats["averaged"]["correct_sum"] for stats in dataset_stats.values()) total_averaged_samples = sum(stats["averaged"]["total"] - for stats in dataset_stats.values()) + for stats in dataset_stats.values()) overall_averaged_accuracy = ( - total_averaged_correct_sum / (total_averaged_samples * num_repeats) * 100 + total_averaged_correct_sum / + (total_averaged_samples * num_repeats) * 100 ) if total_averaged_samples > 0 else 0.0 else: total_correct = sum(stats["per_repeat"][0]["correct"] @@ -650,10 +652,13 @@ def main(): 1 else "Evaluation mode: single-pass") print(f"Total unique samples: {total_samples}") if num_repeats > 1: - print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") - 
print(f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}%") + print( + f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print( + f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}%") else: - print(f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print( + f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) if num_repeats > 1: @@ -671,7 +676,8 @@ def main(): for dataset_name in sorted(dataset_stats.keys()): stats = dataset_stats[dataset_name]["averaged"] if stats["total"] > 0: - accuracy = (stats["correct_sum"] / (stats["total"] * num_repeats) * 100) + accuracy = (stats["correct_sum"] / + (stats["total"] * num_repeats) * 100) avg_correct = stats["correct_sum"] / stats["total"] print( f"{dataset_name:20s}: {accuracy:6.2f}% (avg {avg_correct:.2f}/{num_repeats} correct per sample)") diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss/eval_mlperf_performance.py index beffb1a07f..aa3c275f37 100755 --- a/language/gpt-oss/eval_mlperf_performance.py +++ b/language/gpt-oss/eval_mlperf_performance.py @@ -533,7 +533,7 @@ def main(): with open(args.reference_data, 'rb') as f: reference_df = pickle.load(f) logger.info("Auto-detected Pickle format") - + logger.info(f"✓ Reference data loaded: {reference_df.shape}") logger.info(f" Columns: {list(reference_df.columns)}") except Exception as e: diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py index 4b786b11df..10cc73b148 100644 --- a/language/gpt-oss/utils/tokenization.py +++ b/language/gpt-oss/utils/tokenization.py @@ -102,10 +102,12 @@ def load_tokenized_dataset( # Try to auto-detect based on file content try: df = pd.read_parquet(dataset_path) - logger.info(f"Auto-detected Parquet format, loaded DataFrame with shape: {df.shape}") + logger.info( + f"Auto-detected Parquet format, loaded DataFrame with shape: {df.shape}") except Exception: df = pd.read_pickle(dataset_path) - logger.info(f"Auto-detected Pickle format, loaded DataFrame with shape: {df.shape}") + logger.info( + f"Auto-detected Pickle format, loaded DataFrame with shape: {df.shape}") # Convert numpy arrays to native Python types for JSON serialization for col in df.columns: From 792324999477f452649c54c93129a12653050f96 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Wed, 3 Dec 2025 15:09:48 +0000 Subject: [PATCH 140/165] fix scores for pass@1 with k repeats --- .../gpt-oss/{sglang => }/eval_accuracy.py | 0 language/gpt-oss/eval_mlperf_accuracy.py | 38 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) rename language/gpt-oss/{sglang => }/eval_accuracy.py (100%) diff --git a/language/gpt-oss/sglang/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py similarity index 100% rename from language/gpt-oss/sglang/eval_accuracy.py rename to language/gpt-oss/eval_accuracy.py diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index e3654118ad..28b228154a 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -652,10 +652,9 @@ def main(): 1 else "Evaluation mode: single-pass") print(f"Total unique samples: {total_samples}") if num_repeats > 1: - print( - f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") - print( - f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}%") + 
print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + total_score = total_averaged_correct_sum / num_repeats + print(f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}% ({total_score:.1f}/{total_averaged_samples})") else: print( f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") @@ -676,11 +675,10 @@ def main(): for dataset_name in sorted(dataset_stats.keys()): stats = dataset_stats[dataset_name]["averaged"] if stats["total"] > 0: - accuracy = (stats["correct_sum"] / - (stats["total"] * num_repeats) * 100) - avg_correct = stats["correct_sum"] / stats["total"] + accuracy = (stats["correct_sum"] / (stats["total"] * num_repeats) * 100) + total_score = stats["correct_sum"] / num_repeats print( - f"{dataset_name:20s}: {accuracy:6.2f}% (avg {avg_correct:.2f}/{num_repeats} correct per sample)") + f"{dataset_name:20s}: {accuracy:6.2f}% ({total_score:7.1f}/{stats['total']:4d})") print("\n" + "=" * 80) print("Per-Dataset, Per-Repeat Breakdown:") @@ -745,7 +743,7 @@ def main(): "accuracy": (agg_stats["correct"] / agg_stats["total"] * 100) if agg_stats["total"] > 0 else 0.0 }, "averaged": { - "correct_sum": avg_stats["correct_sum"], + "score": avg_stats["correct_sum"] / num_repeats, "total": avg_stats["total"], "accuracy": (avg_stats["correct_sum"] / (avg_stats["total"] * num_repeats) * 100) if avg_stats["total"] > 0 else 0.0 }, @@ -769,15 +767,21 @@ def main(): "accuracy": (single_stats["correct"] / single_stats["total"] * 100) if single_stats["total"] > 0 else 0.0 } + summary = { + "num_repeats": num_repeats, + "total_samples": total_samples, + "total_correct": total_correct, + "overall_accuracy": overall_accuracy, + "per_dataset": per_dataset_stats + } + + # Add averaged metrics if num_repeats > 1 + if num_repeats > 1: + summary["overall_averaged_score"] = total_averaged_correct_sum / num_repeats + summary["overall_averaged_accuracy"] = overall_averaged_accuracy + output_data = { - "summary": { - "num_repeats": num_repeats, - "total_samples": total_samples, - "total_correct": total_correct, - "overall_accuracy": overall_accuracy, - "overall_averaged_accuracy": overall_averaged_accuracy if num_repeats > 1 else None, - "per_dataset": per_dataset_stats - }, + "summary": summary, "detailed_results": results if args.verbose else None } From 957c53d1950e4315dafaf64dddf4696d8ce61502 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 8 Dec 2025 23:10:41 -0800 Subject: [PATCH 141/165] add extra-args option --- language/gpt-oss/sglang/run_server.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss/sglang/run_server.sh b/language/gpt-oss/sglang/run_server.sh index 3fd01b6629..7d53a96ebd 100755 --- a/language/gpt-oss/sglang/run_server.sh +++ b/language/gpt-oss/sglang/run_server.sh @@ -6,6 +6,7 @@ dp=1 model_path=openai/gpt-oss-120b eagle_path="" stream_interval=500 +extra_args="" while [[ $# -gt 0 ]]; do case $1 in @@ -26,21 +27,21 @@ while [[ $# -gt 0 ]]; do shift 2 ;; *) - echo "Unknown argument: $1" - exit 1 + extra_args="$extra_args $1" + shift 1 ;; esac done args=" --model-path $model_path \ --host 0.0.0.0 \ - --port 30000 \ --tp-size=1 \ --data-parallel-size=$dp \ --max-running-requests $((dp * 512)) \ --mem-fraction-static 0.85 \ --chunked-prefill-size 16384 \ --ep-size=1 \ + --enable-metrics \ --stream-interval $stream_interval " if [ -n "$eagle_path" ]; then @@ -54,4 +55,4 @@ fi set -x; -python3 -m sglang.launch_server $args +python3 -m 
sglang.launch_server $args $extra_args From 44f662bf54bcc56ffa0d98c61880cad842c645c3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 9 Dec 2025 07:11:45 +0000 Subject: [PATCH 142/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_mlperf_accuracy.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index 28b228154a..f24359aa79 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -652,9 +652,11 @@ def main(): 1 else "Evaluation mode: single-pass") print(f"Total unique samples: {total_samples}") if num_repeats > 1: - print(f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print( + f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") total_score = total_averaged_correct_sum / num_repeats - print(f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}% ({total_score:.1f}/{total_averaged_samples})") + print( + f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}% ({total_score:.1f}/{total_averaged_samples})") else: print( f"Overall accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") @@ -675,7 +677,8 @@ def main(): for dataset_name in sorted(dataset_stats.keys()): stats = dataset_stats[dataset_name]["averaged"] if stats["total"] > 0: - accuracy = (stats["correct_sum"] / (stats["total"] * num_repeats) * 100) + accuracy = (stats["correct_sum"] / + (stats["total"] * num_repeats) * 100) total_score = stats["correct_sum"] / num_repeats print( f"{dataset_name:20s}: {accuracy:6.2f}% ({total_score:7.1f}/{stats['total']:4d})") @@ -774,12 +777,12 @@ def main(): "overall_accuracy": overall_accuracy, "per_dataset": per_dataset_stats } - + # Add averaged metrics if num_repeats > 1 if num_repeats > 1: summary["overall_averaged_score"] = total_averaged_correct_sum / num_repeats summary["overall_averaged_accuracy"] = overall_averaged_accuracy - + output_data = { "summary": summary, "detailed_results": results if args.verbose else None From a45344b16f75fcbe41a5266b23da2aba9d6c8585 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Wed, 10 Dec 2025 11:11:38 +0530 Subject: [PATCH 143/165] Update user.conf --- language/gpt-oss/mlperf/user.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf index dc6baf3d19..4544ea59b7 100644 --- a/language/gpt-oss/mlperf/user.conf +++ b/language/gpt-oss/mlperf/user.conf @@ -1,11 +1,11 @@ gpt-oss-120b.Offline.target_qps = 10.0 gpt-oss-120b.Offline.min_duration = 60000 -# gpt-oss-120b.Offline.performance_sample_count = 8036 +# gpt-oss-120b.Offline.performance_sample_count = 6396 gpt-oss-120b.Offline.performance_sample_count = 1724 gpt-oss-120b.Server.target_qps = 10 gpt-oss-120b.Server.min_duration = 60000 -# gpt-oss-120b.Server.performance_sample_count = 8036 +# gpt-oss-120b.Server.performance_sample_count = 6396 gpt-oss-120b.Server.performance_sample_count = 1724 gpt-oss-120b.Server.target_latency = 0 From 100903b3ade8fbe83bfe0397e2606bff2652f299 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 12 Dec 2025 12:44:42 +0000 Subject: [PATCH 144/165] updates to use v4 --- language/gpt-oss/eval_accuracy.py | 98 ++++++++++-------------- language/gpt-oss/eval_mlperf_accuracy.py | 6 ++ 
language/gpt-oss/mlperf/user.conf | 6 +- language/gpt-oss/sglang/run_server.sh | 1 - language/gpt-oss/utils/tokenization.py | 14 ++-- 5 files changed, 59 insertions(+), 66 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index c8308813a2..0195291ca3 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -516,7 +516,7 @@ def get_evaluator(dataset_name: str) -> Dict[str, Any]: def process_row(row: pd.Series) -> Dict[str, Any]: """Process a single row and return extracted answer and accuracy.""" dataset_name = validate_dataset_name(row['dataset']) - raw_output = validate_text_input(row['model_output']) + raw_output = validate_text_input(row['model_output_0']) ground_truth = row['ground_truth'] evaluator = get_evaluator(dataset_name) @@ -555,15 +555,14 @@ def process_dataframe(df: pd.DataFrame, # Initialize columns for each pass for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' + suffix = f'_{pass_num}' df_output[f'extracted_answer{suffix}'] = None df_output[f'prompt_accuracy{suffix}'] = 0.0 df_output[f'evaluation_details{suffix}'] = None - # Add aggregated columns for pass@k - if pass_k > 1: - df_output['prompt_accuracy'] = 0.0 # Will be max of all passes - df_output['evaluation_details'] = None # Will aggregate details + # Add aggregated columns (max across all passes) + df_output['prompt_accuracy'] = 0.0 + df_output['evaluation_details'] = None # Check if we have LiveCodeBench datasets to evaluate has_livecodebench = any('livecodebench' in str(ds).lower() @@ -610,7 +609,7 @@ def process_dataframe(df: pd.DataFrame, logger.info( f"Parsing {len(group_indices)} rows for dataset '{dataset_name}' across {pass_k} passes") for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' + suffix = f'_{pass_num}' model_output_col = f'model_output{suffix}' extracted_answer_col = f'extracted_answer{suffix}' evaluation_details_col = f'evaluation_details{suffix}' @@ -629,7 +628,7 @@ def process_dataframe(df: pd.DataFrame, all_work_items = [] work_item_metadata = [] # (idx, pass_num) for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' + suffix = f'_{pass_num}' extracted_answer_col = f'extracted_answer{suffix}' for idx in group_indices: row = df_output.loc[idx] @@ -660,7 +659,7 @@ def process_dataframe(df: pd.DataFrame, total=len(future_to_metadata), desc=f"Evaluating LiveCodeBench (all passes)"): idx, pass_num = future_to_metadata[future] - suffix = f'_{pass_num}' if pass_k > 1 else '' + suffix = f'_{pass_num}' prompt_accuracy_col = f'prompt_accuracy{suffix}' evaluation_details_col = f'evaluation_details{suffix}' @@ -701,7 +700,7 @@ def process_dataframe(df: pd.DataFrame, else: # Sequential pass processing for non-LCB datasets for pass_num in range(pass_k): - suffix = f'_{pass_num}' if pass_k > 1 else '' + suffix = f'_{pass_num}' model_output_col = f'model_output{suffix}' extracted_answer_col = f'extracted_answer{suffix}' prompt_accuracy_col = f'prompt_accuracy{suffix}' @@ -749,24 +748,23 @@ def process_dataframe(df: pd.DataFrame, f"{dataset_name} pass {pass_num} results: {correct_count}/{total_evaluated} correct ({accuracy:.1f}% accuracy)") # Aggregate results across all passes (take max) - if pass_k > 1: - logger.info( - f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") - for idx in group_indices: - # Get all accuracy values for this row - accuracies = [] - for pass_num in range(pass_k): - acc = df_output.at[idx, 
f'prompt_accuracy_{pass_num}'] - accuracies.append(acc if not pd.isna(acc) else 0.0) + logger.info( + f"Aggregating results across {pass_k} passes for dataset '{dataset_name}'") + for idx in group_indices: + # Get all accuracy values for this row + accuracies = [] + for pass_num in range(pass_k): + acc = df_output.at[idx, f'prompt_accuracy_{pass_num}'] + accuracies.append(acc if not pd.isna(acc) else 0.0) - # Set aggregated accuracy as max - max_accuracy = max(accuracies) - df_output.at[idx, 'prompt_accuracy'] = max_accuracy + # Set aggregated accuracy as max + max_accuracy = max(accuracies) + df_output.at[idx, 'prompt_accuracy'] = max_accuracy - # Find which pass achieved max accuracy - max_pass = accuracies.index(max_accuracy) - df_output.at[idx, - 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" + # Find which pass achieved max accuracy + max_pass = accuracies.index(max_accuracy) + df_output.at[idx, + 'evaluation_details'] = f"Best pass: {max_pass} (accuracy: {max_accuracy:.1f}%)" return df_output finally: @@ -798,32 +796,19 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, # Detect pass@k pass_k = detect_pass_k(df_evaluated) - # Calculate statistics - if pass_k > 1: - # For pass@k, use the aggregated prompt_accuracy (max across passes) - # Count from first pass - evaluated = df_evaluated['extracted_answer_0'].notna().sum() - correct = (df_evaluated['prompt_accuracy'] > 0).sum() - accuracy = df_evaluated['prompt_accuracy'].mean() - - # Calculate average token length across all passes - all_output_lens = [] - for i in range(pass_k): - all_output_lens.extend( - df_evaluated[f'tok_model_output_len_{i}'].tolist()) - mean_output_len = float( - sum(all_output_lens) / - len(all_output_lens)) if all_output_lens else 0.0 - else: - # Single pass format - suffix = '' if 'extracted_answer' in df_evaluated.columns else '_0' - evaluated = df_evaluated[f'extracted_answer{suffix}'].notna().sum() - correct = (df_evaluated[f'prompt_accuracy{suffix}'] > 0).sum() - accuracy = df_evaluated[f'prompt_accuracy{suffix}'].mean() + # Calculate statistics - always use aggregated prompt_accuracy (max across passes) + evaluated = df_evaluated['extracted_answer_0'].notna().sum() + correct = (df_evaluated['prompt_accuracy'] > 0).sum() + accuracy = df_evaluated['prompt_accuracy'].mean() - # tok_model_output_len is now a required column - tok_len_col = 'tok_model_output_len' if 'tok_model_output_len' in df_evaluated.columns else 'tok_model_output_len_0' - mean_output_len = float(df_evaluated[tok_len_col].mean()) + # Calculate average token length across all passes + all_output_lens = [] + for i in range(pass_k): + all_output_lens.extend( + df_evaluated[f'tok_model_output_len_{i}'].tolist()) + mean_output_len = float( + sum(all_output_lens) / + len(all_output_lens)) if all_output_lens else 0.0 # Use exact_match as the metric key metric_key = 'exact_match' @@ -834,14 +819,13 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, metric_key: float(accuracy), 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), + 'pass_k': pass_k, } - if pass_k > 1: - results['pass_k'] = pass_k - # Also report individual pass accuracies - for i in range(pass_k): - pass_acc = df_evaluated[f'prompt_accuracy_{i}'].mean() - results[f'{metric_key}_pass_{i}'] = float(pass_acc) + # Report individual pass accuracies + for i in range(pass_k): + pass_acc = df_evaluated[f'prompt_accuracy_{i}'].mean() + results[f'{metric_key}_pass_{i}'] = float(pass_acc) print("\nResults\n") 
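    # Illustrative shape of the dict printed below for pass_k == 2 (all numbers
    # are made up): {'exact_match': 78.5, 'tokens_per_sample': 1530.2,
    # 'num-samples': 4395, 'pass_k': 2, 'exact_match_pass_0': 76.1,
    # 'exact_match_pass_1': 75.8}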
print(results) diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index f24359aa79..15490317c1 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -383,6 +383,12 @@ def main(): logger.info(f"Reference data shape: {reference_df.shape}") logger.info(f"Reference columns: {list(reference_df.columns)}") + + # Validate required columns exist + required_columns = ['dataset', 'ground_truth'] + missing_columns = [col for col in required_columns if col not in reference_df.columns] + if missing_columns: + raise ValueError(f"Reference data missing required columns: {missing_columns}") logger.info( f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss/mlperf/user.conf index 4544ea59b7..7b123ea88b 100644 --- a/language/gpt-oss/mlperf/user.conf +++ b/language/gpt-oss/mlperf/user.conf @@ -1,15 +1,15 @@ gpt-oss-120b.Offline.target_qps = 10.0 gpt-oss-120b.Offline.min_duration = 60000 # gpt-oss-120b.Offline.performance_sample_count = 6396 -gpt-oss-120b.Offline.performance_sample_count = 1724 +gpt-oss-120b.Offline.performance_sample_count = 4395 gpt-oss-120b.Server.target_qps = 10 gpt-oss-120b.Server.min_duration = 60000 # gpt-oss-120b.Server.performance_sample_count = 6396 -gpt-oss-120b.Server.performance_sample_count = 1724 +gpt-oss-120b.Server.performance_sample_count = 4395 gpt-oss-120b.Server.target_latency = 0 gpt-oss-120b.Server.ttft_latency = 2000 gpt-oss-120b.Server.tpot_latency = 20 -gpt-oss-120b.*.repeats_per_sample = 5 +# gpt-oss-120b.*.repeats_per_sample = 5 diff --git a/language/gpt-oss/sglang/run_server.sh b/language/gpt-oss/sglang/run_server.sh index 7d53a96ebd..f988ea6b5f 100755 --- a/language/gpt-oss/sglang/run_server.sh +++ b/language/gpt-oss/sglang/run_server.sh @@ -35,7 +35,6 @@ done args=" --model-path $model_path \ --host 0.0.0.0 \ - --tp-size=1 \ --data-parallel-size=$dp \ --max-running-requests $((dp * 512)) \ --mem-fraction-static 0.85 \ diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py index 10cc73b148..03791e3e59 100644 --- a/language/gpt-oss/utils/tokenization.py +++ b/language/gpt-oss/utils/tokenization.py @@ -122,19 +122,23 @@ def load_tokenized_dataset( df = df.head(max_samples) logger.info(f"Limited to {max_samples} samples") - # Extract tokenized prompts - if 'tok_input' not in df.columns: + # Extract tokenized prompts - support both column names + if 'tok_input' in df.columns: # pre-v4.0 + token_col = 'tok_input' + elif 'input_tokens' in df.columns: # v4.0+ + token_col = 'input_tokens' + else: raise ValueError( - "Dataset must have 'tok_input' column with tokenized prompts") + "Dataset must have 'tok_input' or 'input_tokens' column with tokenized prompts") # Verify tokenization - failed_mask = df['tok_input'].isna() + failed_mask = df[token_col].isna() if failed_mask.any(): failed_count = failed_mask.sum() logger.error(f"Found {failed_count} samples with failed tokenization") raise ValueError(f"{failed_count} samples have invalid tokenization") - prompts = df['tok_input'].tolist() + prompts = df[token_col].tolist() logger.info(f"Loaded {len(prompts)} tokenized prompts") # Log statistics From a6041d82cb0ec5cd9013d1822b983052e5610dd0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 12 Dec 2025 12:46:19 +0000 Subject: [PATCH 145/165] [Automated Commit] Format Codebase --- language/gpt-oss/eval_accuracy.py | 3 ++- 
language/gpt-oss/eval_mlperf_accuracy.py | 6 ++++-- language/gpt-oss/utils/tokenization.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss/eval_accuracy.py index 0195291ca3..fb46fdcb9b 100644 --- a/language/gpt-oss/eval_accuracy.py +++ b/language/gpt-oss/eval_accuracy.py @@ -796,7 +796,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, # Detect pass@k pass_k = detect_pass_k(df_evaluated) - # Calculate statistics - always use aggregated prompt_accuracy (max across passes) + # Calculate statistics - always use aggregated prompt_accuracy (max across + # passes) evaluated = df_evaluated['extracted_answer_0'].notna().sum() correct = (df_evaluated['prompt_accuracy'] > 0).sum() accuracy = df_evaluated['prompt_accuracy'].mean() diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss/eval_mlperf_accuracy.py index 15490317c1..f9c1e38f9f 100644 --- a/language/gpt-oss/eval_mlperf_accuracy.py +++ b/language/gpt-oss/eval_mlperf_accuracy.py @@ -386,9 +386,11 @@ def main(): # Validate required columns exist required_columns = ['dataset', 'ground_truth'] - missing_columns = [col for col in required_columns if col not in reference_df.columns] + missing_columns = [ + col for col in required_columns if col not in reference_df.columns] if missing_columns: - raise ValueError(f"Reference data missing required columns: {missing_columns}") + raise ValueError( + f"Reference data missing required columns: {missing_columns}") logger.info( f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss/utils/tokenization.py index 03791e3e59..533f5c6691 100644 --- a/language/gpt-oss/utils/tokenization.py +++ b/language/gpt-oss/utils/tokenization.py @@ -123,9 +123,9 @@ def load_tokenized_dataset( logger.info(f"Limited to {max_samples} samples") # Extract tokenized prompts - support both column names - if 'tok_input' in df.columns: # pre-v4.0 + if 'tok_input' in df.columns: # pre-v4.0 token_col = 'tok_input' - elif 'input_tokens' in df.columns: # v4.0+ + elif 'input_tokens' in df.columns: # v4.0+ token_col = 'input_tokens' else: raise ValueError( From bebb3287dacab2607a2ca4ee6c8c72ea30b3a8aa Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 05:06:41 -0800 Subject: [PATCH 146/165] remove loadgen changes for repeats --- loadgen/issue_query_controller.cc | 5 ++--- loadgen/issue_query_controller.h | 4 +--- loadgen/loadgen.cc | 35 +++++-------------------------- loadgen/logging.cc | 16 +++++++------- loadgen/logging.h | 3 +-- loadgen/mlperf.conf | 10 +-------- loadgen/test_settings.h | 9 -------- loadgen/test_settings_internal.cc | 4 +--- loadgen/test_settings_internal.h | 2 -- 9 files changed, 18 insertions(+), 70 deletions(-) diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 832be62ba8..c1abea9d14 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -30,8 +30,7 @@ namespace loadgen { QueryMetadata::QueryMetadata( const std::vector& query_sample_indices, std::chrono::nanoseconds scheduled_delta, - ResponseDelegate* response_delegate, SequenceGen* sequence_gen, - uint64_t repeat_index) + ResponseDelegate* response_delegate, SequenceGen* sequence_gen) : scheduled_delta(scheduled_delta), response_delegate(response_delegate), sequence_id(sequence_gen->NextQueryId()), @@ -39,7 +38,7 @@ QueryMetadata::QueryMetadata( 
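  // One SampleMetadata entry is built per sample index in this query; each
  // entry draws a fresh sequence id and accuracy-log RNG value from the shared
  // SequenceGen.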
samples_.reserve(query_sample_indices.size()); for (QuerySampleIndex qsi : query_sample_indices) { samples_.push_back({this, sequence_gen->NextSampleId(), qsi, - sequence_gen->NextAccLogRng(), repeat_index}); + sequence_gen->NextAccLogRng()}); } query_to_send.reserve(query_sample_indices.size()); for (auto& s : samples_) { diff --git a/loadgen/issue_query_controller.h b/loadgen/issue_query_controller.h index 2cdb4e0a92..5668c574ed 100644 --- a/loadgen/issue_query_controller.h +++ b/loadgen/issue_query_controller.h @@ -83,7 +83,6 @@ struct SampleMetadata { uint64_t sequence_id; QuerySampleIndex sample_index; double accuracy_log_val; - uint64_t repeat_index; // Index for repeated sampling (0 to k-1) }; /// \brief Maintains data and timing info for a query and all its samples. @@ -91,8 +90,7 @@ class QueryMetadata { public: QueryMetadata(const std::vector& query_sample_indices, std::chrono::nanoseconds scheduled_delta, - ResponseDelegate* response_delegate, SequenceGen* sequence_gen, - uint64_t repeat_index = 0); + ResponseDelegate* response_delegate, SequenceGen* sequence_gen); QueryMetadata(QueryMetadata&& src); void NotifyOneSampleCompleted(PerfClock::time_point timestamp); diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index a828d350c8..42b2140de2 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -121,8 +121,7 @@ struct ResponseDelegateDetailed : public ResponseDelegate { if (sample_data_copy) { log.LogAccuracy(sample->sequence_id, sample->sample_index, - LogBinaryAsHexString{sample_data_copy}, n_tokens, - sample->repeat_index); + LogBinaryAsHexString{sample_data_copy}, n_tokens); delete sample_data_copy; } @@ -264,12 +263,6 @@ std::vector GenerateQueries( auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); }); - // In PerformanceOnly mode, repeats_per_sample must be 1 - if (mode != TestMode::AccuracyOnly) { - assert(settings.repeats_per_sample == 1 && - "repeats_per_sample must be 1 in PerformanceOnly mode"); - } - auto& loaded_samples = loaded_sample_set.set; // Generate 2x more samples than we think we'll need given the expected @@ -294,11 +287,6 @@ std::vector GenerateQueries( // For MultiStream, loaded samples is properly padded. // For Offline, we create a 'remainder' query at the end of this function. min_queries = loaded_samples.size() / samples_per_query; - - // For repeated sampling, multiply min_queries by repeats - if (mode == TestMode::AccuracyOnly) { - min_queries *= settings.repeats_per_sample; - } } std::vector queries; @@ -407,17 +395,9 @@ std::vector GenerateQueries( : sample_distribution(sample_rng)]; } } - - // Handle repeated sampling: create repeats_per_sample queries for the same - // sample(s) In PerformanceOnly mode, this is always 1 (single query per - // sample) - for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { - queries.emplace_back(samples, timestamp, response_delegate, sequence_gen, - k); - prev_timestamp = timestamp; - timestamp += schedule_distribution(schedule_rng); - } - + queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); + prev_timestamp = timestamp; + timestamp += schedule_distribution(schedule_rng); // In equal_issue mode, the min_queries will be bumped up by a multiple of // the dataset size if the test time has not met the threshold. 
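Illustrative note, not part of the patch: with repeats_per_sample stripped out of loadgen here, repeated sampling for pass@k is assumed to be handled entirely on the evaluation side, where eval_accuracy.py already takes the max prompt_accuracy across passes. A minimal sketch of that aggregation, assuming per-pass columns named prompt_accuracy_0 .. prompt_accuracy_{k-1}:

import pandas as pd

def aggregate_pass_k(df: pd.DataFrame, k: int) -> pd.DataFrame:
    # A sample counts as correct for pass@k if any of its k passes scored > 0,
    # so the aggregated score is simply the max over the per-pass columns.
    cols = [f"prompt_accuracy_{i}" for i in range(k)]
    out = df.copy()
    out["prompt_accuracy"] = out[cols].max(axis=1)
    return out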
if (enable_equal_issue && (queries.size() >= min_queries) && @@ -437,12 +417,7 @@ std::vector GenerateQueries( for (auto& s : samples) { s = loaded_samples[sample_distribution(sample_rng)]; } - - // Handle repeated sampling for remainder query as well - for (uint64_t k = 0; k < settings.repeats_per_sample; k++) { - queries.emplace_back(samples, timestamp, response_delegate, - sequence_gen, k); - } + queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); } } diff --git a/loadgen/logging.cc b/loadgen/logging.cc index f7aa1271a8..807c1954a8 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -280,24 +280,22 @@ void AsyncLog::StopTrace() { void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, const LogBinaryAsHexString& response, - int64_t n_tokens, uint64_t repeat_index) { + int64_t n_tokens = 0) { std::unique_lock lock(log_mutex_); if (!accuracy_out_) { return; } *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ "); - if (!use_tokens_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", - repeat_index, "data", response); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", + response); } else if (!needs_first_token_) { - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", - repeat_index, "data", response, "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", + response, "token_count", n_tokens); } else { const size_t i = seq_id - latencies_first_sample_sequence_id_; - LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "repeat_idx", - repeat_index, "data", response, "token_data", token_records_[i], - "token_count", n_tokens); + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", + response, "token_data", token_records_[i], "token_count", n_tokens); } *accuracy_out_ << " }"; diff --git a/loadgen/logging.h b/loadgen/logging.h index 2c770438d6..8f1a398e9d 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -213,8 +213,7 @@ class AsyncLog { void SetCurrentPidTid(uint64_t pid, uint64_t tid); void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, - const LogBinaryAsHexString& response, int64_t n_tokens, - uint64_t repeat_index = 0); + const LogBinaryAsHexString& response, int64_t n_tokens); void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response); template diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index af5b9f81a7..1b825514bd 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -25,7 +25,6 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 -gpt-oss-120b.*.performance_sample_count_override = 8036 whisper.*.performance_sample_count_override = 1633 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -67,7 +66,6 @@ llama3_1-8b.*.sample_concatenate_permutation = 1 llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 -gpt-oss-120b.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 @@ -92,7 +90,6 @@ llama3_1-8b.*.use_token_latencies = 1 llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies 
= 1 -gpt-oss-120b.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 # gptj benchmark infers token latencies @@ -135,10 +132,6 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 -gpt-oss-120b.Server.target_latency = 0 -gpt-oss-120b.Server.ttft_latency = 2000 -gpt-oss-120b.Server.tpot_latency = 20 - *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -162,11 +155,10 @@ llama3_1-8b-edge.Offline.min_query_count = 5000 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 -gpt-oss-120b.Offline.min_query_count = 8036 whisper.Offline.min_query_count = 1633 # These fields should be defined and overridden by user.conf. *.SingleStream.target_latency = 10 *.MultiStream.target_latency = 80 -*.Server.target_qps = 8.0 +*.Server.target_qps = 1.0 *.Offline.target_qps = 1.0 diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 5d49d3a012..584d073bb8 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -271,15 +271,6 @@ struct TestSettings { /// \brief Infer token latencies bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; - - /// \brief Enable repeated sampling in accuracy mode - /// \details When enabled, each sample is issued - /// repeats_per_sample times and multiple responses are collected. - /// This is used for code generation benchmarks like gpt-oss where multiple - /// solutions are generated and evaluated (pass@k metric). - /// Default is 1 (single sample). Set to k (e.g., 5 for pass@5) to enable. - /// Must be 1 for PerformanceOnly mode. - uint64_t repeats_per_sample = 1; /**@}*/ }; diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 425d4db833..3f2cd88473 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -54,8 +54,7 @@ TestSettingsInternal::TestSettingsInternal( server_ttft_latency(requested.server_ttft_latency), server_tpot_latency(requested.server_tpot_latency), infer_token_latencies(requested.infer_token_latencies), - token_latency_scaling_factor(requested.token_latency_scaling_factor), - repeats_per_sample(requested.repeats_per_sample) { + token_latency_scaling_factor(requested.token_latency_scaling_factor) { // Target QPS, target latency, and max_async_queries. 
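Illustrative note, not part of the patch: the mlperf.conf / user.conf keys edited above are read through TestSettings::FromConfig. A hedged sketch of the equivalent calls from a Python harness via the loadgen bindings; the exact FromConfig signature differs between loadgen versions and the paths are placeholders:

import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly
# Load the shipped defaults first, then the benchmark-specific user overrides.
settings.FromConfig("loadgen/mlperf.conf", "gpt-oss-120b", "Offline")
settings.FromConfig("language/gpt-oss-120b/mlperf/user.conf", "gpt-oss-120b", "Offline")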
switch (requested.scenario) { case TestScenario::SingleStream: @@ -780,7 +779,6 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr); lookupkv(model, scenario, "performance_sample_count_override", &performance_sample_count_override, nullptr); - lookupkv(model, scenario, "repeats_per_sample", &repeats_per_sample, nullptr); lookupkv(model, "SingleStream", "target_latency", nullptr, &single_stream_expected_latency_ns, 1000 * 1000); lookupkv(model, "MultiStream", "target_latency", nullptr, diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 3aad54fa8e..ab2773bd18 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -88,8 +88,6 @@ struct TestSettingsInternal { bool infer_token_latencies = false; int64_t token_latency_scaling_factor; - - uint64_t repeats_per_sample; }; /// \brief A namespace of collections of FindPeakPerformance helper functions, From 373d57d10007c062264a5803fe78eb521f3d3e04 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 05:08:10 -0800 Subject: [PATCH 147/165] gpt-oss -> gpt-oss-120b --- language/{gpt-oss => gpt-oss-120b}/.gitignore | 0 language/{gpt-oss => gpt-oss-120b}/README.md | 0 language/{gpt-oss => gpt-oss-120b}/archive/collect_results_csv.py | 0 language/{gpt-oss => gpt-oss-120b}/archive/plot.py | 0 language/{gpt-oss => gpt-oss-120b}/archive/plot_results.py | 0 language/{gpt-oss => gpt-oss-120b}/archive/summarize_eval.py | 0 language/{gpt-oss => gpt-oss-120b}/backends/__init__.py | 0 language/{gpt-oss => gpt-oss-120b}/backends/base_backend.py | 0 language/{gpt-oss => gpt-oss-120b}/backends/sglang_backend.py | 0 language/{gpt-oss => gpt-oss-120b}/eval_accuracy.py | 0 language/{gpt-oss => gpt-oss-120b}/eval_mlperf_accuracy.py | 0 language/{gpt-oss => gpt-oss-120b}/eval_mlperf_performance.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/__init__.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/base_sut.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/offline_sut.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/qsl.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/server_sut.py | 0 language/{gpt-oss => gpt-oss-120b}/mlperf/user.conf | 0 language/{gpt-oss => gpt-oss-120b}/preprocess/harmonize_inputs.py | 0 language/{gpt-oss => gpt-oss-120b}/requirements.txt | 0 language/{gpt-oss => gpt-oss-120b}/run_mlperf.py | 0 language/{gpt-oss => gpt-oss-120b}/setup.sh | 0 language/{gpt-oss => gpt-oss-120b}/setup_enroot.sh | 0 language/{gpt-oss => gpt-oss-120b}/sglang/run_infer.py | 0 language/{gpt-oss => gpt-oss-120b}/sglang/run_server.sh | 0 language/{gpt-oss => gpt-oss-120b}/submodules/LiveCodeBench | 0 language/{gpt-oss => gpt-oss-120b}/submodules/prm800k | 0 language/{gpt-oss => gpt-oss-120b}/utils/__init__.py | 0 language/{gpt-oss => gpt-oss-120b}/utils/tokenization.py | 0 29 files changed, 0 insertions(+), 0 deletions(-) rename language/{gpt-oss => gpt-oss-120b}/.gitignore (100%) rename language/{gpt-oss => gpt-oss-120b}/README.md (100%) rename language/{gpt-oss => gpt-oss-120b}/archive/collect_results_csv.py (100%) rename language/{gpt-oss => gpt-oss-120b}/archive/plot.py (100%) rename language/{gpt-oss => gpt-oss-120b}/archive/plot_results.py (100%) rename language/{gpt-oss => gpt-oss-120b}/archive/summarize_eval.py (100%) rename language/{gpt-oss => gpt-oss-120b}/backends/__init__.py (100%) rename language/{gpt-oss => gpt-oss-120b}/backends/base_backend.py (100%) rename language/{gpt-oss => 
gpt-oss-120b}/backends/sglang_backend.py (100%) rename language/{gpt-oss => gpt-oss-120b}/eval_accuracy.py (100%) rename language/{gpt-oss => gpt-oss-120b}/eval_mlperf_accuracy.py (100%) rename language/{gpt-oss => gpt-oss-120b}/eval_mlperf_performance.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/__init__.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/base_sut.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/offline_sut.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/qsl.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/server_sut.py (100%) rename language/{gpt-oss => gpt-oss-120b}/mlperf/user.conf (100%) rename language/{gpt-oss => gpt-oss-120b}/preprocess/harmonize_inputs.py (100%) rename language/{gpt-oss => gpt-oss-120b}/requirements.txt (100%) rename language/{gpt-oss => gpt-oss-120b}/run_mlperf.py (100%) rename language/{gpt-oss => gpt-oss-120b}/setup.sh (100%) rename language/{gpt-oss => gpt-oss-120b}/setup_enroot.sh (100%) rename language/{gpt-oss => gpt-oss-120b}/sglang/run_infer.py (100%) rename language/{gpt-oss => gpt-oss-120b}/sglang/run_server.sh (100%) rename language/{gpt-oss => gpt-oss-120b}/submodules/LiveCodeBench (100%) rename language/{gpt-oss => gpt-oss-120b}/submodules/prm800k (100%) rename language/{gpt-oss => gpt-oss-120b}/utils/__init__.py (100%) rename language/{gpt-oss => gpt-oss-120b}/utils/tokenization.py (100%) diff --git a/language/gpt-oss/.gitignore b/language/gpt-oss-120b/.gitignore similarity index 100% rename from language/gpt-oss/.gitignore rename to language/gpt-oss-120b/.gitignore diff --git a/language/gpt-oss/README.md b/language/gpt-oss-120b/README.md similarity index 100% rename from language/gpt-oss/README.md rename to language/gpt-oss-120b/README.md diff --git a/language/gpt-oss/archive/collect_results_csv.py b/language/gpt-oss-120b/archive/collect_results_csv.py similarity index 100% rename from language/gpt-oss/archive/collect_results_csv.py rename to language/gpt-oss-120b/archive/collect_results_csv.py diff --git a/language/gpt-oss/archive/plot.py b/language/gpt-oss-120b/archive/plot.py similarity index 100% rename from language/gpt-oss/archive/plot.py rename to language/gpt-oss-120b/archive/plot.py diff --git a/language/gpt-oss/archive/plot_results.py b/language/gpt-oss-120b/archive/plot_results.py similarity index 100% rename from language/gpt-oss/archive/plot_results.py rename to language/gpt-oss-120b/archive/plot_results.py diff --git a/language/gpt-oss/archive/summarize_eval.py b/language/gpt-oss-120b/archive/summarize_eval.py similarity index 100% rename from language/gpt-oss/archive/summarize_eval.py rename to language/gpt-oss-120b/archive/summarize_eval.py diff --git a/language/gpt-oss/backends/__init__.py b/language/gpt-oss-120b/backends/__init__.py similarity index 100% rename from language/gpt-oss/backends/__init__.py rename to language/gpt-oss-120b/backends/__init__.py diff --git a/language/gpt-oss/backends/base_backend.py b/language/gpt-oss-120b/backends/base_backend.py similarity index 100% rename from language/gpt-oss/backends/base_backend.py rename to language/gpt-oss-120b/backends/base_backend.py diff --git a/language/gpt-oss/backends/sglang_backend.py b/language/gpt-oss-120b/backends/sglang_backend.py similarity index 100% rename from language/gpt-oss/backends/sglang_backend.py rename to language/gpt-oss-120b/backends/sglang_backend.py diff --git a/language/gpt-oss/eval_accuracy.py b/language/gpt-oss-120b/eval_accuracy.py similarity index 100% rename from 
language/gpt-oss/eval_accuracy.py rename to language/gpt-oss-120b/eval_accuracy.py diff --git a/language/gpt-oss/eval_mlperf_accuracy.py b/language/gpt-oss-120b/eval_mlperf_accuracy.py similarity index 100% rename from language/gpt-oss/eval_mlperf_accuracy.py rename to language/gpt-oss-120b/eval_mlperf_accuracy.py diff --git a/language/gpt-oss/eval_mlperf_performance.py b/language/gpt-oss-120b/eval_mlperf_performance.py similarity index 100% rename from language/gpt-oss/eval_mlperf_performance.py rename to language/gpt-oss-120b/eval_mlperf_performance.py diff --git a/language/gpt-oss/mlperf/__init__.py b/language/gpt-oss-120b/mlperf/__init__.py similarity index 100% rename from language/gpt-oss/mlperf/__init__.py rename to language/gpt-oss-120b/mlperf/__init__.py diff --git a/language/gpt-oss/mlperf/base_sut.py b/language/gpt-oss-120b/mlperf/base_sut.py similarity index 100% rename from language/gpt-oss/mlperf/base_sut.py rename to language/gpt-oss-120b/mlperf/base_sut.py diff --git a/language/gpt-oss/mlperf/offline_sut.py b/language/gpt-oss-120b/mlperf/offline_sut.py similarity index 100% rename from language/gpt-oss/mlperf/offline_sut.py rename to language/gpt-oss-120b/mlperf/offline_sut.py diff --git a/language/gpt-oss/mlperf/qsl.py b/language/gpt-oss-120b/mlperf/qsl.py similarity index 100% rename from language/gpt-oss/mlperf/qsl.py rename to language/gpt-oss-120b/mlperf/qsl.py diff --git a/language/gpt-oss/mlperf/server_sut.py b/language/gpt-oss-120b/mlperf/server_sut.py similarity index 100% rename from language/gpt-oss/mlperf/server_sut.py rename to language/gpt-oss-120b/mlperf/server_sut.py diff --git a/language/gpt-oss/mlperf/user.conf b/language/gpt-oss-120b/mlperf/user.conf similarity index 100% rename from language/gpt-oss/mlperf/user.conf rename to language/gpt-oss-120b/mlperf/user.conf diff --git a/language/gpt-oss/preprocess/harmonize_inputs.py b/language/gpt-oss-120b/preprocess/harmonize_inputs.py similarity index 100% rename from language/gpt-oss/preprocess/harmonize_inputs.py rename to language/gpt-oss-120b/preprocess/harmonize_inputs.py diff --git a/language/gpt-oss/requirements.txt b/language/gpt-oss-120b/requirements.txt similarity index 100% rename from language/gpt-oss/requirements.txt rename to language/gpt-oss-120b/requirements.txt diff --git a/language/gpt-oss/run_mlperf.py b/language/gpt-oss-120b/run_mlperf.py similarity index 100% rename from language/gpt-oss/run_mlperf.py rename to language/gpt-oss-120b/run_mlperf.py diff --git a/language/gpt-oss/setup.sh b/language/gpt-oss-120b/setup.sh similarity index 100% rename from language/gpt-oss/setup.sh rename to language/gpt-oss-120b/setup.sh diff --git a/language/gpt-oss/setup_enroot.sh b/language/gpt-oss-120b/setup_enroot.sh similarity index 100% rename from language/gpt-oss/setup_enroot.sh rename to language/gpt-oss-120b/setup_enroot.sh diff --git a/language/gpt-oss/sglang/run_infer.py b/language/gpt-oss-120b/sglang/run_infer.py similarity index 100% rename from language/gpt-oss/sglang/run_infer.py rename to language/gpt-oss-120b/sglang/run_infer.py diff --git a/language/gpt-oss/sglang/run_server.sh b/language/gpt-oss-120b/sglang/run_server.sh similarity index 100% rename from language/gpt-oss/sglang/run_server.sh rename to language/gpt-oss-120b/sglang/run_server.sh diff --git a/language/gpt-oss/submodules/LiveCodeBench b/language/gpt-oss-120b/submodules/LiveCodeBench similarity index 100% rename from language/gpt-oss/submodules/LiveCodeBench rename to language/gpt-oss-120b/submodules/LiveCodeBench diff --git 
a/language/gpt-oss/submodules/prm800k b/language/gpt-oss-120b/submodules/prm800k similarity index 100% rename from language/gpt-oss/submodules/prm800k rename to language/gpt-oss-120b/submodules/prm800k diff --git a/language/gpt-oss/utils/__init__.py b/language/gpt-oss-120b/utils/__init__.py similarity index 100% rename from language/gpt-oss/utils/__init__.py rename to language/gpt-oss-120b/utils/__init__.py diff --git a/language/gpt-oss/utils/tokenization.py b/language/gpt-oss-120b/utils/tokenization.py similarity index 100% rename from language/gpt-oss/utils/tokenization.py rename to language/gpt-oss-120b/utils/tokenization.py From 1a5fda64223e8363cad23825ebe2e966377e9d4b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Dec 2025 13:08:42 +0000 Subject: [PATCH 148/165] [Automated Commit] Format Codebase --- .../gpt-oss-120b/backends/sglang_backend.py | 66 ++++++++++++------- language/gpt-oss-120b/mlperf/base_sut.py | 12 ++-- 2 files changed, 48 insertions(+), 30 deletions(-) diff --git a/language/gpt-oss-120b/backends/sglang_backend.py b/language/gpt-oss-120b/backends/sglang_backend.py index c17ab37451..83fd12f3ad 100644 --- a/language/gpt-oss-120b/backends/sglang_backend.py +++ b/language/gpt-oss-120b/backends/sglang_backend.py @@ -53,19 +53,23 @@ def initialize(self) -> None: return logger.info(f"Connecting to SGLang server at {self.server_url}") - logger.info(f"Configuring connection pool with max_pool_size={self.max_pool_size}") + logger.info( + f"Configuring connection pool with max_pool_size={self.max_pool_size}") # Create session with larger connection pool for high concurrency # Default pool size is 10, but we may have 100s-1000s of concurrent # requests self.session = requests.Session() # Increase connection pool size to support high concurrency - # pool_maxsize should be >= max_concurrency to avoid "pool is full" warnings + # pool_maxsize should be >= max_concurrency to avoid "pool is full" + # warnings adapter = requests.adapters.HTTPAdapter( - pool_connections=min(100, self.max_pool_size // 10), # Number of connection pools to cache + # Number of connection pools to cache + pool_connections=min(100, self.max_pool_size // 10), pool_maxsize=self.max_pool_size, # Maximum number of connections in the pool max_retries=3, # Retry failed requests - pool_block=False # Don't block when pool is full, create new connections + # Don't block when pool is full, create new connections + pool_block=False ) self.session.mount('http://', adapter) self.session.mount('https://', adapter) @@ -228,7 +232,7 @@ async def generate_stream( - is_finished: True if generation is complete - accumulated_token_ids: All tokens generated so far - metadata: Additional info (TTFT, completion_tokens, etc.) 
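Sketch for context, not part of the patch: the connection-pool configuration reformatted in sglang_backend.py above amounts to mounting a larger HTTPAdapter on a shared requests.Session; max_pool_size below is an assumed placeholder value.

import requests

def make_pooled_session(max_pool_size: int = 1024) -> requests.Session:
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(
        pool_connections=min(100, max_pool_size // 10),  # number of pools to cache
        pool_maxsize=max_pool_size,   # connections kept alive per pool
        max_retries=3,                # retry transient failures
        pool_block=False,             # open extra connections instead of blocking
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session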
- + Note: SGLang's streaming API behavior: - Returns 'output_ids', 'text', and 'meta_info' in each chunk @@ -302,49 +306,63 @@ async def generate_stream( delta_text = chunk.get("text", "") # Check if this is the final chunk - # SGLang uses 'finish_reason' in meta_info, not 'finished' flag + # SGLang uses 'finish_reason' in meta_info, not + # 'finished' flag meta_info = chunk.get("meta_info", {}) finish_reason = meta_info.get("finish_reason") - is_finished = (finish_reason is not None and finish_reason != "null") or chunk.get("finished", False) + is_finished = ( + finish_reason is not None and finish_reason != "null") or chunk.get( + "finished", False) # Extract token information from chunk # SGLang's output_ids can have retractions, so use meta_info.completion_tokens # which is the reliable cumulative count chunk_output_ids = chunk.get("output_ids", []) - completion_tokens = meta_info.get("completion_tokens", 0) - + completion_tokens = meta_info.get( + "completion_tokens", 0) + if completion_tokens > 0: - # Use completion_tokens as the authoritative count + # Use completion_tokens as the authoritative + # count previous_count = len(accumulated_token_ids) - + if completion_tokens > previous_count: # New tokens generated num_new_tokens = completion_tokens - previous_count - - if chunk_output_ids and len(chunk_output_ids) >= num_new_tokens: + + if chunk_output_ids and len( + chunk_output_ids) >= num_new_tokens: # Use actual token IDs from chunk - delta_token_ids = chunk_output_ids[-num_new_tokens:] if num_new_tokens > 0 else [] + delta_token_ids = chunk_output_ids[-num_new_tokens:] if num_new_tokens > 0 else [ + ] else: - # Fallback: create placeholder tokens for counting - delta_token_ids = list(range(previous_count, completion_tokens)) - - accumulated_token_ids.extend(delta_token_ids) + # Fallback: create placeholder tokens + # for counting + delta_token_ids = list( + range(previous_count, completion_tokens)) + + accumulated_token_ids.extend( + delta_token_ids) else: delta_token_ids = [] - + else: - # No completion_tokens - fallback to output_ids or text estimation + # No completion_tokens - fallback to output_ids + # or text estimation if chunk_output_ids: delta_token_ids = chunk_output_ids - accumulated_token_ids.extend(delta_token_ids) + accumulated_token_ids.extend( + delta_token_ids) elif delta_text: # Estimate from text length - estimated_tokens = max(1, len(delta_text) // 4) + estimated_tokens = max( + 1, len(delta_text) // 4) delta_token_ids = [0] * estimated_tokens - accumulated_token_ids.extend(delta_token_ids) + accumulated_token_ids.extend( + delta_token_ids) else: delta_token_ids = [] - + # Accumulate text if delta_text: accumulated_text += delta_text diff --git a/language/gpt-oss-120b/mlperf/base_sut.py b/language/gpt-oss-120b/mlperf/base_sut.py index f96af57e16..1919d56a2a 100644 --- a/language/gpt-oss-120b/mlperf/base_sut.py +++ b/language/gpt-oss-120b/mlperf/base_sut.py @@ -35,10 +35,10 @@ def __init__( self.sut = None self.results = {} self.progress_bar = progress_bar - + # Graceful shutdown support (set on KeyboardInterrupt) self.should_stop = threading.Event() - + logger.info(f"Initializing {self.name}") @abc.abstractmethod @@ -74,17 +74,17 @@ def start(self) -> lg.ConstructSUT: def stop(self) -> None: """Stop the SUT and clean up resources. - + Signals graceful shutdown and allows subclasses to cancel pending work. 
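Sketch for context, not part of the patch: subclasses of BaseSUT are expected to poll the should_stop event that stop() sets. A hedged example of such a worker loop; the queue, timeout, and handler are assumptions rather than the repository's actual implementation.

import queue
import threading
from typing import Any, Callable

def worker_loop(should_stop: threading.Event,
                work_queue: "queue.Queue[Any]",
                handle: Callable[[Any], None]) -> None:
    # Use a short get() timeout so the stop flag is re-checked between polls.
    while not should_stop.is_set():
        try:
            item = work_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        handle(item)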
""" logger.info(f"Stopping {self.name}...") - + # Signal all workers/tasks to stop self.should_stop.set() - + # Subclasses should override to add their own cleanup # (e.g., cancel tasks, clear queues) - + if self.sut: lg.DestroySUT(self.sut) self.sut = None From 999fc89d451490056c1ff918b0fd02fbc12e5344 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 13:10:43 +0000 Subject: [PATCH 149/165] update README --- language/gpt-oss-120b/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss-120b/README.md b/language/gpt-oss-120b/README.md index 49e438e3fa..71287b0473 100644 --- a/language/gpt-oss-120b/README.md +++ b/language/gpt-oss-120b/README.md @@ -3,12 +3,13 @@ This is the reference implementation for GPT-OSS-120B. This is a proposal and is ## Model and Dataset download +#### TODO: Replace this with mlc download link when available + * Model: `openai/gpt-oss-120b`, commit id: [`b5c939d`](https://huggingface.co/openai/gpt-oss-120b/tree/b5c939de8f754692c1647ca79fbf85e8c1e70f8a) * Dataset: Please request access at [this link](https://drive.google.com/drive/folders/1DCfEXHqe69okrqKbSyV-8VUw413JqpPY?usp=drive_link) - **this is a tentative dataset** Datasets are now provided in **Parquet format** (recommended) for better performance and smaller file size (50% smaller than pickle). Pickle format is still supported for backward compatibility. - ## Environment setup Work on reference implementation is done using the sglang containers at [https://hub.docker.com/r/lmsysorg/sglang/tags](https://hub.docker.com/r/lmsysorg/sglang/tags). For enroot setup, a script is provided under [`setup_enroot.sh`](./setup_enroot.sh). For all sections below, we shall assume this environment is instantiated. From 4983645d2097d36a981254a6f607540d1d228b8b Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 05:15:58 -0800 Subject: [PATCH 150/165] remove archive --- .../archive/collect_results_csv.py | 295 -------------- language/gpt-oss-120b/archive/plot.py | 354 ----------------- language/gpt-oss-120b/archive/plot_results.py | 355 ----------------- .../gpt-oss-120b/archive/summarize_eval.py | 371 ------------------ 4 files changed, 1375 deletions(-) delete mode 100755 language/gpt-oss-120b/archive/collect_results_csv.py delete mode 100644 language/gpt-oss-120b/archive/plot.py delete mode 100755 language/gpt-oss-120b/archive/plot_results.py delete mode 100644 language/gpt-oss-120b/archive/summarize_eval.py diff --git a/language/gpt-oss-120b/archive/collect_results_csv.py b/language/gpt-oss-120b/archive/collect_results_csv.py deleted file mode 100755 index 10c93a2a37..0000000000 --- a/language/gpt-oss-120b/archive/collect_results_csv.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python3 -""" -Collect results from multiple summarize_eval.py JSON outputs into a CSV. - -The CSV format shows: -- Each row represents one dataset from one JSON file -- Columns: run_1, run_2, ..., run_k, pass@k -- Values are the "correct" counts (number of correct answers) -""" - -import argparse -import json -import csv -import sys -import glob -from pathlib import Path -from typing import List, Dict, Any -from collections import defaultdict - - -def expand_glob_patterns(patterns: List[str]) -> List[str]: - """Expand glob patterns to actual file paths. 
- - Args: - patterns: List of file paths or glob patterns (e.g., '*.json', 'results_*_summarize.json') - - Returns: - List of actual file paths (sorted) - """ - expanded_files = [] - - for pattern in patterns: - # If it's a literal file path that exists, use it directly - if Path(pattern).exists() and not any( - c in pattern for c in ['*', '?', '[', ']']): - expanded_files.append(pattern) - else: - # Try to expand as a glob pattern - matches = glob.glob(pattern) - if matches: - expanded_files.extend(matches) - else: - # If no matches and it's not a glob pattern, report the file as - # missing - if not any(c in pattern for c in ['*', '?', '[', ']']): - print( - f"Warning: File not found: {pattern}", - file=sys.stderr) - else: - print( - f"Warning: No files matched pattern: {pattern}", - file=sys.stderr) - - # Remove duplicates and sort - return sorted(set(expanded_files)) - - -def load_json_summary(json_path: str) -> Dict[str, Any]: - """Load a JSON summary file.""" - with open(json_path, 'r') as f: - return json.load(f) - - -def extract_results(json_data: Dict[str, Any]) -> Dict[str, Dict[str, int]]: - """Extract per-pass and aggregated correct counts by dataset. - - Returns: - Dictionary mapping dataset name to results: - { - 'aime': { - 'run_1': 735, - 'run_2': 740, - ... - 'pass@k': 875 - } - } - """ - pass_k = json_data['pass_k'] - results = defaultdict(dict) - overall_results = {} - - # Extract per-pass results - if 'per_pass_results' in json_data: - for pass_result in json_data['per_pass_results']: - pass_num = pass_result['pass_number'] - run_label = f"run_{pass_num + 1}" # Convert 0-indexed to 1-indexed - - # Calculate sum of individual datasets for verification - sum_correct = 0 - for dataset_stat in pass_result['datasets']: - dataset_name = dataset_stat['dataset'] - correct = dataset_stat['correct'] - results[dataset_name][run_label] = correct - sum_correct += correct - - # Extract overall from JSON - if 'overall' in pass_result: - overall_correct = pass_result['overall']['correct'] - overall_results[run_label] = overall_correct - - # Assert that the sum matches the overall - assert sum_correct == overall_correct, ( - f"Mismatch in {run_label}: sum of datasets ({sum_correct}) != " - f"overall ({overall_correct})" - ) - - # Extract aggregated results - if 'aggregated_results' in json_data: - # Calculate sum of individual datasets for verification - sum_correct = 0 - for dataset_stat in json_data['aggregated_results']['datasets']: - dataset_name = dataset_stat['dataset'] - correct = dataset_stat['correct'] - results[dataset_name][f'pass@{pass_k}'] = correct - sum_correct += correct - - # Extract overall from JSON - if 'overall' in json_data['aggregated_results']: - overall_correct = json_data['aggregated_results']['overall']['correct'] - overall_results[f'pass@{pass_k}'] = overall_correct - - # Assert that the sum matches the overall - assert sum_correct == overall_correct, ( - f"Mismatch in pass@{pass_k}: sum of datasets ({sum_correct}) != " - f"overall ({overall_correct})" - ) - - # Handle single-pass results - elif 'results' in json_data: - # Calculate sum of individual datasets for verification - sum_correct = 0 - for dataset_stat in json_data['results']['datasets']: - dataset_name = dataset_stat['dataset'] - correct = dataset_stat['correct'] - results[dataset_name]['run_1'] = correct - results[dataset_name]['pass@1'] = correct - sum_correct += correct - - # Extract overall from JSON if available - if 'overall' in json_data['results']: - overall_correct = 
json_data['results']['overall']['correct'] - overall_results['run_1'] = overall_correct - overall_results['pass@1'] = overall_correct - - # Assert that the sum matches the overall - assert sum_correct == overall_correct, ( - f"Mismatch in run_1: sum of datasets ({sum_correct}) != " - f"overall ({overall_correct})" - ) - - # Add overall results - if overall_results: - results['overall'] = overall_results - - return dict(results) - - -def collect_to_csv(json_files: List[str], output_csv: str, - dataset_order: List[str] = None): - """Collect results from multiple JSON files into a CSV. - - Args: - json_files: List of JSON file paths - output_csv: Output CSV file path - dataset_order: Optional list to specify dataset order (e.g., ['aime', 'gpqa', 'livecodebench']) - """ - all_results = [] - pass_k = None - - # Load all JSON files - for json_path in json_files: - json_data = load_json_summary(json_path) - - # Determine pass@k value - if pass_k is None: - pass_k = json_data['pass_k'] - elif pass_k != json_data['pass_k']: - print(f"Warning: {json_path} has pass@{json_data['pass_k']} but expected pass@{pass_k}", - file=sys.stderr) - - # Extract results - results = extract_results(json_data) - all_results.append({ - 'source_file': json_path, - 'results': results - }) - - if not all_results: - print("Error: No results to process", file=sys.stderr) - return - - # Determine column order - run_columns = [f"run_{i+1}" for i in range(pass_k)] - pass_column = f"pass@{pass_k}" - columns = ['dataset'] + run_columns + [pass_column] - - # Collect all unique datasets - all_datasets = set() - for result in all_results: - all_datasets.update(result['results'].keys()) - - # Sort datasets (use provided order or alphabetical) - # Always put 'overall' at the end - all_datasets_no_overall = all_datasets - {'overall'} - - if dataset_order: - # Use provided order, put remaining datasets at the end - sorted_datasets = [] - for ds in dataset_order: - if ds.lower() in [d.lower() for d in all_datasets_no_overall]: - # Find the actual dataset name (case-sensitive) - actual_name = next( - d for d in all_datasets_no_overall if d.lower() == ds.lower()) - sorted_datasets.append(actual_name) - # Add any datasets not in the order list (excluding 'overall') - remaining = sorted( - [d for d in all_datasets_no_overall if d not in sorted_datasets]) - sorted_datasets.extend(remaining) - else: - sorted_datasets = sorted(all_datasets_no_overall) - - # Add 'overall' at the end if it exists - if 'overall' in all_datasets: - sorted_datasets.append('overall') - - # Write CSV - with open(output_csv, 'w', newline='') as f: - writer = csv.writer(f) - - # Write header - writer.writerow(columns) - - # Write data rows - for result in all_results: - for dataset in sorted_datasets: - if dataset in result['results']: - row = [dataset] - dataset_results = result['results'][dataset] - - # Add run columns - for run_col in run_columns: - row.append(dataset_results.get(run_col, '')) - - # Add pass@k column - row.append(dataset_results.get(pass_column, '')) - - writer.writerow(row) - - print(f"CSV saved to: {output_csv}") - print( - f"Collected {len(all_results)} result sets across {len(sorted_datasets)} datasets") - - -def main(): - parser = argparse.ArgumentParser( - description="Collect multiple JSON summaries into a CSV. 
Supports glob patterns.", - epilog="Examples:\n" - " %(prog)s results_*_summarize.json\n" - " %(prog)s data/*.json -o output.csv\n" - " %(prog)s run1.json run2.json run3.json --dataset-order aime gpqa livecodebench", - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument("json_files", nargs='+', - help="One or more JSON files or glob patterns (e.g., '*.json', 'results_*_summarize.json')") - parser.add_argument("-o", "--output", default="collected_results.csv", - help="Output CSV file (default: collected_results.csv)") - parser.add_argument("--dataset-order", nargs='*', - help="Optional dataset order (e.g., aime gpqa livecodebench)") - - args = parser.parse_args() - - # Expand glob patterns - expanded_files = expand_glob_patterns(args.json_files) - - if not expanded_files: - print( - "Error: No JSON files found matching the provided patterns", - file=sys.stderr) - sys.exit(1) - - print(f"Found {len(expanded_files)} JSON files:") - for f in expanded_files: - print(f" - {f}") - print() - - try: - collect_to_csv(expanded_files, args.output, args.dataset_order) - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - import traceback - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/language/gpt-oss-120b/archive/plot.py b/language/gpt-oss-120b/archive/plot.py deleted file mode 100644 index d85d470c22..0000000000 --- a/language/gpt-oss-120b/archive/plot.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python3 -""" -Histogram analysis of token input length (ISL) and output length (OSL) across datasets. -Creates 8 histograms as specified. -""" - -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -from pathlib import Path -import argparse - - -def load_data(pkl_path): - """Load the pickle file and return the DataFrame.""" - print(f"Loading data from {pkl_path}...") - df = pd.read_pickle(pkl_path) - print(f"Loaded {len(df)} rows with columns: {list(df.columns)}") - return df - - -def create_per_dataset_histogram(df, column_name, title, filename, output_dir): - """Create individual histograms for each dataset.""" - datasets = sorted(df['dataset'].unique()) - print(f"Creating {filename}...") - print(f" Datasets: {datasets}") - print(f" Total samples: {len(df)}") - - # Determine grid layout - n_datasets = len(datasets) - n_cols = 3 - n_rows = (n_datasets + n_cols - 1) // n_cols - - fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows)) - if n_datasets == 1: - axes = np.array([axes]) - axes = axes.flatten() - - for i, dataset in enumerate(datasets): - ax = axes[i] - dataset_data = df[df['dataset'] == dataset][column_name] - - # Create histogram - ax.hist( - dataset_data, - bins=30, - alpha=0.7, - edgecolor='black', - linewidth=0.5, - color='skyblue' if 'OSL' in title else 'lightcoral') - - ax.set_title( - f'{dataset}\n(n={len(dataset_data)})', - fontsize=12, - fontweight='bold') - ax.set_xlabel(title, fontsize=10) - ax.set_ylabel('Frequency', fontsize=10) - ax.grid(True, alpha=0.3) - - # Add percentile lines - p50 = dataset_data.quantile(0.50) - p75 = dataset_data.quantile(0.75) - p99 = dataset_data.quantile(0.99) - - ax.axvline( - p50, - color='green', - linestyle='--', - linewidth=2, - alpha=0.7, - label=f'50th: {p50:.1f}') - ax.axvline( - p75, - color='orange', - linestyle='--', - linewidth=2, - alpha=0.7, - label=f'75th: {p75:.1f}') - ax.axvline( - p99, - color='red', - linestyle='--', - linewidth=2, - alpha=0.7, - label=f'99th: {p99:.1f}') - ax.legend(loc='upper 
right', fontsize=8) - - # Add statistics - mean_val = dataset_data.mean() - median_val = dataset_data.median() - std_val = dataset_data.std() - stats_text = f'Mean: {mean_val:.1f}\nMedian: {median_val:.1f}\nStd: {std_val:.1f}' - ax.text(0.98, 0.78, stats_text, - transform=ax.transAxes, - verticalalignment='top', - horizontalalignment='right', - bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8), - fontsize=9) - - # Hide unused subplots - for i in range(n_datasets, len(axes)): - axes[i].set_visible(False) - - plt.tight_layout() - plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') - print(f" Saved to {output_dir}/{filename}") - plt.close() - - -def create_full_histogram(df, column_name, title, - filename, output_dir, save_bins=False): - """Create a single histogram combining all datasets.""" - print(f"Creating {filename}...") - print(f" Total samples: {len(df)}") - - plt.figure(figsize=(12, 8)) - - color = 'skyblue' if 'OSL' in title else 'lightcoral' - counts, bin_edges, patches = plt.hist( - df[column_name], - bins=50, - alpha=0.7, - edgecolor='black', - linewidth=0.5, - color=color) - - plt.title(title, fontsize=14, fontweight='bold') - plt.xlabel( - column_name.replace( - 'tok_', - '').replace( - '_len', - '').upper(), - fontsize=12) - plt.ylabel('Frequency', fontsize=12) - plt.grid(True, alpha=0.3) - - # Add percentile lines - p50 = df[column_name].quantile(0.50) - p75 = df[column_name].quantile(0.75) - p99 = df[column_name].quantile(0.99) - - plt.axvline(p50, color='green', linestyle='--', linewidth=2, - alpha=0.7, label=f'50th percentile: {p50:.1f}') - plt.axvline(p75, color='orange', linestyle='--', linewidth=2, - alpha=0.7, label=f'75th percentile: {p75:.1f}') - plt.axvline(p99, color='red', linestyle='--', linewidth=2, - alpha=0.7, label=f'99th percentile: {p99:.1f}') - plt.legend(loc='upper right', fontsize=10) - - # Add statistics - mean_val = df[column_name].mean() - median_val = df[column_name].median() - std_val = df[column_name].std() - min_val = df[column_name].min() - max_val = df[column_name].max() - - stats_text = f'Total samples: {len(df)}\n' - stats_text += f'Mean: {mean_val:.1f}\n' - stats_text += f'Median: {median_val:.1f}\n' - stats_text += f'Std: {std_val:.1f}\n' - stats_text += f'Min: {min_val}\n' - stats_text += f'Max: {max_val}' - - plt.text(0.98, 0.78, stats_text, - transform=plt.gca().transAxes, - verticalalignment='top', - horizontalalignment='right', - fontsize=10, - bbox=dict(boxstyle='round', facecolor='lightblue' if 'OSL' in title else 'lightcoral', alpha=0.8)) - - plt.tight_layout() - plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight') - print(f" Saved to {output_dir}/{filename}") - plt.close() - - # Save bin data to CSV if requested - if save_bins: - csv_filename = filename.replace('.png', '_bins.csv') - - # Create bin data DataFrame - bin_data = pd.DataFrame({ - 'bin_lower': bin_edges[:-1], - 'bin_upper': bin_edges[1:], - 'bin_center': (bin_edges[:-1] + bin_edges[1:]) / 2, - 'count': counts.astype(int) - }) - - csv_path = f'{output_dir}/{csv_filename}' - - # Save with header containing percentile information - with open(csv_path, 'w') as f: - f.write( - f'# Percentiles: 50th={p50:.2f}, 75th={p75:.2f}, 99th={p99:.2f}\n') - f.write( - f'# Mean={mean_val:.2f}, Median={median_val:.2f}, Std={std_val:.2f}\n') - f.write( - f'# Min={min_val}, Max={max_val}, Total samples={len(df)}\n') - bin_data.to_csv(f, index=False) - - print(f" Saved bin data to {csv_path}") - - -def main(): - parser = argparse.ArgumentParser( - 
description='Create histograms of token lengths (ISL and OSL)') - parser.add_argument('pkl_path', help='Path to the pickle file') - parser.add_argument( - '--output-dir', - default='histograms', - help='Output directory for plots') - - args = parser.parse_args() - - # Create output directory - Path(args.output_dir).mkdir(exist_ok=True) - - # Load data - df = load_data(args.pkl_path) - - # Check if dataset column exists - has_dataset = 'dataset' in df.columns - if not has_dataset: - print("\nNote: 'dataset' column not found - skipping per-dataset histograms") - # Add a dummy dataset column for compatibility with existing code - df['dataset'] = 'default' - - # Check if prompt_accuracy column exists - has_accuracy = 'prompt_accuracy' in df.columns - - # Determine which output length column to use - if 'tok_model_output_len' in df.columns: - output_len_col = 'tok_model_output_len' - elif 'tok_model_output_len_0' in df.columns: - output_len_col = 'tok_model_output_len_0' - print("\nNote: 'tok_model_output_len' not found, using 'tok_model_output_len_0' instead") - else: - raise ValueError( - "Neither 'tok_model_output_len' nor 'tok_model_output_len_0' column found in data") - - if has_accuracy: - # Filter for 100% accuracy - df_100 = df[df['prompt_accuracy'] == 100.0].copy() - print( - f"\nFiltered {len(df_100)} rows with prompt_accuracy == 100 (out of {len(df)} total)\n") - else: - print("\nNote: 'prompt_accuracy' column not found - skipping accuracy-based histograms\n") - # Create empty dataframe with dataset column for consistency - df_100 = pd.DataFrame(columns=df.columns) - - print("=" * 60) - print("CREATING ISL HISTOGRAMS") - print("=" * 60) - - # 1. Per dataset ISL histogram - if has_dataset: - create_per_dataset_histogram( - df, 'tok_input_len', - 'Token Input Length (ISL)', - '1_per_dataset_ISL.png', - args.output_dir) - else: - print("Skipping per-dataset ISL: dataset column not found") - - # 2. Per dataset ISL histogram (accuracy == 100) - if has_dataset and has_accuracy and len(df_100) > 0: - create_per_dataset_histogram( - df_100, 'tok_input_len', - 'Token Input Length (ISL) - 100% Accuracy', - '2_per_dataset_ISL_acc100.png', - args.output_dir) - elif not has_dataset: - print("Skipping per-dataset ISL (acc==100): dataset column not found") - elif not has_accuracy: - print("Skipping per-dataset ISL (acc==100): prompt_accuracy column not found") - else: - print("Skipping per-dataset ISL (acc==100): no data with 100% accuracy") - - # 3. Full ISL histogram - create_full_histogram( - df, 'tok_input_len', - 'Token Input Length (ISL) - All Data', - '3_full_ISL.png', - args.output_dir, - save_bins=True) - - # 4. Full ISL histogram (accuracy == 100) - if has_accuracy and len(df_100) > 0: - create_full_histogram( - df_100, 'tok_input_len', - 'Token Input Length (ISL) - 100% Accuracy', - '4_full_ISL_acc100.png', - args.output_dir) - elif has_accuracy: - print("Skipping full ISL (acc==100): no data with 100% accuracy") - else: - print("Skipping full ISL (acc==100): prompt_accuracy column not found") - - print("\n" + "=" * 60) - print("CREATING OSL HISTOGRAMS") - print("=" * 60) - - # 5. Per dataset OSL histogram - if has_dataset: - create_per_dataset_histogram( - df, output_len_col, - 'Token Output Length (OSL)', - '5_per_dataset_OSL.png', - args.output_dir) - else: - print("Skipping per-dataset OSL: dataset column not found") - - # 6. 
Per dataset OSL histogram (accuracy == 100) - if has_dataset and has_accuracy and len(df_100) > 0: - create_per_dataset_histogram( - df_100, output_len_col, - 'Token Output Length (OSL) - 100% Accuracy', - '6_per_dataset_OSL_acc100.png', - args.output_dir) - elif not has_dataset: - print("Skipping per-dataset OSL (acc==100): dataset column not found") - elif not has_accuracy: - print("Skipping per-dataset OSL (acc==100): prompt_accuracy column not found") - else: - print("Skipping per-dataset OSL (acc==100): no data with 100% accuracy") - - # 7. Full OSL histogram - create_full_histogram( - df, output_len_col, - 'Token Output Length (OSL) - All Data', - '7_full_OSL.png', - args.output_dir, - save_bins=True) - - # 8. Full OSL histogram (accuracy == 100) - if has_accuracy and len(df_100) > 0: - create_full_histogram( - df_100, output_len_col, - 'Token Output Length (OSL) - 100% Accuracy', - '8_full_OSL_acc100.png', - args.output_dir) - elif has_accuracy: - print("Skipping full OSL (acc==100): no data with 100% accuracy") - else: - print("Skipping full OSL (acc==100): prompt_accuracy column not found") - - print(f"\n{'=' * 60}") - print(f"All histograms saved to {args.output_dir}/") - print(f"{'=' * 60}") - - -if __name__ == "__main__": - main() diff --git a/language/gpt-oss-120b/archive/plot_results.py b/language/gpt-oss-120b/archive/plot_results.py deleted file mode 100755 index d3cc889825..0000000000 --- a/language/gpt-oss-120b/archive/plot_results.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate grouped box plots from collected results CSV. - -Creates two plots: -1. Individual runs box plot (run_1, run_2, ..., run_k) -2. Pass@k box plot -""" - -import argparse -import sys -import csv -import re -from pathlib import Path -from typing import Dict, List -from collections import defaultdict -import matplotlib.pyplot as plt -import numpy as np - - -def load_csv_data(csv_path: str) -> Dict[str, Dict[str, List[float]]]: - """Load CSV data and organize by dataset. - - Returns: - { - 'aime': { - 'run_1': [735, 752, 765, ...], - 'run_2': [740, 754, 765, ...], - 'pass@5': [875, 875, 885, ...], - 'pass@1 with 5 repeats: (average of run_k)': [861, 857, ...] - }, - 'gpqa': {...}, - ... - } - """ - data = defaultdict(lambda: defaultdict(list)) - - with open(csv_path, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - dataset = row['dataset'] - for key, value in row.items(): - if key != 'dataset' and value: - try: - # Try to parse as float to support decimal values - data[dataset][key].append(float(value)) - except ValueError: - continue - - return dict(data) - - -def create_combined_box_plot(dataset_name: str, - dataset_data: Dict[str, List[float]], - run_columns: List[str], - passk_columns: List[str], - output_file: str, - ylabel: str = "Correct Count"): - """Create separate box plots for individual runs, pass@k, and computed averages in the same figure. 
- - Args: - dataset_name: Name of the dataset - dataset_data: Data for this dataset (column -> list of values) - run_columns: Individual run columns to combine (e.g., ['run_1', 'run_2', ...]) - passk_columns: Pass@k columns (e.g., ['pass@5']) - output_file: Output file path - ylabel: Y-axis label - """ - # Combine all individual runs into one list - all_runs_data = [] - for col in run_columns: - if col in dataset_data and dataset_data[col]: - all_runs_data.extend(dataset_data[col]) - - # Collect pass@k data - passk_data = [] - for col in passk_columns: - if col in dataset_data and dataset_data[col]: - passk_data.extend(dataset_data[col]) - - # Compute averages from individual runs (average across runs for each - # trial) - average_data = [] - if run_columns: - # Find the number of trials (minimum length across all run columns) - num_trials = min(len(dataset_data.get(col, [])) - for col in run_columns if col in dataset_data) - - # For each trial, compute the average across all runs - for trial_idx in range(num_trials): - trial_values = [] - for col in run_columns: - if col in dataset_data and trial_idx < len(dataset_data[col]): - trial_values.append(dataset_data[col][trial_idx]) - - if trial_values: - average_data.append(np.mean(trial_values)) - - if not all_runs_data and not passk_data and not average_data: - print(f"Warning: No data to plot for {dataset_name}") - return - - # Determine number of subplots needed - num_plots = 0 - if all_runs_data: - num_plots += 1 - if passk_data: - num_plots += 1 - if average_data: - num_plots += 1 - - if num_plots == 0: - print(f"Warning: No data to plot for {dataset_name}") - return - - # Create figure with subplots side by side - fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 6)) - - # Make axes iterable even if there's only one subplot - if num_plots == 1: - axes = [axes] - - plot_idx = 0 - - # Plot individual runs - if all_runs_data: - ax = axes[plot_idx] - plot_idx += 1 - - bp = ax.boxplot([all_runs_data], positions=[0], widths=0.5, - patch_artist=True, showmeans=True, - whis=[0, 100], showfliers=False, - meanprops=dict(marker='D', markerfacecolor='red', - markeredgecolor='red', markersize=8)) - - # Color the box - bp['boxes'][0].set_facecolor(plt.cm.Set3(0.2)) - bp['boxes'][0].set_alpha(0.7) - - # Add scatter plot of individual points - # Add small random jitter to x-position for visibility - np.random.seed(42) # For reproducibility - x_jitter = np.random.normal(0, 0.04, size=len(all_runs_data)) - ax.scatter(x_jitter, all_runs_data, alpha=0.4, s=30, - color='darkblue', zorder=3, edgecolors='black', linewidth=0.5) - - # Set labels - ax.set_xticks([0]) - ax.set_xticklabels(['Individual Runs'], fontsize=12) - ax.set_ylabel(ylabel, fontsize=12) - ax.set_title( - f"{dataset_name} - Individual Runs", - fontsize=13, - fontweight='bold') - ax.grid(True, axis='y', alpha=0.3, linestyle='--') - - # Add statistics - mean_val = np.mean(all_runs_data) - std_val = np.std(all_runs_data) - min_val = np.min(all_runs_data) - max_val = np.max(all_runs_data) - n_samples = len(all_runs_data) - - stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" - props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) - ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, - verticalalignment='top', bbox=props, family='monospace') - - # Plot pass@k - if passk_data: - ax = axes[plot_idx] - plot_idx += 1 - - passk_label = passk_columns[0] if len(passk_columns) == 1 else 'Pass@k' - - bp = 
ax.boxplot([passk_data], positions=[0], widths=0.5, - patch_artist=True, showmeans=True, - whis=[0, 100], showfliers=False, - meanprops=dict(marker='D', markerfacecolor='red', - markeredgecolor='red', markersize=8)) - - # Color the box - bp['boxes'][0].set_facecolor(plt.cm.Set3(0.6)) - bp['boxes'][0].set_alpha(0.7) - - # Add scatter plot of individual points - # Add small random jitter to x-position for visibility - np.random.seed(42) # For reproducibility - x_jitter = np.random.normal(0, 0.04, size=len(passk_data)) - ax.scatter(x_jitter, passk_data, alpha=0.4, s=30, - color='darkorange', zorder=3, edgecolors='black', linewidth=0.5) - - # Set labels - ax.set_xticks([0]) - ax.set_xticklabels([passk_label], fontsize=12) - ax.set_ylabel(ylabel, fontsize=12) - ax.set_title( - f"{dataset_name} - {passk_label}", - fontsize=13, - fontweight='bold') - ax.grid(True, axis='y', alpha=0.3, linestyle='--') - - # Add statistics - mean_val = np.mean(passk_data) - std_val = np.std(passk_data) - min_val = np.min(passk_data) - max_val = np.max(passk_data) - n_samples = len(passk_data) - - stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val}\nmax={max_val}" - props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) - ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, - verticalalignment='top', bbox=props, family='monospace') - - # Plot computed averages - if average_data: - ax = axes[plot_idx] - - average_label = 'Pass@1 (avg of runs)' - - bp = ax.boxplot([average_data], positions=[0], widths=0.5, - patch_artist=True, showmeans=True, - whis=[0, 100], showfliers=False, - meanprops=dict(marker='D', markerfacecolor='red', - markeredgecolor='red', markersize=8)) - - # Color the box - bp['boxes'][0].set_facecolor(plt.cm.Set3(0.9)) - bp['boxes'][0].set_alpha(0.7) - - # Add scatter plot of individual points - # Add small random jitter to x-position for visibility - np.random.seed(42) # For reproducibility - x_jitter = np.random.normal(0, 0.04, size=len(average_data)) - ax.scatter(x_jitter, average_data, alpha=0.4, s=30, - color='darkgreen', zorder=3, edgecolors='black', linewidth=0.5) - - # Set labels - ax.set_xticks([0]) - ax.set_xticklabels([average_label], fontsize=12) - ax.set_ylabel(ylabel, fontsize=12) - ax.set_title( - f"{dataset_name} - {average_label}", - fontsize=13, - fontweight='bold') - ax.grid(True, axis='y', alpha=0.3, linestyle='--') - - # Add statistics - mean_val = np.mean(average_data) - std_val = np.std(average_data) - min_val = np.min(average_data) - max_val = np.max(average_data) - n_samples = len(average_data) - - stats_text = f"n={n_samples}\nμ={mean_val:.1f}\nσ={std_val:.1f}\nmin={min_val:.1f}\nmax={max_val:.1f}" - props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) - ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9, - verticalalignment='top', bbox=props, family='monospace') - - # Adjust layout - plt.tight_layout() - - # Save figure - plt.savefig(output_file, dpi=300, bbox_inches='tight') - print(f"Saved plot to: {output_file}") - plt.close() - - -def main(): - parser = argparse.ArgumentParser( - description="Generate box plots from collected results CSV") - parser.add_argument("csv_file", - help="Input CSV file from collect_results_csv.py") - parser.add_argument("-o", "--output-dir", default=".", - help="Output directory for plots (default: current directory)") - parser.add_argument("--prefix", default="boxplot", - help="Prefix for output files (default: boxplot)") - - args = parser.parse_args() - - # Check if input 
file exists - if not Path(args.csv_file).exists(): - print(f"Error: File not found: {args.csv_file}", file=sys.stderr) - sys.exit(1) - - # Create output directory if needed - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Load data - print(f"Loading data from: {args.csv_file}") - data = load_csv_data(args.csv_file) - - if not data: - print("Error: No data loaded from CSV", file=sys.stderr) - sys.exit(1) - - print(f"Loaded data for {len(data)} datasets") - - # Determine which columns are runs vs pass@k - all_columns = set() - for dataset_data in data.values(): - all_columns.update(dataset_data.keys()) - - # Separate run columns from pass@k columns - # Use regex to match only exact pass@k format (e.g., pass@5, pass@10) - run_columns = sorted( - [col for col in all_columns if col.startswith('run_')]) - passk_pattern = re.compile(r'^pass@\d+$') - passk_columns = sorted( - [col for col in all_columns if passk_pattern.match(col)]) - - if not run_columns and not passk_columns: - print("Error: No run or pass@k columns found in CSV", file=sys.stderr) - sys.exit(1) - - print(f"Found {len(run_columns)} run columns: {', '.join(run_columns)}") - print( - f"Found {len(passk_columns)} pass@k columns: {', '.join(passk_columns)}") - print(f"Will compute averages from individual runs") - print() - - # Generate plots for each dataset separately - # Always put 'overall' at the end - datasets_no_overall = [d for d in data.keys() if d != 'overall'] - datasets = sorted(datasets_no_overall) - if 'overall' in data: - datasets.append('overall') - - print(f"Generating plots for {len(datasets)} datasets...\n") - - for dataset in datasets: - dataset_data = data[dataset] - - # Create combined plot: Individual Runs (all combined) vs Pass@k vs - # Computed Averages - if run_columns or passk_columns: - output_file = output_dir / f"{args.prefix}_{dataset}.png" - print(f"Creating combined box plot for {dataset}...") - create_combined_box_plot( - dataset_name=dataset, - dataset_data=dataset_data, - run_columns=run_columns, - passk_columns=passk_columns, - output_file=str(output_file), - ylabel="Correct Count" - ) - - print() - - print("Done!") - - -if __name__ == "__main__": - main() diff --git a/language/gpt-oss-120b/archive/summarize_eval.py b/language/gpt-oss-120b/archive/summarize_eval.py deleted file mode 100644 index 739097c68c..0000000000 --- a/language/gpt-oss-120b/archive/summarize_eval.py +++ /dev/null @@ -1,371 +0,0 @@ -#!/usr/bin/env python3 -""" -Summarize evaluation results from eval_accuracy.py output. - -Reads an evaluated pickle file and prints a summary of results by dataset, -including per-pass statistics and aggregated pass@k results. -""" - -import argparse -import pickle -import sys -import json -import glob -from pathlib import Path -from typing import Dict, Any, List -import pandas as pd - - -def expand_glob_patterns(patterns: List[str]) -> List[str]: - """Expand glob patterns to actual file paths. 
- - Args: - patterns: List of file paths or glob patterns (e.g., '*.pkl', 'data/*_evaluated.pkl') - - Returns: - List of actual file paths (sorted) - """ - expanded_files = [] - - for pattern in patterns: - # If it's a literal file path that exists, use it directly - if Path(pattern).exists() and not any( - c in pattern for c in ['*', '?', '[', ']']): - expanded_files.append(pattern) - else: - # Try to expand as a glob pattern - matches = glob.glob(pattern) - if matches: - expanded_files.extend(matches) - else: - # If no matches and it's not a glob pattern, report the file as - # missing - if not any(c in pattern for c in ['*', '?', '[', ']']): - print( - f"Warning: File not found: {pattern}", - file=sys.stderr) - else: - print( - f"Warning: No files matched pattern: {pattern}", - file=sys.stderr) - - # Remove duplicates and sort - return sorted(set(expanded_files)) - - -def detect_pass_k(df: pd.DataFrame) -> int: - """Detect if DataFrame has pass@k format and return k. - - Returns: - Number of passes (k) if pass@k format detected, otherwise 1 - """ - # Check for model_output_0, model_output_1, etc. - pass_k = 0 - while f'model_output_{pass_k}' in df.columns: - pass_k += 1 - - # If no _0 suffix found, check for single model_output column - if pass_k == 0 and 'model_output' in df.columns: - return 1 - - return pass_k - - -def calculate_dataset_stats(df: pd.DataFrame, dataset_name: str, - pass_num: int = None, pass_k: int = 1) -> Dict[str, Any]: - """Calculate statistics for a specific dataset and pass. - - Args: - df: DataFrame with evaluation results - dataset_name: Name of the dataset to filter - pass_num: Pass number (None for aggregated results) - pass_k: Total number of passes (for aggregated results) - - Returns: - Dictionary with statistics - """ - # Filter to this dataset - dataset_df = df[df['dataset'] == dataset_name] - - # Determine column suffixes - if pass_num is None: - # Aggregated results - accuracy_col = 'prompt_accuracy' if 'prompt_accuracy' in dataset_df.columns else 'prompt_accuracy_0' - - # For aggregated pass@k, count answered as any sample with at least one - # extracted answer - if pass_k > 1: - # Check if any pass has an extracted answer - answered_mask = pd.Series( - [False] * len(dataset_df), - index=dataset_df.index) - for i in range(pass_k): - col = f'extracted_answer_{i}' - if col in dataset_df.columns: - answered_mask |= dataset_df[col].notna() - answered = answered_mask.sum() - else: - extracted_col = 'extracted_answer' if 'extracted_answer' in dataset_df.columns else 'extracted_answer_0' - answered = dataset_df[extracted_col].notna().sum() - else: - # Specific pass - suffix = f'_{pass_num}' - extracted_col = f'extracted_answer{suffix}' - accuracy_col = f'prompt_accuracy{suffix}' - answered = dataset_df[extracted_col].notna().sum() - - # Calculate statistics - total = len(dataset_df) - correct = (dataset_df[accuracy_col] > 0).sum() - - # Calculate percentage (correct / total) - if total > 0: - pct_correct = (correct / total) * 100 - else: - pct_correct = 0.0 - - # Calculate mean accuracy (handles HealthBench partial scores) - mean_accuracy = dataset_df[accuracy_col].mean() - - return { - 'dataset': dataset_name, - 'total': int(total), - 'answered': int(answered), - 'correct': int(correct), - 'pct_correct': float(pct_correct), - 'mean_accuracy': float(mean_accuracy), - } - - -def print_summary_table( - stats_list: List[Dict[str, Any]], title: str = "Summary"): - """Print a formatted summary table. 
- - Args: - stats_list: List of statistics dictionaries - title: Title for the table - """ - print(f"\n{'=' * 85}") - print(f"{title}") - print('=' * 85) - print(f"{'Dataset':<20} {'Total':>8} {'Answered':>10} {'Correct':>10} {'Accuracy':>12}") - print('-' * 85) - - for stats in stats_list: - dataset_name = stats['dataset'] - total = stats['total'] - answered = stats['answered'] - correct = stats['correct'] - pct_correct = stats['pct_correct'] - - # Format the row - print( - f"{dataset_name:<20} {total:>8} {answered:>10} {correct:>10} {pct_correct:>11.2f}%") - - # Print totals - if len(stats_list) > 1: - total_samples = sum(s['total'] for s in stats_list) - total_answered = sum(s['answered'] for s in stats_list) - total_correct = sum(s['correct'] for s in stats_list) - overall_pct = ( - total_correct / - total_samples * - 100) if total_samples > 0 else 0.0 - - print('-' * 85) - print(f"{'OVERALL':<20} {total_samples:>8} {total_answered:>10} {total_correct:>10} {overall_pct:>11.2f}%") - - print('=' * 85) - - -def summarize_evaluation(pickle_path: str, json_output: bool = False) -> str: - """Load and summarize evaluation results. - - Args: - pickle_path: Path to evaluated pickle file - json_output: If True, save results to JSON file instead of printing - - Returns: - Path to JSON file if json_output=True, otherwise empty string - """ - # Load the pickle file - print(f"Loading evaluation results from: {pickle_path}") - with open(pickle_path, 'rb') as f: - df = pickle.load(f) - - print(f"Loaded {len(df)} samples") - - # Detect pass@k format - pass_k = detect_pass_k(df) - print(f"Detected format: pass@{pass_k}" if pass_k > - 1 else "Detected format: single-pass") - - # Get list of datasets - datasets = sorted(df['dataset'].unique()) - print(f"Datasets found: {', '.join(datasets)}") - - # Structure to hold all results - results_data = { - 'input_file': pickle_path, - 'total_samples': len(df), - 'pass_k': pass_k, - 'datasets': list(datasets), - } - - # Calculate statistics for each dataset - if pass_k > 1: - # Collect per-pass statistics - per_pass_results = [] - for pass_num in range(pass_k): - stats_list = [] - for dataset in datasets: - stats = calculate_dataset_stats( - df, dataset, pass_num=pass_num, pass_k=pass_k) - stats_list.append(stats) - - print_summary_table(stats_list, title=f"Pass {pass_num} Results") - - per_pass_results.append({ - 'pass_number': pass_num, - 'datasets': stats_list, - 'overall': { - 'total': sum(s['total'] for s in stats_list), - 'answered': sum(s['answered'] for s in stats_list), - 'correct': sum(s['correct'] for s in stats_list), - 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 - } - }) - - results_data['per_pass_results'] = per_pass_results - - # Show aggregated (pass@k) statistics - print("\n") - stats_list = [] - for dataset in datasets: - stats = calculate_dataset_stats( - df, dataset, pass_num=None, pass_k=pass_k) - stats_list.append(stats) - - aggregated_results = { - 'datasets': stats_list, - 'overall': { - 'total': sum(s['total'] for s in stats_list), - 'answered': sum(s['answered'] for s in stats_list), - 'correct': sum(s['correct'] for s in stats_list), - 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 - } - } - results_data['aggregated_results'] = aggregated_results - - # Always print summary table - print_summary_table( - stats_list, - title=f"Aggregated 
Pass@{pass_k} Results (Max Across Passes)") - else: - # Single pass - just show the results - stats_list = [] - for dataset in datasets: - stats = calculate_dataset_stats( - df, dataset, pass_num=None, pass_k=pass_k) - stats_list.append(stats) - - single_pass_results = { - 'datasets': stats_list, - 'overall': { - 'total': sum(s['total'] for s in stats_list), - 'answered': sum(s['answered'] for s in stats_list), - 'correct': sum(s['correct'] for s in stats_list), - 'accuracy': (sum(s['correct'] for s in stats_list) / sum(s['total'] for s in stats_list) * 100) if sum(s['total'] for s in stats_list) > 0 else 0.0 - } - } - results_data['results'] = single_pass_results - - # Always print summary table - print_summary_table(stats_list, title="Evaluation Results") - - # Print column information for reference - print("\nColumn Information:") - print(f" - Total: Total number of samples in the dataset") - if pass_k > 1: - print(f" - Answered: Number of samples with at least one extracted answer across all passes") - else: - print(f" - Answered: Number of samples with extracted answers") - print(f" - Correct: Number of correct answers (accuracy > 0)") - print(f" - Accuracy: Percentage of total samples that were correct (correct / total)") - - if pass_k > 1: - print(f"\nPass@{pass_k} Note:") - print(f" - Per-pass results show individual pass performance") - print( - f" - Aggregated results show the maximum accuracy achieved across all {pass_k} passes") - print( - f" - A sample is considered correct if ANY of the {pass_k} attempts were correct") - print( - f" - A sample is considered answered if ANY of the {pass_k} attempts extracted an answer") - - # Save to JSON if requested - if json_output: - # Generate output filename: input_file_summarize.json - input_path = Path(pickle_path) - output_filename = input_path.stem + "_summarize.json" - output_path = input_path.parent / output_filename - - with open(output_path, 'w') as f: - json.dump(results_data, f, indent=2) - - print(f"\nSummary saved to: {output_path}") - return str(output_path) - - return "" - - -def main(): - parser = argparse.ArgumentParser( - description="Summarize evaluation results by dataset. Supports glob patterns.", - epilog="Examples:\n" - " %(prog)s results_evaluated.pkl\n" - " %(prog)s data/*_evaluated.pkl\n" - " %(prog)s --json data/accuracy_eval_*_evaluated.pkl", - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument("input_files", nargs='+', - help="One or more paths to evaluated pickle files or glob patterns (e.g., '*.pkl', 'data/*_evaluated.pkl')") - parser.add_argument("--json", action="store_true", - help="Output results in JSON format (for programmatic use)") - - args = parser.parse_args() - - # Expand glob patterns - expanded_files = expand_glob_patterns(args.input_files) - - if not expanded_files: - print( - "Error: No files found matching the provided patterns", - file=sys.stderr) - sys.exit(1) - - print(f"Found {len(expanded_files)} file(s) to process:") - for f in expanded_files: - print(f" - {f}") - print() - - # Process each file - for input_file in expanded_files: - # Check if file has _evaluated suffix (warn if not) - if "_evaluated" not in input_file: - print(f"Warning: Input file does not contain '_evaluated' suffix. 
" - f"Make sure this is an evaluated pickle file from eval_accuracy.py", - file=sys.stderr) - - try: - summarize_evaluation(input_file, json_output=args.json) - print() # Add spacing between files - except Exception as e: - print(f"Error processing file {input_file}: {e}", file=sys.stderr) - import traceback - traceback.print_exc() - # Continue processing other files - continue - - -if __name__ == "__main__": - main() From 34a8c74ac3762a29ac97057ca47bd995ebd34eba Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 13:33:30 +0000 Subject: [PATCH 151/165] update frozen requirements --- language/gpt-oss-120b/requirements.txt | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/language/gpt-oss-120b/requirements.txt b/language/gpt-oss-120b/requirements.txt index aab1573917..9d8f33995a 100644 --- a/language/gpt-oss-120b/requirements.txt +++ b/language/gpt-oss-120b/requirements.txt @@ -1,11 +1,11 @@ -audioread>=2.1.9 -joblib>=0.14 -msgpack>=1.0 -numba>=0.51.0 -pooch>=1.0 -scikit-learn>=0.20.0 -soxr>=0.3.2 -absl-py>=2.3.1 -lazy-loader>=0.1 -datasets>=2.0.0,<3.0.0 -anthropic~=0.72.0 +absl-py==2.3.1 +anthropic==0.72.0 +audioread==3.1.0 +datasets==2.21.0 +joblib==1.5.3 +lazy_loader==0.4 +msgpack==1.1.2 +numba==0.63.1 +pooch==1.8.2 +scikit-learn==1.8.0 +soxr==1.0.0 From 02206f7176c5e40b69160a8dd43accce42bab729 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 17:51:18 +0000 Subject: [PATCH 152/165] rm harmonize script + fix score calculation --- language/gpt-oss-120b/eval_mlperf_accuracy.py | 302 +++------- .../preprocess/harmonize_inputs.py | 536 ------------------ 2 files changed, 89 insertions(+), 749 deletions(-) delete mode 100644 language/gpt-oss-120b/preprocess/harmonize_inputs.py diff --git a/language/gpt-oss-120b/eval_mlperf_accuracy.py b/language/gpt-oss-120b/eval_mlperf_accuracy.py index f9c1e38f9f..818dd3b444 100644 --- a/language/gpt-oss-120b/eval_mlperf_accuracy.py +++ b/language/gpt-oss-120b/eval_mlperf_accuracy.py @@ -43,35 +43,31 @@ ) logger = logging.getLogger(__name__) +# Hardcoded repeats per dataset for final score calculation +# Final score = sum(dataset_correct / dataset_repeats) +DATASET_REPEATS = { + 'aime25': 8, + 'gpqa_diamond': 5, + 'livecodebench_v6': 3, +} -def load_mlperf_log(log_path: str) -> Tuple[List[Dict[str, Any]], int]: + +def load_mlperf_log(log_path: str) -> List[Dict[str, Any]]: """Load MLPerf accuracy JSON log. 
Args: log_path: Path to mlperf_log_accuracy.json Returns: - Tuple of (log_data, num_repeats) - - log_data: List of log entries with seq_id, qsl_idx, repeat_idx, data (hex), token_count - - num_repeats: Number of repeats per sample (pass@k value) + List of log entries with seq_id, qsl_idx, data (hex), token_count """ logger.info(f"Loading MLPerf log from {log_path}") with open(log_path, 'r') as f: log_data = json.load(f) - # Detect number of repeats from repeat_idx field - max_repeat_idx = 0 - for entry in log_data: - repeat_idx = entry.get('repeat_idx', 0) - max_repeat_idx = max(max_repeat_idx, repeat_idx) - - num_repeats = max_repeat_idx + 1 - logger.info(f"Loaded {len(log_data)} log entries") - logger.info( - f"Detected repeats_per_sample = {num_repeats} (pass@{num_repeats} format)") - return log_data, num_repeats + return log_data def decode_hex_to_tokens(hex_data: str) -> List[int]: @@ -140,7 +136,6 @@ def process_livecodebench_batch( for entry in tqdm(entries, desc=f"Parsing {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] - repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] ref_row = reference_df.iloc[qsl_idx] @@ -156,7 +151,6 @@ def process_livecodebench_batch( entry_metadata.append({ 'entry': entry, 'qsl_idx': qsl_idx, - 'repeat_idx': repeat_idx, 'ref_row': ref_row, 'token_ids': token_ids, 'model_output': model_output, @@ -210,7 +204,6 @@ def process_livecodebench_batch( for idx, metadata in enumerate(entry_metadata): entry = metadata['entry'] qsl_idx = metadata['qsl_idx'] - repeat_idx = metadata['repeat_idx'] token_ids = metadata['token_ids'] model_output = metadata['model_output'] extracted_code = metadata['extracted_code'] @@ -227,7 +220,6 @@ def process_livecodebench_batch( result = { 'seq_id': entry['seq_id'], 'qsl_idx': qsl_idx, - 'repeat_idx': repeat_idx, 'dataset': dataset_name, 'is_correct': is_correct, 'extracted_answer': str(extracted_code)[:200] if extracted_code is not None else None, @@ -242,7 +234,6 @@ def process_livecodebench_batch( if args.save_outputs: output_record = { 'qsl_idx': qsl_idx, - 'repeat_idx': repeat_idx, 'seq_id': entry['seq_id'], 'dataset': dataset_name, 'ground_truth': ground_truth, @@ -352,7 +343,7 @@ def main(): logging.getLogger().setLevel(logging.DEBUG) # Load MLPerf log - mlperf_log, num_repeats = load_mlperf_log(args.mlperf_log) + mlperf_log = load_mlperf_log(args.mlperf_log) # Load reference data logger.info(f"Loading reference data from {args.reference_data}") @@ -391,8 +382,6 @@ def main(): if missing_columns: raise ValueError( f"Reference data missing required columns: {missing_columns}") - logger.info( - f"Evaluation mode: pass@{num_repeats}" if num_repeats > 1 else "Evaluation mode: single-pass") # Log unique datasets in reference data if 'dataset' in reference_df.columns: @@ -429,16 +418,12 @@ def main(): logger.info(f"Loading tokenizer: {args.tokenizer}") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - # Group MLPerf log entries by dataset and track repeats + # Group MLPerf log entries by dataset logger.info("Grouping MLPerf log entries by dataset...") dataset_entries = defaultdict(list) - # Track unique (qsl_idx, repeat_idx) pairs to verify coverage - qsl_repeat_pairs = set() - for entry in mlperf_log: qsl_idx = entry['qsl_idx'] - repeat_idx = entry.get('repeat_idx', 0) if qsl_idx >= len(reference_df): logger.warning( @@ -448,16 +433,13 @@ def main(): ref_row = reference_df.iloc[qsl_idx] dataset_name = validate_dataset_name(ref_row['dataset']) 
dataset_entries[dataset_name].append(entry) - qsl_repeat_pairs.add((qsl_idx, repeat_idx)) - - # Count unique qsl_idx values - unique_qsl_indices = set(pair[0] for pair in qsl_repeat_pairs) logger.info(f"Grouped entries by dataset:") + total_entries = 0 for ds_name, entries in sorted(dataset_entries.items()): logger.info(f" {ds_name}: {len(entries)} entries") - logger.info(f"Unique samples (qsl_idx): {len(unique_qsl_indices)}") - logger.info(f"Total entries (samples × repeats): {len(qsl_repeat_pairs)}") + total_entries += len(entries) + logger.info(f"Total entries: {total_entries}") # Pre-load LiveCodeBench benchmark if needed lcb_executor = None @@ -484,16 +466,8 @@ def main(): logger.info("\nProcessing MLPerf log entries by dataset...") results = [] - # Track stats for each repeat and aggregated - dataset_stats = defaultdict(lambda: { - "per_repeat": {i: {"correct": 0, "total": 0} for i in range(num_repeats)}, - # pass@k: at least one correct - "aggregated": {"correct": 0, "total": 0}, - # pass@1 with k repeats: average correctness across repeats - "averaged": {"correct_sum": 0, "total": 0} - }) - # Track results per (qsl_idx, repeat_idx) for aggregation - qsl_results = defaultdict(lambda: {i: None for i in range(num_repeats)}) + # Track stats per dataset (simple correct/total) + dataset_stats = defaultdict(lambda: {"correct": 0, "total": 0}) outputs_data = [] # For saving detokenized outputs try: @@ -516,27 +490,17 @@ def main(): if args.save_outputs: outputs_data.extend(outputs_batch) - # Update per-repeat stats and track for aggregation + # Update stats for res in results_batch: - repeat_idx = res['repeat_idx'] - qsl_idx = res['qsl_idx'] - is_correct = res['is_correct'] - - # Track result for aggregation - qsl_results[(dataset_name, qsl_idx) - ][repeat_idx] = is_correct - - # Update per-repeat stats - dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 - if is_correct: - dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 + dataset_stats[dataset_name]["total"] += 1 + if res['is_correct']: + dataset_stats[dataset_name]["correct"] += 1 else: # Sequential evaluation for non-LCB datasets for entry in tqdm( entries, desc=f"Evaluating {dataset_name}", unit="entry"): seq_id = entry['seq_id'] qsl_idx = entry['qsl_idx'] - repeat_idx = entry.get('repeat_idx', 0) hex_data = entry['data'] ref_row = reference_df.iloc[qsl_idx] @@ -553,7 +517,7 @@ def main(): ) except Exception as e: logger.warning( - f"Evaluation error for qsl_idx={qsl_idx}, repeat={repeat_idx}, dataset={dataset_name}: {e}") + f"Evaluation error for qsl_idx={qsl_idx}, dataset={dataset_name}: {e}") is_correct = False extracted = None eval_details = f"Evaluation error: {e}" @@ -562,7 +526,6 @@ def main(): result = { 'seq_id': seq_id, 'qsl_idx': qsl_idx, - 'repeat_idx': repeat_idx, 'dataset': dataset_name, 'is_correct': is_correct, 'extracted_answer': str(extracted) if extracted is not None else None, @@ -577,7 +540,6 @@ def main(): if args.save_outputs: output_record = { 'qsl_idx': qsl_idx, - 'repeat_idx': repeat_idx, 'seq_id': seq_id, 'dataset': dataset_name, 'ground_truth': ground_truth, @@ -589,14 +551,10 @@ def main(): } outputs_data.append(output_record) - # Track result for this (qsl_idx, repeat_idx) - qsl_results[(dataset_name, qsl_idx) - ][repeat_idx] = is_correct - - # Update per-repeat stats - dataset_stats[dataset_name]["per_repeat"][repeat_idx]["total"] += 1 + # Update stats + dataset_stats[dataset_name]["total"] += 1 if is_correct: - 
dataset_stats[dataset_name]["per_repeat"][repeat_idx]["correct"] += 1 + dataset_stats[dataset_name]["correct"] += 1 finally: # Clean up LiveCodeBench executor @@ -605,124 +563,72 @@ def main(): lcb_executor.shutdown(wait=True) os.environ.pop('TQDM_DISABLE', None) - # Aggregate results across repeats (pass@k logic) - logger.info("\nAggregating results across repeats...") - for (dataset_name, qsl_idx), repeat_results in qsl_results.items(): - # Check if ANY repeat is correct (pass@k) - is_pass_k_correct = any( - result for result in repeat_results.values() if result is not None and result - ) - - # Update aggregated stats (pass@k) - dataset_stats[dataset_name]["aggregated"]["total"] += 1 - if is_pass_k_correct: - dataset_stats[dataset_name]["aggregated"]["correct"] += 1 - - # Update averaged stats (pass@1 with k repeats) - correct_count = sum( - 1 for result in repeat_results.values() if result is not None and result) - dataset_stats[dataset_name]["averaged"]["correct_sum"] += correct_count - dataset_stats[dataset_name]["averaged"]["total"] += 1 - - # Calculate overall stats (aggregated pass@k if num_repeats > 1, else - # per-repeat[0]) - if num_repeats > 1: - total_correct = sum(stats["aggregated"]["correct"] - for stats in dataset_stats.values()) - total_samples = sum(stats["aggregated"]["total"] - for stats in dataset_stats.values()) - - # Calculate overall pass@1 with k repeats accuracy - total_averaged_correct_sum = sum(stats["averaged"]["correct_sum"] - for stats in dataset_stats.values()) - total_averaged_samples = sum(stats["averaged"]["total"] - for stats in dataset_stats.values()) - overall_averaged_accuracy = ( - total_averaged_correct_sum / - (total_averaged_samples * num_repeats) * 100 - ) if total_averaged_samples > 0 else 0.0 - else: - total_correct = sum(stats["per_repeat"][0]["correct"] - for stats in dataset_stats.values()) - total_samples = sum(stats["per_repeat"][0]["total"] - for stats in dataset_stats.values()) - - overall_accuracy = ( - total_correct / - total_samples * - 100) if total_samples > 0 else 0.0 + # Calculate per-dataset scores and final score + # Final score = sum(dataset_correct / dataset_repeats) + logger.info("\nCalculating final scores...") + + total_correct = sum(stats["correct"] for stats in dataset_stats.values()) + total_samples = sum(stats["total"] for stats in dataset_stats.values()) + overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 + + # Calculate weighted final score + final_score = 0.0 + final_score_components = {} + for dataset_name, stats in dataset_stats.items(): + repeats = DATASET_REPEATS.get(dataset_name, 1) + component_score = stats["correct"] / repeats + final_score += component_score + final_score_components[dataset_name] = { + "correct": stats["correct"], + "total": stats["total"], + "repeats": repeats, + "component_score": component_score + } # Print results print("\n" + "=" * 80) print("MLPerf Accuracy Evaluation Results") print("=" * 80) - print(f"Evaluation mode: pass@{num_repeats}" if num_repeats > - 1 else "Evaluation mode: single-pass") - print(f"Total unique samples: {total_samples}") - if num_repeats > 1: - print( - f"Overall pass@{num_repeats} accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") - total_score = total_averaged_correct_sum / num_repeats - print( - f"Overall pass@1 with {num_repeats} repeats: {overall_averaged_accuracy:.2f}% ({total_score:.1f}/{total_averaged_samples})") - else: - print( - f"Overall accuracy: {overall_accuracy:.2f}% 
({total_correct}/{total_samples})") + print(f"Total samples evaluated: {total_samples}") + print(f"Overall raw accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) - if num_repeats > 1: - print("\nPer-Dataset pass@k Results (aggregated):") - print("-" * 80) - for dataset_name in sorted(dataset_stats.keys()): - stats = dataset_stats[dataset_name]["aggregated"] - if stats["total"] > 0: - accuracy = (stats["correct"] / stats["total"] * 100) - print( - f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") - - print("\nPer-Dataset pass@1 with k repeats Results:") - print("-" * 80) - for dataset_name in sorted(dataset_stats.keys()): - stats = dataset_stats[dataset_name]["averaged"] - if stats["total"] > 0: - accuracy = (stats["correct_sum"] / - (stats["total"] * num_repeats) * 100) - total_score = stats["correct_sum"] / num_repeats - print( - f"{dataset_name:20s}: {accuracy:6.2f}% ({total_score:7.1f}/{stats['total']:4d})") - - print("\n" + "=" * 80) - print("Per-Dataset, Per-Repeat Breakdown:") - print("-" * 80) - for dataset_name in sorted(dataset_stats.keys()): - print(f"\n{dataset_name}:") - for repeat_idx in range(num_repeats): - stats = dataset_stats[dataset_name]["per_repeat"][repeat_idx] - if stats["total"] > 0: - accuracy = (stats["correct"] / stats["total"] * 100) - print( - f" Repeat {repeat_idx}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") - else: - print("\nPer-Dataset Breakdown:") - print("-" * 80) - for dataset_name in sorted(dataset_stats.keys()): - stats = dataset_stats[dataset_name]["per_repeat"][0] - if stats["total"] > 0: - accuracy = (stats["correct"] / stats["total"] * 100) - print( - f"{dataset_name:20s}: {accuracy:6.2f}% ({stats['correct']:4d}/{stats['total']:4d})") + print("\nPer-Dataset Breakdown:") + print("-" * 80) + print(f"{'Dataset':25s} {'Correct':>8s} {'Total':>8s} {'Repeats':>8s} {'Score':>10s} {'Accuracy':>10s}") + print("-" * 80) + for dataset_name in sorted(dataset_stats.keys()): + stats = dataset_stats[dataset_name] + if stats["total"] > 0: + accuracy = (stats["correct"] / stats["total"] * 100) + repeats = DATASET_REPEATS.get(dataset_name, 1) + component_score = stats["correct"] / repeats + print( + f"{dataset_name:25s} {stats['correct']:8d} {stats['total']:8d} {repeats:8d} {component_score:10.2f} {accuracy:9.2f}%") print("=" * 80) + print(f"\nFinal Score Calculation:") + print("-" * 80) + score_parts = [] + value_parts = [] + result_parts = [] + for dataset_name in sorted(final_score_components.keys()): + comp = final_score_components[dataset_name] + score_parts.append(f"{dataset_name}/{comp['repeats']}") + value_parts.append(f"{comp['correct']}/{comp['repeats']}") + result_parts.append(f"{comp['component_score']:.2f}") + print(f"Formula: {' + '.join(score_parts)}") + print(f" = {' + '.join(value_parts)}") + print(f" = {' + '.join(result_parts)}") + print(f"\nFINAL SCORE: {final_score:.2f}") + print("=" * 80) # Save detokenized outputs to pickle if requested if args.save_outputs: logger.info(f"Saving detokenized outputs to {args.save_outputs}...") - # Sort by (qsl_idx, repeat_idx) for ordered output - outputs_data_sorted = sorted( - outputs_data, key=lambda x: ( - x['qsl_idx'], x.get( - 'repeat_idx', 0))) + # Sort by qsl_idx for ordered output + outputs_data_sorted = sorted(outputs_data, key=lambda x: x['qsl_idx']) # Convert to DataFrame for easier inspection outputs_df = pd.DataFrame(outputs_data_sorted) @@ -742,55 +648,25 @@ def main(): # Build per-dataset stats 
per_dataset_stats = {} for dataset_name, stats in dataset_stats.items(): - if num_repeats > 1: - # Aggregated pass@k stats - agg_stats = stats["aggregated"] - avg_stats = stats["averaged"] - per_dataset_stats[dataset_name] = { - "pass_k": num_repeats, - "aggregated": { - "correct": agg_stats["correct"], - "total": agg_stats["total"], - "accuracy": (agg_stats["correct"] / agg_stats["total"] * 100) if agg_stats["total"] > 0 else 0.0 - }, - "averaged": { - "score": avg_stats["correct_sum"] / num_repeats, - "total": avg_stats["total"], - "accuracy": (avg_stats["correct_sum"] / (avg_stats["total"] * num_repeats) * 100) if avg_stats["total"] > 0 else 0.0 - }, - "per_repeat": {} - } - # Add per-repeat stats - for repeat_idx in range(num_repeats): - repeat_stats = stats["per_repeat"][repeat_idx] - if repeat_stats["total"] > 0: - per_dataset_stats[dataset_name]["per_repeat"][repeat_idx] = { - "correct": repeat_stats["correct"], - "total": repeat_stats["total"], - "accuracy": (repeat_stats["correct"] / repeat_stats["total"] * 100) - } - else: - # Single-pass stats - single_stats = stats["per_repeat"][0] - per_dataset_stats[dataset_name] = { - "correct": single_stats["correct"], - "total": single_stats["total"], - "accuracy": (single_stats["correct"] / single_stats["total"] * 100) if single_stats["total"] > 0 else 0.0 - } + repeats = DATASET_REPEATS.get(dataset_name, 1) + component_score = stats["correct"] / repeats + per_dataset_stats[dataset_name] = { + "correct": stats["correct"], + "total": stats["total"], + "accuracy": (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0, + "repeats": repeats, + "component_score": component_score + } summary = { - "num_repeats": num_repeats, "total_samples": total_samples, "total_correct": total_correct, "overall_accuracy": overall_accuracy, + "final_score": final_score, + "dataset_repeats": DATASET_REPEATS, "per_dataset": per_dataset_stats } - # Add averaged metrics if num_repeats > 1 - if num_repeats > 1: - summary["overall_averaged_score"] = total_averaged_correct_sum / num_repeats - summary["overall_averaged_accuracy"] = overall_averaged_accuracy - output_data = { "summary": summary, "detailed_results": results if args.verbose else None diff --git a/language/gpt-oss-120b/preprocess/harmonize_inputs.py b/language/gpt-oss-120b/preprocess/harmonize_inputs.py deleted file mode 100644 index 8ec1223f93..0000000000 --- a/language/gpt-oss-120b/preprocess/harmonize_inputs.py +++ /dev/null @@ -1,536 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-Shot Prompting with OpenAI Harmony Format - -This example demonstrates how to construct and tokenize a zero/multi-shot prompt using the -openai-harmony Python package for math500, aime1983, livecodebench, mmlu, gpqa with gpt-oss model series. -""" - -import argparse -import json -import os -import pandas as pd -from multiprocessing import Pool, cpu_count -from tqdm import tqdm -from transformers import AutoTokenizer -from openai_harmony import ( - load_harmony_encoding, - HarmonyEncodingName, - Role, - Message, - Conversation, - SystemContent, - DeveloperContent, - ReasoningEffort -) - -MOD_PROMPT = "Do not repeat steps and output the final answer immediately once you have it. Once you have a candidate answer, do not spend more than ~100 tokens to verify it - instead, do a quick check and answer immediately. Avoid thinking for a long time. It is important to answer in as few tokens as possible." 
- - -def _create_base_messages(reasoning_effort, instructions): - """ - Creates system and developer messages for a conversation. - - Args: - reasoning_effort: ReasoningEffort enum value - instructions: String containing developer instructions - - Returns: - list: List containing system and developer messages - """ - # Create the system message with required channels - system_message = ( - SystemContent.new() - .with_reasoning_effort(reasoning_effort) - .with_conversation_start_date("2025-09-30") - .with_required_channels(["analysis", "commentary", "final"]) - ) - - # Create the developer message with instructions - developer_message = DeveloperContent.new().with_instructions(instructions) - - return [ - Message.from_role_and_content(Role.SYSTEM, system_message), - Message.from_role_and_content(Role.DEVELOPER, developer_message), - ] - - -def _add_multishot_examples(messages, examples): - """ - Adds multi-shot examples to a message list. - - Args: - messages: List of messages to append examples to - examples: List of tuples (user_content, assistant_content) representing examples - - Returns: - list: Updated messages list with examples added - """ - for user_content, assistant_content in examples: - messages.append(Message.from_role_and_content(Role.USER, user_content)) - messages.append( - Message.from_role_and_content( - Role.ASSISTANT, assistant_content).with_channel("final") - ) - return messages - - -def _finalize_conversation(messages, user_query=None): - """ - Adds the user query, creates the conversation, and renders tokens. - - Args: - messages: List of messages (system, developer, and optionally examples) - user_query: The actual user query to solve - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - # Load the Harmony encoding for gpt-oss models - enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) - - # Add the actual problem to solve - if user_query is not None: - messages.append(Message.from_role_and_content(Role.USER, user_query)) - # Create the conversation - convo = Conversation.from_messages(messages) - - # Render the conversation for completion (ready to send to the model) - tokens = enc.render_conversation_for_completion(convo, Role.ASSISTANT) - - return convo, tokens - - -def create_math500_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a multi-shot prompt for mathematical problem solving using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are a math expert that solves problems step-by-step. " - "Always show your work clearly and put your final answer in \\boxed{answer} format. " - f"{MOD_PROMPT} " - "Follow the format shown in the examples below. " - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - # Define multi-shot examples - examples = [ - # Example 1: Square areas and side lengths - ( - "The areas of three squares are 16, 49 and 169. 
What is the average (mean) of their side lengths?", - "Since the areas of the three squares are 16, 49 and 169, then their side lengths are $\\sqrt{16}=4$, $\\sqrt{49}=7$ and $\\sqrt{169}=13$, respectively.\n\nThus, the average of their side lengths is $$\\frac{4+7+13}{3}=\\boxed{8}.$$" - ), - # Example 2: Floor function equation - ( - "Find all $x$ such that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor x + 2 \\rfloor.$", - "Observe that $\\lfloor 2x \\rfloor$ is an integer, so it follows that $\\lfloor \\lfloor 2x \\rfloor - 1/2 \\rfloor = \\lfloor 2x \\rfloor - 1$. Also, $\\lfloor x + 2 \\rfloor = \\lfloor x \\rfloor + 2$. Thus, our equation becomes $$\\lfloor 2x \\rfloor = \\lfloor x \\rfloor + 3.$$Let $n = \\lfloor x \\rfloor,$ so $n \\le x < n + 1.$\n\nIf $x < n + \\frac{1}{2},$ then $2n \\le x < 2n + 1,$ so $\\lfloor 2x \\rfloor = 2n,$ and\n\\[2n = n + 3,\\]which means $n = 3.$\n\nIf $x \\ge n + \\frac{1}{2},$ then $2n + 1 \\le x < 2n + 2,$ so $\\lfloor 2x \\rfloor = 2n + 1,$ and\n\\[2n + 1 = n + 3,\\]which means $n = 2.$\n\nTherefore, the set of solutions is $x \\in \\boxed{\\left[ \\frac{5}{2}, \\frac{7}{2} \\right)}.$" - ), - # Example 3: Sequences and differences - ( - "Sequence $A$ is a geometric sequence. Sequence $B$ is an arithmetic sequence. Each sequence stops as soon as one of its terms is greater than $300.$ What is the least positive difference between a number selected from sequence $A$ and a number selected from sequence $B?$\n\n$\\bullet$ Sequence $A:$ $2,$ $4,$ $8,$ $16,$ $32,$ $\\ldots$\n\n$\\bullet$ Sequence $B:$ $20,$ $40,$ $60,$ $80,$ $100,$ $\\ldots$", - "The terms of sequence $A$ are $2,$ $4,$ $8,$ $16,$ $32,$ $64,$ $128,$ $256,$ $512.$ The terms of sequence $B$ start from $20$ and go up by $20$ each time, so sequence $B$ is precisely all multiples of $20$ from $20$ to $320.$ We thus need to see which term in sequence $A$ is closest to a multiple of $20.$ $16,$ $64,$ and $256$ are the closest, each being $4$ away from a multiple of $20.$ So the least positive difference between a term in sequence $A$ and one in sequence $B$ is $\\boxed{4}.$" - ), - # Example 4: Probability and Deal or No Deal - ( - "In the game Deal or No Deal, participants choose a box at random from a set of $26,$ one containing each of the following values: \\begin{tabular}{|c|c|}\\hline\\$.01&\\$1,000\\\\\\hline\\$1&\\$5,000\\\\\\hline\\$5&\\$10,000\\\\\\hline\\$10&\\$25,000\\\\\\hline\\$25&\\$50,000\\\\\\hline\\$50&\\$75,000\\\\\\hline\\$75&\\$100,000\\\\\\hline\\$100&\\$200,000\\\\\\hline\\$200&\\$300,000\\\\\\hline\\$300&\\$400,000\\\\\\hline\\$400&\\$500,000\\\\\\hline\\$500&\\$750,000\\\\\\hline\\$750&\\$1,000,000\\\\\\hline\\end{tabular} After choosing a box, participants eliminate other boxes by opening them, showing the amount of money in the box to the crowd, and then removing that box (and its money!) from the game. What is the minimum number of boxes a participant needs to eliminate in order to have a half chance of holding at least $\\$100,\\!000$ as his or her chosen box?", - "Seven of the boxes contain at least this amount. If a participant is going to be holding one of these boxes with a probability of $1/2,$ there can be at most $7$ other boxes left. This means that at least $26-7-7=\\boxed{12}$ boxes must be eliminated." 
- ), - # Example 5: Domain of composite function - ( - "Find the domain of the function $f(x) = \\tan(\\arccos(x^2)).$", - "For $\\arccos (x^2)$ to be defined, we must have $-1 \\le x^2 \\le 1,$ which is satisfied only for $-1 \\le x \\le 1.$ Then $\\arccos (x^2)$ will always return an angle between 0 and $\\frac{\\pi}{2}.$ Then $\\tan (\\arccos(x^2))$ is defined, unless $\\arccos(x^2) = \\frac{\\pi}{2}.$ This occurs only when $x = 0.$\n\nTherefore, the domain of $f(x)$ is $\\boxed{[-1,0) \\cup (0,1]}.$" - ), - ] - - _add_multishot_examples(messages, examples) - - return _finalize_conversation(messages, user_query) - - -def create_aime1983_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a zero-shot prompt for mathematical problem solving using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are a math expert that solves problems step-by-step. " - f"{MOD_PROMPT} " - "The final line of your response should contain the final answer as an integer enclosed in \\boxed{answer}." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - return _finalize_conversation(messages, user_query) - - -def create_livecodebench_prompt( - user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a zero-shot prompt for livecodebench problem solving using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are a python coding expert that solves problems step-by-step. " - "You must provide the reasoning to arriving at your solution and the code to solve the problem." - f"{MOD_PROMPT} " - "The code should be enclosed within ```python delimiters." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - return _finalize_conversation(messages, user_query) - - -def create_mmlu_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a multi-shot prompt for multiple choice question answering using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are an expert test-taker that answers multiple choice questions accurately. " - f"{MOD_PROMPT} " - "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - # Define multi-shot examples - examples = [ - # Example 1: Abstract Algebra - Ring Theory - ( - "The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements.\nFind the characteristic of the ring 2Z.\nA) 0\nB) 30\nC) 3\nD) 10\nE) 12\nF) 50\nG) 2\nH) 100\nI) 20\nJ) 5", - "Answer: A" - ), - # Example 2: Linear Algebra - Transformations - ( - "Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\nA) ST + TS is the identity map of V onto itself.\nB) TS = 0\nC) ST = 1\nD) ST - TS = 0\nE) ST = T\nF) ST = 0\nG) ST = TS\nH) ST - TS is the identity map of V onto itself.\nI) TS = T\nJ) ST = S", - "Answer: H" - ), - # Example 3: Number Theory - Diophantine Equations - ( - "Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. 
What is the greatest negative number in the set B = {m + n : (m, n) ∈ A}?\nA) -5\nB) 0\nC) -3\nD) -7\nE) -4\nF) -6\nG) -1\nH) -2\nI) -9\nJ) N/A", - "Answer: E" - ), - # Example 4: Differential Equations - Salt Tank Problem - ( - "A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\nA) 3 + e^-2\nB) 2 - e^-4\nC) 2 - e^-2\nD) 3 + e^-4\nE) 2 + e^-3\nF) 2 - e^-3\nG) 3 - e^-2\nH) 2 + e^-2\nI) 2 + e^-4\nJ) 2", - "Answer: I" - ), - # Example 5: Basic Arithmetic - Division - ( - "A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\nA) Multiply 5 by 5 to find 25 teams.\nB) Divide 30 by 5 to find 6 teams.\nC) Add 5 to 30 to find 35 teams.\nD) Subtract 30 from 5 to find -25 teams.\nE) Divide 5 by 30 to find 0.1667 teams.\nF) Add 5 to 30 then divide by 2 to find 17.5 teams.\nG) N/A\nH) N/A\nI) N/A\nJ) N/A", - "Answer: B" - ), - ] - - _add_multishot_examples(messages, examples) - - return _finalize_conversation(messages, user_query) - - -def create_gpqa_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a multi-shot prompt for Biology, Physics and Chemistry question answering using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are an expert in Biology, Physics and Chemistry who answers scientific questions accurately. " - f"{MOD_PROMPT} " - "After your reasoning, provide your final answer on a new line in the format: 'Answer: X' where X is the letter choice." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - # Define multi-shot examples - examples = [ - # Example 1: Molecular Biology - Gene Therapy - ( - "A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\nA) polyA tail\nB) lariat\nC) antisense\nD) R-loops", - "The text describes the dystrophin gene and the FDA-approved oligonucleotide therapy that causes exon skipping by creating a functional, albeit shorter, dystrophin protein. Morpholino is bound to the pre-mRNA in an antisense orientation. Every splicing mechanism creates the lariat molecule that is circular with a 3' tail and soon degraded. The spliced RNA is polyadenylated at the 3' end. 
R-loops are triple helix of DNA and the pre-mRNA and a consequence of the RNA transcription, not splicing and RNA maturation.\n\nAnswer: D" - ), - # Example 2: Stereochemistry - Optical Activity - ( - "How many of the following compounds exhibit optical activity?\n1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene\n2,3,3,3-tetrafluoroprop-1-ene\ndi(cyclohex-2-en-1-ylidene)methane\n5-(5-methylhexan-2-ylidene)cyclopenta-1,3-diene\n3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene\n[1,1'-biphenyl]-3,3'-diol\n8,8-dichlorobicyclo[4.2.0]octan-7-one\ncyclopent-2-en-1-one\nA) 6\nB) 5\nC) 4\nD) 3", - "The compounds 1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene, 3-(2-methylbut-1-en-1-ylidene)cyclohex-1-ene, di(cyclohex-2-en-1-ylidene)methane, and 8,8-dichlorobicyclo[4.2.0]octan-7-one are chiral molecules and thus will be optically active.\n\nAnswer: C" - ), - ] - - _add_multishot_examples(messages, examples) - - return _finalize_conversation(messages, user_query) - - -def create_healthbench_prompt(prompt, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a zero-shot prompt for healthbench problem solving using Harmony format. - prompt: list of messages (message: Dict[str, str] (content, role)) giving conversation history - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "Your task is to answer the following health-related query as a responsible assistant." - "Provide responses that are factually accurate, up-to-date, and consistent with current clinical guidelines. Adjust the detail, terminology, and tone based on the user's apparent background (layperson or healthcare professional)." - "If the evidence is limited or uncertain, clearly acknowledge this in your answer. When essential information is missing for a safe, thorough response, ask clarifying questions before proceeding." - "Do not provide specific medical diagnoses or emergency recommendations unless explicitly appropriate and safe to do so." - "All responses should promote user safety, accuracy, completeness, clear communication, and adherence to user instructions while maintaining clinical standards." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - role_map = { - "user": Role.USER, - "assistant": Role.ASSISTANT - } - - for message in prompt: - role = message["role"] - assert role in role_map, f"Unknown role: {role}" - role = role_map[role] - - content = message["content"] - messages.append(Message.from_role_and_content(role, content)) - - return _finalize_conversation(messages, None) - - -def create_arxiv_summarization_prompt( - user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a zero-shot prompt for arXiv paper summarization using Harmony format. - - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are an expert at reading and summarizing academic research papers. " - "Your task is to provide clear and comprehensive summaries. " - "Cover the research purpose, methods, main findings, and significance. " - "Be thorough but avoid unnecessary details. Ignore citations and references." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - return _finalize_conversation(messages, user_query) - - -def create_default_prompt(user_query, reasoning_effort=ReasoningEffort.HIGH): - """ - Creates a default zero-shot prompt for general problem solving using Harmony format. - This is used when no specific dataset is specified. 
- - Returns: - tuple: (conversation_object, token_list) ready for model completion - """ - instructions = ( - "You are a helpful AI assistant that solves user questions. " - "Provide a well-structured answer to the user's question." - ) - - messages = _create_base_messages(reasoning_effort, instructions) - - return _finalize_conversation(messages, user_query) - - -def process_row(args): - """ - Worker function to process a single row from the dataframe. - - Args: - args: tuple of (index, row, dataset_function_map, reasoning_effort) - - Returns: - tuple: (index, convo, tokens, dataset_name) or (index, None, None, dataset_name, error) - """ - index, row, dataset_function_map, reasoning_effort = args - - # Check if dataset column exists, use default if not - if "dataset" in row: - dataset_name = row["dataset"] - else: - dataset_name = "default" - - if dataset_name == "healthbench": - user_query = row["prompt"] - else: - # Try to get question from common column names - if "question" in row: - user_query = row["question"] - elif "prompt" in row: - user_query = row["prompt"] - elif "query" in row: - user_query = row["query"] - else: - error_msg = f"No query column found (tried: question, prompt, query) at index {index}" - return (index, None, None, dataset_name, error_msg) - - try: - # Get the appropriate function based on dataset type - if dataset_name in dataset_function_map: - create_prompt_func = dataset_function_map[dataset_name] - convo, tokens = create_prompt_func(user_query, reasoning_effort) - return (index, convo, tokens, dataset_name) - else: - error_msg = f"Unknown dataset '{dataset_name}' at index {index}" - return (index, None, None, dataset_name, error_msg) - except Exception as e: - error_msg = f"Error processing {dataset_name} at index {index}: {str(e)}" - return (index, None, None, dataset_name, error_msg) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--data-file", - type=str, - default="/home/mlperf_inference_storage/data/deepseek-r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl") - parser.add_argument("--num-processes", type=int, default=cpu_count(), - help="Number of processes to use for parallel processing (default: number of CPU cores)") - parser.add_argument("--max-rows", type=int, default=None, - help="Maximum number of rows to process (default: process all rows)") - parser.add_argument("--output-file", type=str, required=True, - help="Output pickle file path to save the processed data as pandas DataFrame") - parser.add_argument("--reasoning-effort", type=str, default="high", - help="Reasoning effort to use for the prompt") - parser.add_argument("--dataset", type=str, default=None, - help="Filter to only process rows from this dataset (e.g., 'gpqa', 'mmlu_pro', 'math500', 'aime1983', 'livecodebench')") - args = parser.parse_args() - df = pd.read_pickle(args.data_file) - - reasoning_effort = { - "high": ReasoningEffort.HIGH, - "medium": ReasoningEffort.MEDIUM, - "low": ReasoningEffort.LOW - }[args.reasoning_effort.lower()] - - # Filter by dataset if specified - if args.dataset is not None: - if 'dataset' not in df.columns: - print( - f"WARNING: No 'dataset' column found in dataframe. 
Cannot filter by dataset.") - print(f"All rows will be processed using the default prompt function.") - else: - original_len = len(df) - available_datasets = sorted(df['dataset'].unique().tolist()) - df = df[df['dataset'] == args.dataset].copy() - print( - f"Filtered to dataset '{args.dataset}': {len(df)} rows (from {original_len} total)") - if len(df) == 0: - print(f"ERROR: No rows found for dataset '{args.dataset}'") - print(f"Available datasets: {available_datasets}") - import sys - sys.exit(1) - - # Apply row limit if specified - if args.max_rows is not None: - df = df.head(args.max_rows) - print(f"Limited to first {args.max_rows} rows") - - # Create mapping from dataset names to prompt creation functions - dataset_function_map = { - 'aime1983': create_aime1983_prompt, - 'aime2025': create_aime1983_prompt, - 'arxiv_summarization': create_arxiv_summarization_prompt, - 'pubmed_summarization': create_arxiv_summarization_prompt, - 'gpqa': create_gpqa_prompt, - 'livecodebench': create_livecodebench_prompt, - 'math500': create_math500_prompt, - 'mmlu_pro': create_mmlu_prompt, - 'mmlu': create_mmlu_prompt, - 'healthbench': create_healthbench_prompt, - 'default': create_default_prompt, - } - - # Prepare data for parallel processing - process_args = [(index, row, dataset_function_map, reasoning_effort) - for index, row in df.iterrows()] - - # Don't use more processes than we have rows - num_processes = min(args.num_processes, len(df)) - - print(f"Processing {len(df)} queries using {num_processes} processes...") - - # Process rows in parallel with progress bar - with Pool(processes=num_processes) as pool: - results = list(tqdm( - pool.imap(process_row, process_args), - total=len(process_args), - desc="Processing queries", - unit="query" - )) - - # Sort results by index to preserve original order - results.sort(key=lambda x: x[0]) - - # Ensure output directory exists - output_dir = os.path.dirname(args.output_file) - if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Process results and modify original DataFrame - successful_count = 0 - error_count = 0 - - # Load tokenizer for decoding tokens to text - print("Loading tokenizer...") - tokenizer = AutoTokenizer.from_pretrained( - "openai/gpt-oss-120b", trust_remote_code=True) - print("Tokenizer loaded successfully") - - # Initialize columns for harmony tokenized input - df['tok_input'] = None - df['tok_input_len'] = None - df['text_input'] = None - - for result in tqdm(results, desc="Processing results", unit="entry"): - if len(result) == 5: # Error case - index, convo, tokens, dataset_name, error_msg = result - error_count += 1 - - # Update the original DataFrame with error data - df.at[index, 'tok_input'] = None - df.at[index, 'tok_input_len'] = None - df.at[index, 'text_input'] = None - else: # Success case - index, convo, tokens, dataset_name = result - successful_count += 1 - - # Decode tokens to text using the tokenizer - tokens_list = tokens.tolist() if hasattr(tokens, 'tolist') else list(tokens) - text_input = tokenizer.decode( - tokens_list, skip_special_tokens=False) - - # Update the original DataFrame with successful data - df.at[index, 'tok_input'] = tokens_list - df.at[index, 'tok_input_len'] = len(tokens_list) - df.at[index, 'text_input'] = text_input - - # Verify input and output have identical column lists - input_columns = list(pd.read_pickle(args.data_file).columns) - output_columns = list(df.columns) - print(f"Input columns: {input_columns}") - print(f"Output columns: {output_columns}") - - # Save 
the modified DataFrame as pickle - print("Saving modified DataFrame to pickle...") - df.to_pickle(args.output_file) - - print(f"\nProcessing completed:") - print(f" - Successfully processed: {successful_count} queries") - print(f" - Errors: {error_count} queries") - print(f" - Output saved to: {args.output_file} (pickle format)") - print(f" - Total processed: {successful_count + error_count} queries") - print(f" - Modified DataFrame shape: {df.shape}") - print(f" - Updated columns: tok_input, tok_input_len, text_input") From de831b3dbaa42fab2eb29e0d319c8b72753e58b7 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Mon, 15 Dec 2025 18:13:40 +0000 Subject: [PATCH 153/165] add percentage --- language/gpt-oss-120b/eval_mlperf_accuracy.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss-120b/eval_mlperf_accuracy.py b/language/gpt-oss-120b/eval_mlperf_accuracy.py index 818dd3b444..436ce6283c 100644 --- a/language/gpt-oss-120b/eval_mlperf_accuracy.py +++ b/language/gpt-oss-120b/eval_mlperf_accuracy.py @@ -573,18 +573,24 @@ def main(): # Calculate weighted final score final_score = 0.0 + max_score = 0.0 final_score_components = {} for dataset_name, stats in dataset_stats.items(): repeats = DATASET_REPEATS.get(dataset_name, 1) component_score = stats["correct"] / repeats + max_component_score = stats["total"] / repeats final_score += component_score + max_score += max_component_score final_score_components[dataset_name] = { "correct": stats["correct"], "total": stats["total"], "repeats": repeats, - "component_score": component_score + "component_score": component_score, + "max_component_score": max_component_score } + final_score_percentage = (final_score / max_score * 100) if max_score > 0 else 0.0 + # Print results print("\n" + "=" * 80) print("MLPerf Accuracy Evaluation Results") @@ -612,15 +618,20 @@ def main(): score_parts = [] value_parts = [] result_parts = [] + max_parts = [] for dataset_name in sorted(final_score_components.keys()): comp = final_score_components[dataset_name] score_parts.append(f"{dataset_name}/{comp['repeats']}") value_parts.append(f"{comp['correct']}/{comp['repeats']}") result_parts.append(f"{comp['component_score']:.2f}") + max_parts.append(f"{comp['total']}/{comp['repeats']}") print(f"Formula: {' + '.join(score_parts)}") - print(f" = {' + '.join(value_parts)}") + print(f"Score: = {' + '.join(value_parts)}") print(f" = {' + '.join(result_parts)}") - print(f"\nFINAL SCORE: {final_score:.2f}") + print(f" = {final_score:.2f}") + print(f"Max: = {' + '.join(max_parts)}") + print(f" = {max_score:.2f}") + print(f"\nFINAL SCORE: {final_score_percentage:.2f}% ({final_score:.2f}/{max_score:.2f})") print("=" * 80) # Save detokenized outputs to pickle if requested @@ -650,12 +661,14 @@ def main(): for dataset_name, stats in dataset_stats.items(): repeats = DATASET_REPEATS.get(dataset_name, 1) component_score = stats["correct"] / repeats + max_component_score = stats["total"] / repeats per_dataset_stats[dataset_name] = { "correct": stats["correct"], "total": stats["total"], "accuracy": (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0.0, "repeats": repeats, - "component_score": component_score + "component_score": component_score, + "max_component_score": max_component_score } summary = { @@ -663,6 +676,8 @@ def main(): "total_correct": total_correct, "overall_accuracy": overall_accuracy, "final_score": final_score, + "max_score": max_score, + "final_score_percentage": final_score_percentage, 
"dataset_repeats": DATASET_REPEATS, "per_dataset": per_dataset_stats } From 65b71ad333ff87bc135f01449228bffb66e3a948 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Dec 2025 18:14:12 +0000 Subject: [PATCH 154/165] [Automated Commit] Format Codebase --- language/gpt-oss-120b/eval_mlperf_accuracy.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss-120b/eval_mlperf_accuracy.py b/language/gpt-oss-120b/eval_mlperf_accuracy.py index 436ce6283c..19e91d03c2 100644 --- a/language/gpt-oss-120b/eval_mlperf_accuracy.py +++ b/language/gpt-oss-120b/eval_mlperf_accuracy.py @@ -569,7 +569,10 @@ def main(): total_correct = sum(stats["correct"] for stats in dataset_stats.values()) total_samples = sum(stats["total"] for stats in dataset_stats.values()) - overall_accuracy = (total_correct / total_samples * 100) if total_samples > 0 else 0.0 + overall_accuracy = ( + total_correct / + total_samples * + 100) if total_samples > 0 else 0.0 # Calculate weighted final score final_score = 0.0 @@ -589,14 +592,18 @@ def main(): "max_component_score": max_component_score } - final_score_percentage = (final_score / max_score * 100) if max_score > 0 else 0.0 + final_score_percentage = ( + final_score / + max_score * + 100) if max_score > 0 else 0.0 # Print results print("\n" + "=" * 80) print("MLPerf Accuracy Evaluation Results") print("=" * 80) print(f"Total samples evaluated: {total_samples}") - print(f"Overall raw accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") + print( + f"Overall raw accuracy: {overall_accuracy:.2f}% ({total_correct}/{total_samples})") print("=" * 80) print("\nPer-Dataset Breakdown:") @@ -631,7 +638,8 @@ def main(): print(f" = {final_score:.2f}") print(f"Max: = {' + '.join(max_parts)}") print(f" = {max_score:.2f}") - print(f"\nFINAL SCORE: {final_score_percentage:.2f}% ({final_score:.2f}/{max_score:.2f})") + print( + f"\nFINAL SCORE: {final_score_percentage:.2f}% ({final_score:.2f}/{max_score:.2f})") print("=" * 80) # Save detokenized outputs to pickle if requested From 8bebe02840ced560e52d3b99ccecdca34f51c020 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Tue, 16 Dec 2025 09:48:05 -0800 Subject: [PATCH 155/165] empty commit to trigger CLA From 6346113047a2ebf2f4653902080ad3d173032b1b Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Wed, 17 Dec 2025 14:55:37 +0530 Subject: [PATCH 156/165] remove comments --- language/gpt-oss-120b/mlperf/user.conf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/language/gpt-oss-120b/mlperf/user.conf b/language/gpt-oss-120b/mlperf/user.conf index 7b123ea88b..98c3282ee2 100644 --- a/language/gpt-oss-120b/mlperf/user.conf +++ b/language/gpt-oss-120b/mlperf/user.conf @@ -1,15 +1,11 @@ gpt-oss-120b.Offline.target_qps = 10.0 gpt-oss-120b.Offline.min_duration = 60000 -# gpt-oss-120b.Offline.performance_sample_count = 6396 gpt-oss-120b.Offline.performance_sample_count = 4395 gpt-oss-120b.Server.target_qps = 10 gpt-oss-120b.Server.min_duration = 60000 -# gpt-oss-120b.Server.performance_sample_count = 6396 gpt-oss-120b.Server.performance_sample_count = 4395 gpt-oss-120b.Server.target_latency = 0 gpt-oss-120b.Server.ttft_latency = 2000 gpt-oss-120b.Server.tpot_latency = 20 - -# gpt-oss-120b.*.repeats_per_sample = 5 From 71f2a834b57ad9324fba6bfd534f3d585360162a Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:34:12 +0530 Subject: [PATCH 157/165] add gptoss placeholder 
values --- loadgen/mlperf.conf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..1bf48519a4 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -162,3 +162,10 @@ whisper.Offline.min_query_count = 1633 *.MultiStream.target_latency = 80 *.Server.target_qps = 1.0 *.Offline.target_qps = 1.0 + +# gpt-oss placeholder values +gpt-oss-120b.Offline.performance_sample_count = 4395 +gpt-oss-120b.Server.performance_sample_count = 4395 +gpt-oss-120b.Server.target_latency = 0 +gpt-oss-120b.Server.ttft_latency = 2000 +gpt-oss-120b.Server.tpot_latency = 20 From 8e83c680a25208cdc9c174bb91e21aaeb1bbae35 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Thu, 18 Dec 2025 00:04:12 +0530 Subject: [PATCH 158/165] rm gpt-oss fields --- loadgen/mlperf.conf | 7 ------- 1 file changed, 7 deletions(-) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1bf48519a4..1b825514bd 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -162,10 +162,3 @@ whisper.Offline.min_query_count = 1633 *.MultiStream.target_latency = 80 *.Server.target_qps = 1.0 *.Offline.target_qps = 1.0 - -# gpt-oss placeholder values -gpt-oss-120b.Offline.performance_sample_count = 4395 -gpt-oss-120b.Server.performance_sample_count = 4395 -gpt-oss-120b.Server.target_latency = 0 -gpt-oss-120b.Server.ttft_latency = 2000 -gpt-oss-120b.Server.tpot_latency = 20 From 9dfb9a5122b52573eca330c6e4d37f4982a0a338 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Thu, 18 Dec 2025 00:05:08 +0530 Subject: [PATCH 159/165] update user.conf --- language/gpt-oss-120b/mlperf/user.conf | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/language/gpt-oss-120b/mlperf/user.conf b/language/gpt-oss-120b/mlperf/user.conf index 98c3282ee2..27c2fe59b4 100644 --- a/language/gpt-oss-120b/mlperf/user.conf +++ b/language/gpt-oss-120b/mlperf/user.conf @@ -1,11 +1,7 @@ -gpt-oss-120b.Offline.target_qps = 10.0 -gpt-oss-120b.Offline.min_duration = 60000 -gpt-oss-120b.Offline.performance_sample_count = 4395 - -gpt-oss-120b.Server.target_qps = 10 +gpt-oss-120b.*.performance_sample_count = 4395 +gpt-oss-120b.Server.target_qps = 1 gpt-oss-120b.Server.min_duration = 60000 gpt-oss-120b.Server.performance_sample_count = 4395 - gpt-oss-120b.Server.target_latency = 0 gpt-oss-120b.Server.ttft_latency = 2000 gpt-oss-120b.Server.tpot_latency = 20 From 715f063eb349ded1f7929661647c49bd3ddee38c Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 18 Dec 2025 10:53:13 +0000 Subject: [PATCH 160/165] add generation_config.json --- language/gpt-oss-120b/generation_config.json | 46 ++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 language/gpt-oss-120b/generation_config.json diff --git a/language/gpt-oss-120b/generation_config.json b/language/gpt-oss-120b/generation_config.json new file mode 100644 index 0000000000..d7b586f927 --- /dev/null +++ b/language/gpt-oss-120b/generation_config.json @@ -0,0 +1,46 @@ +{ + "_comment": "Generation configuration for gpt-oss-120b model", + "_description": "These parameters control the text generation behavior", + + "max_new_tokens": 32768, + "temperature": 1.0, + "top_k": -1, + "top_p": 1.0, + + "_parameter_descriptions": { + "max_new_tokens": "Maximum number of tokens to generate per request (1-32768)", + "temperature": "Sampling temperature (0.0 = deterministic, higher = more random). 
Typical: 0.001-2.0", + "top_k": "Top-k sampling (number of highest probability tokens to consider). -1 = disabled", + "top_p": "Top-p/nucleus sampling (cumulative probability threshold). 0.0-1.0, typically 1.0 for no filtering", + + "_additional_params_note": "SGLang supports additional parameters like:", + "repetition_penalty": "Penalty for repeating tokens (typically 1.0-1.2)", + "frequency_penalty": "Penalty based on token frequency (0.0-2.0)", + "presence_penalty": "Penalty for tokens already present (0.0-2.0)", + "min_tokens": "Minimum tokens to generate before stopping", + "stop": "Stop sequences (list of strings)", + "ignore_eos": "Whether to ignore EOS token (boolean)" + }, + + "_presets": { + "deterministic": { + "max_new_tokens": 10240, + "temperature": 0.001, + "top_k": 1, + "top_p": 1.0 + }, + "creative": { + "max_new_tokens": 10240, + "temperature": 1.5, + "top_k": 50, + "top_p": 0.95 + }, + "balanced": { + "max_new_tokens": 10240, + "temperature": 0.7, + "top_k": 40, + "top_p": 0.9 + } + } +} + From 7319e8284fa0ccfa1ec8369c180a0ecec7100f11 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:27:15 +0530 Subject: [PATCH 161/165] add docker command --- language/gpt-oss-120b/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/language/gpt-oss-120b/README.md b/language/gpt-oss-120b/README.md index 71287b0473..60e611344b 100644 --- a/language/gpt-oss-120b/README.md +++ b/language/gpt-oss-120b/README.md @@ -31,6 +31,18 @@ Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang serve ``` The script uses `python3 -m sglang.launch_server` tp instantiate the model, with `tp=pp=ep=1`, and `dp` as specified. +You may also use docker: +```bash +docker run --runtime nvidia --gpus all --net host \ + -v ${HF_HOME}:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --ipc=host lmsysorg/sglang:latest \ + python3 -m sglang.launch_server --model-path ${MODEL_NAME} \ + --host 0.0.0.0 --port 3000 --data-parallel-size=1 --max-running-requests 512 \ + --mem-fraction-static 0.85 --chunked-prefill-size 16384 --ep-size=1 \ + --enable-metrics --stream-interval 500 +``` + Then, run a benchmark script that uses the client to send/recv requests. ### Run the inference @@ -139,4 +151,4 @@ options: Number of parallel workers for LiveCodeBench evaluation (default: 64) --verbose Verbose logging -``` \ No newline at end of file +``` From 0a95e40e1200d19b8e8fed738f27273ab2e70bbf Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 18 Dec 2025 20:52:40 +0000 Subject: [PATCH 162/165] add better parsing and check for harmony tokens --- language/gpt-oss-120b/eval_accuracy.py | 122 +++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/language/gpt-oss-120b/eval_accuracy.py b/language/gpt-oss-120b/eval_accuracy.py index fb46fdcb9b..32ecd8f9a2 100644 --- a/language/gpt-oss-120b/eval_accuracy.py +++ b/language/gpt-oss-120b/eval_accuracy.py @@ -109,33 +109,113 @@ def validate_dataset_name(dataset: Any) -> str: return str(dataset).lower() +# ============================================================================= +# Harmony Format Extraction +# ============================================================================= + + +def extract_final_section(text: str) -> str: + """Extract content from the <|channel|>final<|message|>...<|return|> section. + + The model outputs have two sections: + - <|channel|>analysis<|message|>... 
(reasoning, may have draft answers) + - <|channel|>final<|message|>... (actual final answer) + + This function extracts only the final section to avoid extracting + wrong answers from the analysis section. + + Uses a flexible regex to handle corrupted markers like: + - <|channel|>final 明<|message|> + - <|channel|>final537<|message|> + + Args: + text: Full model output text + + Returns: + Content of final section if found, otherwise returns original text + """ + text = validate_text_input(text) + if not text: + return "" + + # Flexible pattern to handle corrupted markers (allows chars between final and <|message|>) + match = re.search( + r'<\|channel\|>final[^<]*<\|message\|>(.*?)(?:<\|return\|>|$)', + text, re.DOTALL + ) + if match: + return match.group(1).strip() + + # Fallback: return original text if no final section found + return text + + +def strip_markdown_bold(text: str) -> str: + """Remove markdown bold formatting (**text**) from text. + + Args: + text: Text that may contain **bold** formatting + + Returns: + Text with bold markers removed + """ + return re.sub(r'\*\*([^*]+)\*\*', r'\1', text) + + # ============================================================================= # Answer Parsing Functions # ============================================================================= def parse_multiple_choice(text: str, max_option: str = 'D') -> Optional[str]: - """Parse multiple choice answer (A-D or A-J).""" + """Parse multiple choice answer (A-D or A-J). + + First extracts the final section from harmony-formatted outputs, + then parses the answer from that section only. + """ text = validate_text_input(text) if not text: return None + # Extract final section first (for harmony format) + final_section = extract_final_section(text) + + # Strip markdown bold formatting (**A** -> A) + final_section = strip_markdown_bold(final_section) + # Clean artifacts - if text.startswith(("['", '["')) and text.endswith(("']", '"]')): - text = text[2:-2].strip() + if final_section.startswith(("['", '["')) and final_section.endswith(("']", '"]')): + final_section = final_section[2:-2].strip() - text = text.replace(r'\n', '\n').replace(r'\'', "'") + final_section = final_section.replace(r'\n', '\n').replace(r'\'', "'") - # Find ANSWER/FINAL ANSWER pattern + # Try to extract from final section first + # Priority 1: Single letter answer at start of final section (common in harmony format) + single_letter_match = re.match( + rf'^[^a-zA-Z]*([A-{max_option}])(?:[^a-zA-Z]|$)', + final_section.strip(), re.IGNORECASE + ) + if single_letter_match: + return single_letter_match.group(1).upper() + + # Priority 2: "Answer: X" pattern in final section + answer_pattern = rf'\b(?:Answer|ANSWER)\s*[:.]?\s*([A-{max_option}])\b' + answer_match = re.search(answer_pattern, final_section, re.IGNORECASE) + if answer_match: + return answer_match.group(1).upper() + + # Priority 3: Fall back to ANSWER/FINAL ANSWER pattern in full text + # (for backwards compatibility with non-harmony outputs) + full_text = text.replace(r'\n', '\n').replace(r'\'', "'") pattern = rf"\b(?:ANSWER|FINAL\s*ANSWER)\b\s*[:=]?\s*(?:\(?\s*([A-{max_option}])\s*\)?)(?:\s*$|[^A-Za-z])" - matches = list(re.finditer(pattern, text, re.IGNORECASE)) + matches = list(re.finditer(pattern, full_text, re.IGNORECASE)) if matches: return matches[-1].group(1).upper() - # MMLU-Pro fallback: standalone letter + # MMLU-Pro fallback: standalone letter in final section if max_option == 'J': fallback_matches = list(re.finditer( - r"\b([A-J])\b", text, re.IGNORECASE)) + 
rf"\b([A-{max_option}])\b", final_section, re.IGNORECASE)) if fallback_matches: return fallback_matches[-1].group(1).upper() @@ -197,14 +277,36 @@ def parse_aime_answer(text: str) -> Optional[int]: def parse_code(text: str) -> Optional[str]: """Parse code from ```python or plain ``` code block. + First extracts the final section from harmony-formatted outputs, + then parses code from that section only. This avoids extracting + malformed code blocks from the analysis section. + Priority: - 1. Last ```python block - 2. Last plain ``` block (if it looks like Python code) + 1. Code from final section (if harmony format detected) + 2. Last ```python block from full text (fallback) + 3. Last plain ``` block from full text (fallback) """ text = validate_text_input(text) if not text: return None + # First try to extract from final section (for harmony format) + final_section = extract_final_section(text) + + # Check if we got a different final section (harmony format detected) + if final_section != text: + # Parse code from final section only + python_matches = list(re.finditer(r"```python(.*?)```", final_section, re.DOTALL)) + if python_matches: + return python_matches[-1].group(1).strip() + + plain_matches = list(re.finditer(r"```(.*?)```", final_section, re.DOTALL)) + if plain_matches: + code = plain_matches[-1].group(1).strip() + code = re.sub(r'^(?:python|py)\s*\n', '', code, flags=re.IGNORECASE) + return code + + # Fallback: search full text (for non-harmony outputs or if final section has no code) # Try ```python blocks first (most specific) python_matches = list(re.finditer(r"```python(.*?)```", text, re.DOTALL)) if python_matches: From 6fa49dcd4caeec7b5b9e3ba738e32a3c8be7ad26 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Dec 2025 21:20:19 +0000 Subject: [PATCH 163/165] [Automated Commit] Format Codebase --- language/gpt-oss-120b/eval_accuracy.py | 27 ++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/language/gpt-oss-120b/eval_accuracy.py b/language/gpt-oss-120b/eval_accuracy.py index 32ecd8f9a2..0169f43c36 100644 --- a/language/gpt-oss-120b/eval_accuracy.py +++ b/language/gpt-oss-120b/eval_accuracy.py @@ -138,7 +138,8 @@ def extract_final_section(text: str) -> str: if not text: return "" - # Flexible pattern to handle corrupted markers (allows chars between final and <|message|>) + # Flexible pattern to handle corrupted markers (allows chars between final + # and <|message|>) match = re.search( r'<\|channel\|>final[^<]*<\|message\|>(.*?)(?:<\|return\|>|$)', text, re.DOTALL @@ -183,13 +184,15 @@ def parse_multiple_choice(text: str, max_option: str = 'D') -> Optional[str]: final_section = strip_markdown_bold(final_section) # Clean artifacts - if final_section.startswith(("['", '["')) and final_section.endswith(("']", '"]')): + if final_section.startswith( + ("['", '["')) and final_section.endswith(("']", '"]')): final_section = final_section[2:-2].strip() final_section = final_section.replace(r'\n', '\n').replace(r'\'', "'") # Try to extract from final section first - # Priority 1: Single letter answer at start of final section (common in harmony format) + # Priority 1: Single letter answer at start of final section (common in + # harmony format) single_letter_match = re.match( rf'^[^a-zA-Z]*([A-{max_option}])(?:[^a-zA-Z]|$)', final_section.strip(), re.IGNORECASE @@ -296,14 +299,26 @@ def parse_code(text: str) -> Optional[str]: # Check if we got a different final section (harmony format detected) if final_section != text: # Parse 
code from final section only - python_matches = list(re.finditer(r"```python(.*?)```", final_section, re.DOTALL)) + python_matches = list( + re.finditer( + r"```python(.*?)```", + final_section, + re.DOTALL)) if python_matches: return python_matches[-1].group(1).strip() - plain_matches = list(re.finditer(r"```(.*?)```", final_section, re.DOTALL)) + plain_matches = list( + re.finditer( + r"```(.*?)```", + final_section, + re.DOTALL)) if plain_matches: code = plain_matches[-1].group(1).strip() - code = re.sub(r'^(?:python|py)\s*\n', '', code, flags=re.IGNORECASE) + code = re.sub( + r'^(?:python|py)\s*\n', + '', + code, + flags=re.IGNORECASE) return code # Fallback: search full text (for non-harmony outputs or if final section has no code) From 1950b13038216016c6df2a82b37336135190b8b7 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Thu, 18 Dec 2025 21:30:05 +0000 Subject: [PATCH 164/165] add exact_match log for submission_checker --- language/gpt-oss-120b/eval_mlperf_accuracy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/language/gpt-oss-120b/eval_mlperf_accuracy.py b/language/gpt-oss-120b/eval_mlperf_accuracy.py index 19e91d03c2..1267c49da4 100644 --- a/language/gpt-oss-120b/eval_mlperf_accuracy.py +++ b/language/gpt-oss-120b/eval_mlperf_accuracy.py @@ -642,6 +642,9 @@ def main(): f"\nFINAL SCORE: {final_score_percentage:.2f}% ({final_score:.2f}/{max_score:.2f})") print("=" * 80) + print("\n\nPrinting for submission_checker:") + print(f"\n'exact_match': {final_score}") + # Save detokenized outputs to pickle if requested if args.save_outputs: logger.info(f"Saving detokenized outputs to {args.save_outputs}...") From b286618f319a50a88ef89c00bc5093ebfe337236 Mon Sep 17 00:00:00 2001 From: Shobhit Verma Date: Fri, 19 Dec 2025 07:18:17 +0000 Subject: [PATCH 165/165] empty commit to trigger test
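
Editor's note: the harmony-aware parsing introduced in the "add better parsing and check for harmony tokens" patch can be exercised in isolation. Below is a minimal, self-contained sketch that mirrors the regexes added to `eval_accuracy.py`; the trimmed-down `parse_choice` helper and the sample model output are illustrative only, not the full functions from the patch.

```python
import re
from typing import Optional


def extract_final_section(text: str) -> str:
    # Mirrors the patch: keep only the <|channel|>final<|message|>...<|return|>
    # section, tolerating corrupted characters between "final" and "<|message|>".
    match = re.search(
        r'<\|channel\|>final[^<]*<\|message\|>(.*?)(?:<\|return\|>|$)',
        text, re.DOTALL)
    return match.group(1).strip() if match else text


def parse_choice(text: str, max_option: str = 'D') -> Optional[str]:
    # Simplified view of parse_multiple_choice(): strip **bold**, then look for
    # a bare letter at the start of the final section or an "Answer: X" pattern.
    section = re.sub(r'\*\*([^*]+)\*\*', r'\1', extract_final_section(text))
    m = re.match(rf'^[^a-zA-Z]*([A-{max_option}])(?:[^a-zA-Z]|$)',
                 section.strip(), re.IGNORECASE)
    if m:
        return m.group(1).upper()
    m = re.search(rf'\b(?:Answer|ANSWER)\s*[:.]?\s*([A-{max_option}])\b',
                  section, re.IGNORECASE)
    return m.group(1).upper() if m else None


# Hypothetical harmony-formatted output: the analysis channel mentions "B",
# but only the final channel's "C" should be extracted.
sample = ("<|channel|>analysis<|message|>Could be B... no, reconsider.<|end|>"
          "<|channel|>final<|message|>**C**<|return|>")
assert extract_final_section(sample) == "**C**"
assert parse_choice(sample) == "C"
```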
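
Editor's note: the weighted final score printed by `eval_mlperf_accuracy.py` (and echoed as `exact_match` for the submission checker) follows the arithmetic added in the "add percentage" patch. A minimal sketch of that math is below; the dataset names, counts, and repeat factors are made up for the example.

```python
# Each dataset contributes correct/repeats, its maximum is total/repeats,
# and the headline number is final_score / max_score as a percentage.
dataset_stats = {
    "gpqa": {"correct": 180, "total": 198},      # illustrative counts
    "aime2025": {"correct": 20, "total": 30},    # illustrative counts
}
DATASET_REPEATS = {"gpqa": 1, "aime2025": 5}     # illustrative repeat factors

final_score = 0.0
max_score = 0.0
for name, stats in dataset_stats.items():
    repeats = DATASET_REPEATS.get(name, 1)
    final_score += stats["correct"] / repeats    # component_score
    max_score += stats["total"] / repeats        # max_component_score

final_score_percentage = (final_score / max_score * 100) if max_score > 0 else 0.0
print(f"FINAL SCORE: {final_score_percentage:.2f}% "
      f"({final_score:.2f}/{max_score:.2f})")
```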