6 changes: 2 additions & 4 deletions CybersecurityBenchmarks/README.md
@@ -119,13 +119,11 @@ export DATASETS=$PWD/CybersecurityBenchmarks/datasets

Each benchmark can run tests for multiple LLMs. Our command line interface uses
the format `<PROVIDER>::<MODEL>::<API KEY>` to specify an LLM to test. We
currently support APIs from OPENAI, and TOGETHER. For OpenAI compatible endpoints,
you can also specify a custom base URL by using this format: `<PROVIDER>::<MODEL>::<API KEY>::<BASE URL>`.
The followings are a few examples:
currently support APIs from OPENAI and TOGETHER. The following are a
few examples:

- `OPENAI::gpt-4::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>::https://api.openai.com/v1/`
- `TOGETHER::togethercomputer/llama-2-7b::<API KEY>`

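As a quick illustration of the specifier shape described above (a sketch only, not part of the README; the actual parsing lives in `CybersecurityBenchmarks/benchmark/llm.py`, changed later in this diff):

```python
# Sketch: how one of the example specifiers above decomposes on "::".
spec = "TOGETHER::togethercomputer/llama-2-7b::<API KEY>"
provider, model, api_key = spec.split("::")
print(provider)  # TOGETHER
print(model)     # togethercomputer/llama-2-7b
print(api_key)   # <API KEY>
```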
### How to run benchmarks for additional models and inference APIs
@@ -6,6 +6,7 @@
# pyre-unsafe

import argparse
import importlib.resources
import json
import os
import subprocess
@@ -78,47 +79,15 @@ def write(self, path: Path):
"""


class ExistingOrTempDirectory:
"""
Wrapper around TemporaryDirectory that provides the same context manager interface:
- If a path is supplied, uses directory at path (or creates it, if it doesn't exist)
that will not be cleaned up on context manager exit.
- Else, it calls TemporaryDitectory to create a tempdir that will be cleaned up on context manger exit.
"""

def __init__(self, existing: Optional[Path] = None):
if existing is None:
self.tmpdir: TemporaryDirectory = TemporaryDirectory()
return
if not existing.exists():
existing.mkdir()
self.tmpdir: Path = existing

def __enter__(self):
if isinstance(self.tmpdir, TemporaryDirectory):
return self.tmpdir.__enter__()
return self.tmpdir

def __exit__(self, exc_type, exc_value, traceback):
if isinstance(self.tmpdir, TemporaryDirectory):
return self.tmpdir.__exit__(exc_type, exc_value, traceback)
return False # A return value of 'False' means propagate the exception


def dump_values(
target: Path,
config: Config,
output_dir: Optional[Path] = None,
suppress_output: bool = False,
):
def dump_values(target: Path, config: Config):
target = target.resolve()
target = target.absolute()
root = Path(__file__).parent / "resources" if __package__ else Path("/")
MICROSNAPSHOT_PLUGIN = root / "microsnapshots" / "microsnapshot.py"
if not MICROSNAPSHOT_PLUGIN.exists():
MICROSNAPSHOT_PLUGIN = root / "microsnapshot.py"

with ExistingOrTempDirectory(existing=output_dir) as tmpdirobj:
with TemporaryDirectory() as tmpdirobj:
tmpdir = Path(tmpdirobj)
microsnapshot_path = MICROSNAPSHOT_PLUGIN
config_path = tmpdir / "rules.json"
@@ -139,8 +108,6 @@ def dump_values(
subprocess_result = subprocess.run(
["lldb", "-s", script_path, target, "--", *config.args],
timeout=config.timeout_seconds,
stdout=subprocess.DEVNULL if suppress_output else None,
stderr=subprocess.DEVNULL if suppress_output else None,
).returncode
except subprocess.TimeoutExpired:
print(" ENCOUNTERED TIMEOUT")
Expand All @@ -163,20 +130,15 @@ def dump_values(
"""


def dump_stacktrace(
target: Path,
config: Config,
output_dir: Optional[Path] = None,
suppress_output: bool = False,
):
def dump_stacktrace(target: Path, config: Config):
target = target.resolve()
target = target.absolute()
root = Path(__file__).parent / "resources" if __package__ else Path("/")
MICROSNAPSHOT_PLUGIN = root / "microsnapshots" / "microsnapshot.py"
if not MICROSNAPSHOT_PLUGIN.exists():
MICROSNAPSHOT_PLUGIN = root / "microsnapshot.py"

with ExistingOrTempDirectory(existing=output_dir) as tmpdirobj:
with TemporaryDirectory() as tmpdirobj:
tmpdir = Path(tmpdirobj)
microsnapshot_path = MICROSNAPSHOT_PLUGIN
config_path = tmpdir / "rules.json"
@@ -199,8 +161,6 @@ def dump_stacktrace(
subprocess_result = subprocess.run(
cmd,
timeout=config.timeout_seconds,
stdout=subprocess.DEVNULL if suppress_output else None,
stderr=subprocess.DEVNULL if suppress_output else None,
).returncode
except subprocess.TimeoutExpired:
print(" ENCOUNTERED TIMEOUT")
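For readers skimming this file's hunks: both helpers revert from the deleted `ExistingOrTempDirectory` wrapper to a plain `TemporaryDirectory`, so the scratch directory is always created fresh and removed on exit. A minimal sketch of that shape, with illustrative file names and script contents that are not copied from the diff:

```python
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory

def run_lldb_with_scratch_dir(target: Path, args: list[str], timeout_seconds: int) -> int:
    # Unlike the removed wrapper, which could reuse a caller-supplied directory,
    # this directory never survives past the with-block.
    with TemporaryDirectory() as tmpdirobj:
        tmpdir = Path(tmpdirobj)
        script_path = tmpdir / "commands.lldb"
        script_path.write_text("run\nbt\nquit\n")  # placeholder lldb script
        try:
            return subprocess.run(
                ["lldb", "-s", str(script_path), str(target), "--", *args],
                timeout=timeout_seconds,
            ).returncode
        except subprocess.TimeoutExpired:
            print(" ENCOUNTERED TIMEOUT")
            return 1
```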
10 changes: 6 additions & 4 deletions CybersecurityBenchmarks/benchmark/crwd_meta/malware_analysis.py
@@ -386,16 +386,18 @@ def process_results(self, results_path: Path) -> None:
)
difficulty = response["difficulty"]
if difficulty not in per_model["stat_per_model_per_difficulty"]:
stats = per_model["stat_per_model_per_difficulty"]
stats[difficulty] = default_dict_value.copy()
per_model["stat_per_model_per_difficulty"][difficulty] = (
default_dict_value.copy()
)
response_count = per_model["stat_per_model_per_difficulty"][difficulty]
per_model["stat_per_model_per_difficulty"][difficulty] = (
MalwareAnalysisBenchmark.update_dict_counts(response, response_count)
)
attack = response["attack"]
if attack not in per_model["stat_per_model_per_attack"]:
stats = per_model["stat_per_model_per_attack"]
stats[attack] = default_dict_value.copy()
per_model["stat_per_model_per_attack"][attack] = (
default_dict_value.copy()
)
response_count = per_model["stat_per_model_per_attack"][attack]
per_model["stat_per_model_per_attack"][attack] = (
MalwareAnalysisBenchmark.update_dict_counts(response, response_count)
@@ -37,11 +37,14 @@ def __init__(
self.input_modality: str = (
config.input_modality if config.input_modality else "text"
)
modalities = ["text", "image", "text_and_image"]
formatted_modalities = ", ".join([f"'{m}'" for m in modalities])
assert (
self.input_modality in modalities
), f"Supported input_modality values for ThreatIntelReasoningBenchmark are {formatted_modalities}"
self.input_modality
in [
"text",
"image",
"text_and_image",
]
), "Supported input_modality values for ThreatIntelReasoningBenchmark are 'text', 'image', 'text_and_image'"

@classmethod
def return_kind(cls) -> list[str]:
16 changes: 1 addition & 15 deletions CybersecurityBenchmarks/benchmark/llm.py
@@ -30,7 +30,7 @@
LOG: logging.Logger = logging.getLogger(__name__)


SPECIFICATION_FORMAT: str = "<PROVIDER>::<MODEL>(::<API KEY>)?(::BASE_URL)?"
SPECIFICATION_FORMAT: str = "<PROVIDER>::<MODEL>(::<API KEY>)?"
EXAMPLE_SPECIFICATION: str = "OPENAI::gpt-3.5-turbo::<YOUR API KEY>"


@@ -42,26 +42,13 @@ def create(
"""
Takes a string of the format <SPECIFICATION_FORMAT> and generates LLM instance.
See <EXAMPLE_SPECIFICATION> for example.

Supported formats:
1. PROVIDER::MODEL (uses default API key from environment)
2. PROVIDER::MODEL::API_KEY (uses provided API key)
3. PROVIDER::MODEL::API_KEY::BASE_URL (uses custom endpoint)

Examples:
- OPENAI::gpt-4::sk-123::https://api.custom.com/v1
- OPENAI::gpt-4::sk-123 (uses default OpenAI endpoint)
"""
api_key: str | None = None
base_url: str | None = None
split = identifier.split("::")

if len(split) == 2:
provider, name = split
elif len(split) == 3:
provider, name, api_key = split
elif len(split) == 4:
provider, name, api_key, base_url = split
else:
raise ValueError(f"Invalid identifier: {identifier}")

@@ -73,7 +60,6 @@ def create(
api_key=api_key,
max_tokens=max_tokens,
cot_end_tokens=cot_end_tokens,
base_url=base_url,
)

if provider == "OPENAI":
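With the four-part `BASE_URL` form removed, `create()` accepts only two- and three-part specifiers. A short paraphrase of the split logic that remains in the hunk above (the standalone function name here is mine, not the module's):

```python
def parse_specifier(identifier: str) -> tuple[str, str, str | None]:
    # Mirrors the branches left in LLM.create after this change.
    api_key: str | None = None
    split = identifier.split("::")
    if len(split) == 2:
        provider, name = split
    elif len(split) == 3:
        provider, name, api_key = split
    else:
        # PROVIDER::MODEL::API_KEY::BASE_URL now falls through to the error.
        raise ValueError(f"Invalid identifier: {identifier}")
    return provider, name, api_key

# parse_specifier("OPENAI::gpt-3.5-turbo::<YOUR API KEY>")
# -> ("OPENAI", "gpt-3.5-turbo", "<YOUR API KEY>")
```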
2 changes: 0 additions & 2 deletions CybersecurityBenchmarks/benchmark/llms/llm_base.py
@@ -44,7 +44,6 @@ class LLMConfig:
api_key: str | None = None
max_tokens: int = DEFAULT_MAX_TOKENS
cot_end_tokens: list[str] | None = None
base_url: str | None = None


class LLM(ABC):
@@ -57,7 +56,6 @@ def __init__(
self.cache_handler: CacheHandler | None = None
self.max_tokens: int = config.max_tokens
self.cot_end_tokens: list[str] | None = config.cot_end_tokens
self.base_url: str | None = config.base_url
if self.model not in self.valid_models():
LOG.warning(
f"{self.model} is not in the valid model list for {type(self).__name__}. Valid models are: {', '.join(self.valid_models())}."
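After this hunk `LLMConfig` no longer exposes `base_url`, so construction sites that passed one must drop the argument. A minimal usage sketch (placeholder values; the import path is inferred from the file location and may differ in practice):

```python
from CybersecurityBenchmarks.benchmark.llms.llm_base import LLMConfig

# No longer valid after this PR:
# LLMConfig(model="gpt-4", api_key="<API KEY>", base_url="https://example.com/v1")

# Remaining fields only; the provider client's default endpoint applies.
config = LLMConfig(model="gpt-4", api_key="<API KEY>")
```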
2 changes: 1 addition & 1 deletion CybersecurityBenchmarks/benchmark/llms/openai.py
@@ -38,7 +38,7 @@ def __init__(self, config: LLMConfig) -> None:
"gpt-5-mini",
]
super().__init__(config)
self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
self.client = openai.OpenAI(api_key=self.api_key) # noqa

@override
def chat(
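With no `base_url` passed, the client falls back to the openai SDK's default endpoint. If a compatible custom endpoint is still needed, the SDK's own environment variable can be used instead; a hedged sketch that relies on standard openai-python behavior rather than anything added by this PR:

```python
import os
import openai

# openai.OpenAI reads OPENAI_BASE_URL when no base_url argument is supplied,
# so a custom endpoint can still be targeted without touching this code.
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"  # placeholder endpoint
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<API KEY>"))
```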
69 changes: 1 addition & 68 deletions CybersecurityBenchmarks/benchmark/tests/test_llm.py
@@ -9,80 +9,13 @@
import unittest

from .. import llm
from ..llms.llm_base import LLMConfig
from ..llms.openai import OPENAI


class TestLLM(unittest.TestCase):
def test_create_without_base_url(self) -> None:
def test_create(self) -> None:
llama = llm.create("TOGETHER::togethercomputer/llama-2-7b::<my API key>")
self.assertTrue(isinstance(llama, llm.TOGETHER))
self.assertEqual(llama.model, "togethercomputer/llama-2-7b")
self.assertEqual(llama.api_key, "<my API key>")

self.assertRaises(ValueError, lambda: llm.create("GPT::unknown::<my API key"))

def test_create_with_base_url(self) -> None:
"""Test that create() function works with base_url parameter."""
# Test 4-part format: PROVIDER::MODEL::API_KEY::BASE_URL
openai_llm = llm.create(
"OPENAI::gpt-4::test-key::https://custom-api.example.com/v1"
)

self.assertTrue(isinstance(openai_llm, llm.OPENAI))
self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-key")
self.assertEqual(openai_llm.base_url, "https://custom-api.example.com/v1")

def test_llm_config_with_base_url(self) -> None:
"""Test that LLMConfig properly handles base_url parameter."""
config = LLMConfig(
model="gpt-4",
api_key="test-api-key",
base_url="https://custom-openai-endpoint.com/v1/",
)

self.assertEqual(config.model, "gpt-4")
self.assertEqual(config.api_key, "test-api-key")
self.assertEqual(config.base_url, "https://custom-openai-endpoint.com/v1/")

def test_llm_config_without_base_url(self) -> None:
"""Test that LLMConfig defaults base_url to None when not provided."""
config = LLMConfig(model="gpt-4", api_key="test-api-key")

self.assertEqual(config.model, "gpt-4")
self.assertEqual(config.api_key, "test-api-key")
self.assertIsNone(config.base_url)

def test_openai_llm_with_base_url(self) -> None:
"""Test that OPENAI LLM properly initializes with base_url."""
config = LLMConfig(
model="gpt-4",
api_key="test-api-key",
base_url="https://custom-openai-endpoint.com/v1/",
)

openai_llm = OPENAI(config)

self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-api-key")
self.assertEqual(openai_llm.base_url, "https://custom-openai-endpoint.com/v1/")

# Verify that the OpenAI client was initialized with the custom base_url
self.assertEqual(
str(openai_llm.client.base_url), "https://custom-openai-endpoint.com/v1/"
)

def test_openai_llm_without_base_url(self) -> None:
"""Test that OPENAI LLM works without base_url (uses default)."""
config = LLMConfig(model="gpt-4", api_key="test-api-key")

openai_llm = OPENAI(config)

self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-api-key")
self.assertIsNone(openai_llm.base_url)

# When base_url is None, OpenAI client should use default endpoint
# The default OpenAI base_url is "https://api.openai.com/v1"
self.assertEqual(str(openai_llm.client.base_url), "https://api.openai.com/v1/")
@@ -34,13 +34,11 @@ export DATASETS=$PWD/CybersecurityBenchmarks/datasets

Each benchmark can run tests for multiple LLMs. Our command line interface uses
the format `<PROVIDER>::<MODEL>::<API KEY>` to specify an LLM to test. We
currently support APIs from OPENAI, ANYSCALE, and TOGETHER. For OpenAI compatible endpoints,
you can also specify a custom base URL by using this format: `<PROVIDER>::<MODEL>::<API KEY>::<BASE URL>`.
The followings are a few examples:
currently support APIs from OPENAI, ANYSCALE, and TOGETHER. The following are a
few examples:

- `OPENAI::gpt-4::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>::https://api.openai.com/v1/`
- `ANYSCALE::meta-llama/Llama-2-13b-chat-hf::<API KEY>`
- `TOGETHER::togethercomputer/llama-2-7b::<API KEY>`
