6 changes: 2 additions & 4 deletions CybersecurityBenchmarks/README.md
@@ -119,13 +119,11 @@ export DATASETS=$PWD/CybersecurityBenchmarks/datasets

Each benchmark can run tests for multiple LLMs. Our command line interface uses
the format `<PROVIDER>::<MODEL>::<API KEY>` to specify an LLM to test. We
currently support APIs from OPENAI, and TOGETHER. For OpenAI compatible endpoints,
you can also specify a custom base URL by using this format: `<PROVIDER>::<MODEL>::<API KEY>::<BASE URL>`.
The followings are a few examples:
currently support APIs from OPENAI and TOGETHER. The following are a
few examples:

- `OPENAI::gpt-4::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>::https://api.openai.com/v1/`
- `TOGETHER::togethercomputer/llama-2-7b::<API KEY>`

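As a quick illustration of the specifier shape described above (a sketch only, not part of the README; the actual parsing lives in `CybersecurityBenchmarks/benchmark/llm.py`, changed later in this diff):

```python
# Sketch: how one of the example specifiers above decomposes on "::".
spec = "TOGETHER::togethercomputer/llama-2-7b::<API KEY>"
provider, model, api_key = spec.split("::")
print(provider)  # TOGETHER
print(model)     # togethercomputer/llama-2-7b
print(api_key)   # <API KEY>
```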
### How to run benchmarks for additional models and inference APIs
@@ -6,6 +6,7 @@
# pyre-unsafe

import argparse
import importlib.resources
import json
import os
import subprocess
@@ -78,47 +79,15 @@ def write(self, path: Path):
"""


class ExistingOrTempDirectory:
"""
Wrapper around TemporaryDirectory that provides the same context manager interface:
- If a path is supplied, uses directory at path (or creates it, if it doesn't exist)
that will not be cleaned up on context manager exit.
- Else, it calls TemporaryDitectory to create a tempdir that will be cleaned up on context manger exit.
"""

def __init__(self, existing: Optional[Path] = None):
if existing is None:
self.tmpdir: TemporaryDirectory = TemporaryDirectory()
return
if not existing.exists():
existing.mkdir()
self.tmpdir: Path = existing

def __enter__(self):
if isinstance(self.tmpdir, TemporaryDirectory):
return self.tmpdir.__enter__()
return self.tmpdir

def __exit__(self, exc_type, exc_value, traceback):
if isinstance(self.tmpdir, TemporaryDirectory):
return self.tmpdir.__exit__(exc_type, exc_value, traceback)
return False # A return value of 'False' means propagate the exception


def dump_values(
target: Path,
config: Config,
output_dir: Optional[Path] = None,
suppress_output: bool = False,
):
def dump_values(target: Path, config: Config):
target = target.resolve()
target = target.absolute()
root = Path(__file__).parent / "resources" if __package__ else Path("/")
MICROSNAPSHOT_PLUGIN = root / "microsnapshots" / "microsnapshot.py"
if not MICROSNAPSHOT_PLUGIN.exists():
MICROSNAPSHOT_PLUGIN = root / "microsnapshot.py"

with ExistingOrTempDirectory(existing=output_dir) as tmpdirobj:
with TemporaryDirectory() as tmpdirobj:
tmpdir = Path(tmpdirobj)
microsnapshot_path = MICROSNAPSHOT_PLUGIN
config_path = tmpdir / "rules.json"
@@ -139,8 +108,6 @@ def dump_values(
subprocess_result = subprocess.run(
["lldb", "-s", script_path, target, "--", *config.args],
timeout=config.timeout_seconds,
stdout=subprocess.DEVNULL if suppress_output else None,
stderr=subprocess.DEVNULL if suppress_output else None,
).returncode
except subprocess.TimeoutExpired:
print(" ENCOUNTERED TIMEOUT")
Expand All @@ -163,20 +130,15 @@ def dump_values(
"""


def dump_stacktrace(
target: Path,
config: Config,
output_dir: Optional[Path] = None,
suppress_output: bool = False,
):
def dump_stacktrace(target: Path, config: Config):
target = target.resolve()
target = target.absolute()
root = Path(__file__).parent / "resources" if __package__ else Path("/")
MICROSNAPSHOT_PLUGIN = root / "microsnapshots" / "microsnapshot.py"
if not MICROSNAPSHOT_PLUGIN.exists():
MICROSNAPSHOT_PLUGIN = root / "microsnapshot.py"

with ExistingOrTempDirectory(existing=output_dir) as tmpdirobj:
with TemporaryDirectory() as tmpdirobj:
tmpdir = Path(tmpdirobj)
microsnapshot_path = MICROSNAPSHOT_PLUGIN
config_path = tmpdir / "rules.json"
@@ -199,8 +161,6 @@ def dump_stacktrace(
subprocess_result = subprocess.run(
cmd,
timeout=config.timeout_seconds,
stdout=subprocess.DEVNULL if suppress_output else None,
stderr=subprocess.DEVNULL if suppress_output else None,
).returncode
except subprocess.TimeoutExpired:
print(" ENCOUNTERED TIMEOUT")
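For readers skimming this file's hunks: both helpers revert from the deleted `ExistingOrTempDirectory` wrapper to a plain `TemporaryDirectory`, so the scratch directory is always created fresh and removed on exit. A minimal sketch of that shape, with illustrative file names and script contents that are not copied from the diff:

```python
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory

def run_lldb_with_scratch_dir(target: Path, args: list[str], timeout_seconds: int) -> int:
    # Unlike the removed wrapper, which could reuse a caller-supplied directory,
    # this directory never survives past the with-block.
    with TemporaryDirectory() as tmpdirobj:
        tmpdir = Path(tmpdirobj)
        script_path = tmpdir / "commands.lldb"
        script_path.write_text("run\nbt\nquit\n")  # placeholder lldb script
        try:
            return subprocess.run(
                ["lldb", "-s", str(script_path), str(target), "--", *args],
                timeout=timeout_seconds,
            ).returncode
        except subprocess.TimeoutExpired:
            print(" ENCOUNTERED TIMEOUT")
            return 1
```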
10 changes: 6 additions & 4 deletions CybersecurityBenchmarks/benchmark/crwd_meta/malware_analysis.py
@@ -386,16 +386,18 @@ def process_results(self, results_path: Path) -> None:
)
difficulty = response["difficulty"]
if difficulty not in per_model["stat_per_model_per_difficulty"]:
stats = per_model["stat_per_model_per_difficulty"]
stats[difficulty] = default_dict_value.copy()
per_model["stat_per_model_per_difficulty"][difficulty] = (
default_dict_value.copy()
)
response_count = per_model["stat_per_model_per_difficulty"][difficulty]
per_model["stat_per_model_per_difficulty"][difficulty] = (
MalwareAnalysisBenchmark.update_dict_counts(response, response_count)
)
attack = response["attack"]
if attack not in per_model["stat_per_model_per_attack"]:
stats = per_model["stat_per_model_per_attack"]
stats[attack] = default_dict_value.copy()
per_model["stat_per_model_per_attack"][attack] = (
default_dict_value.copy()
)
response_count = per_model["stat_per_model_per_attack"][attack]
per_model["stat_per_model_per_attack"][attack] = (
MalwareAnalysisBenchmark.update_dict_counts(response, response_count)
@@ -37,11 +37,14 @@ def __init__(
self.input_modality: str = (
config.input_modality if config.input_modality else "text"
)
modalities = ["text", "image", "text_and_image"]
formatted_modalities = ", ".join([f"'{m}'" for m in modalities])
assert (
self.input_modality in modalities
), f"Supported input_modality values for ThreatIntelReasoningBenchmark are {formatted_modalities}"
self.input_modality
in [
"text",
"image",
"text_and_image",
]
), "Supported input_modality values for ThreatIntelReasoningBenchmark are 'text', 'image', 'text_and_image'"

@classmethod
def return_kind(cls) -> list[str]:
16 changes: 1 addition & 15 deletions CybersecurityBenchmarks/benchmark/llm.py
@@ -30,7 +30,7 @@
LOG: logging.Logger = logging.getLogger(__name__)


SPECIFICATION_FORMAT: str = "<PROVIDER>::<MODEL>(::<API KEY>)?(::BASE_URL)?"
SPECIFICATION_FORMAT: str = "<PROVIDER>::<MODEL>(::<API KEY>)?"
EXAMPLE_SPECIFICATION: str = "OPENAI::gpt-3.5-turbo::<YOUR API KEY>"


@@ -42,26 +42,13 @@ def create(
"""
Takes a string of the format <SPECIFICATION_FORMAT> and generates LLM instance.
See <EXAMPLE_SPECIFICATION> for example.

Supported formats:
1. PROVIDER::MODEL (uses default API key from environment)
2. PROVIDER::MODEL::API_KEY (uses provided API key)
3. PROVIDER::MODEL::API_KEY::BASE_URL (uses custom endpoint)

Examples:
- OPENAI::gpt-4::sk-123::https://api.custom.com/v1
- OPENAI::gpt-4::sk-123 (uses default OpenAI endpoint)
"""
api_key: str | None = None
base_url: str | None = None
split = identifier.split("::")

if len(split) == 2:
provider, name = split
elif len(split) == 3:
provider, name, api_key = split
elif len(split) == 4:
provider, name, api_key, base_url = split
else:
raise ValueError(f"Invalid identifier: {identifier}")

@@ -73,7 +60,6 @@ def create(
api_key=api_key,
max_tokens=max_tokens,
cot_end_tokens=cot_end_tokens,
base_url=base_url,
)

if provider == "OPENAI":
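With the four-part `BASE_URL` form removed, `create()` accepts only two- and three-part specifiers. A short paraphrase of the split logic that remains in the hunk above (the standalone function name here is mine, not the module's):

```python
def parse_specifier(identifier: str) -> tuple[str, str, str | None]:
    # Mirrors the branches left in LLM.create after this change.
    api_key: str | None = None
    split = identifier.split("::")
    if len(split) == 2:
        provider, name = split
    elif len(split) == 3:
        provider, name, api_key = split
    else:
        # PROVIDER::MODEL::API_KEY::BASE_URL now falls through to the error.
        raise ValueError(f"Invalid identifier: {identifier}")
    return provider, name, api_key

# parse_specifier("OPENAI::gpt-3.5-turbo::<YOUR API KEY>")
# -> ("OPENAI", "gpt-3.5-turbo", "<YOUR API KEY>")
```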
2 changes: 0 additions & 2 deletions CybersecurityBenchmarks/benchmark/llms/llm_base.py
@@ -44,7 +44,6 @@ class LLMConfig:
api_key: str | None = None
max_tokens: int = DEFAULT_MAX_TOKENS
cot_end_tokens: list[str] | None = None
base_url: str | None = None


class LLM(ABC):
@@ -57,7 +56,6 @@ def __init__(
self.cache_handler: CacheHandler | None = None
self.max_tokens: int = config.max_tokens
self.cot_end_tokens: list[str] | None = config.cot_end_tokens
self.base_url: str | None = config.base_url
if self.model not in self.valid_models():
LOG.warning(
f"{self.model} is not in the valid model list for {type(self).__name__}. Valid models are: {', '.join(self.valid_models())}."
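After this hunk `LLMConfig` no longer exposes `base_url`, so construction sites that passed one must drop the argument. A minimal usage sketch (placeholder values; the import path is inferred from the file location and may differ in practice):

```python
from CybersecurityBenchmarks.benchmark.llms.llm_base import LLMConfig

# No longer valid after this PR:
# LLMConfig(model="gpt-4", api_key="<API KEY>", base_url="https://example.com/v1")

# Remaining fields only; the provider client's default endpoint applies.
config = LLMConfig(model="gpt-4", api_key="<API KEY>")
```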
2 changes: 1 addition & 1 deletion CybersecurityBenchmarks/benchmark/llms/openai.py
@@ -38,7 +38,7 @@ def __init__(self, config: LLMConfig) -> None:
"gpt-5-mini",
]
super().__init__(config)
self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
self.client = openai.OpenAI(api_key=self.api_key) # noqa

@override
def chat(
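With no `base_url` passed, the client falls back to the openai SDK's default endpoint. If a compatible custom endpoint is still needed, the SDK's own environment variable can be used instead; a hedged sketch that relies on standard openai-python behavior rather than anything added by this PR:

```python
import os
import openai

# openai.OpenAI reads OPENAI_BASE_URL when no base_url argument is supplied,
# so a custom endpoint can still be targeted without touching this code.
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"  # placeholder endpoint
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<API KEY>"))
```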
69 changes: 1 addition & 68 deletions CybersecurityBenchmarks/benchmark/tests/test_llm.py
@@ -9,80 +9,13 @@
import unittest

from .. import llm
from ..llms.llm_base import LLMConfig
from ..llms.openai import OPENAI


class TestLLM(unittest.TestCase):
def test_create_without_base_url(self) -> None:
def test_create(self) -> None:
llama = llm.create("TOGETHER::togethercomputer/llama-2-7b::<my API key>")
self.assertTrue(isinstance(llama, llm.TOGETHER))
self.assertEqual(llama.model, "togethercomputer/llama-2-7b")
self.assertEqual(llama.api_key, "<my API key>")

self.assertRaises(ValueError, lambda: llm.create("GPT::unknown::<my API key"))

def test_create_with_base_url(self) -> None:
"""Test that create() function works with base_url parameter."""
# Test 4-part format: PROVIDER::MODEL::API_KEY::BASE_URL
openai_llm = llm.create(
"OPENAI::gpt-4::test-key::https://custom-api.example.com/v1"
)

self.assertTrue(isinstance(openai_llm, llm.OPENAI))
self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-key")
self.assertEqual(openai_llm.base_url, "https://custom-api.example.com/v1")

def test_llm_config_with_base_url(self) -> None:
"""Test that LLMConfig properly handles base_url parameter."""
config = LLMConfig(
model="gpt-4",
api_key="test-api-key",
base_url="https://custom-openai-endpoint.com/v1/",
)

self.assertEqual(config.model, "gpt-4")
self.assertEqual(config.api_key, "test-api-key")
self.assertEqual(config.base_url, "https://custom-openai-endpoint.com/v1/")

def test_llm_config_without_base_url(self) -> None:
"""Test that LLMConfig defaults base_url to None when not provided."""
config = LLMConfig(model="gpt-4", api_key="test-api-key")

self.assertEqual(config.model, "gpt-4")
self.assertEqual(config.api_key, "test-api-key")
self.assertIsNone(config.base_url)

def test_openai_llm_with_base_url(self) -> None:
"""Test that OPENAI LLM properly initializes with base_url."""
config = LLMConfig(
model="gpt-4",
api_key="test-api-key",
base_url="https://custom-openai-endpoint.com/v1/",
)

openai_llm = OPENAI(config)

self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-api-key")
self.assertEqual(openai_llm.base_url, "https://custom-openai-endpoint.com/v1/")

# Verify that the OpenAI client was initialized with the custom base_url
self.assertEqual(
str(openai_llm.client.base_url), "https://custom-openai-endpoint.com/v1/"
)

def test_openai_llm_without_base_url(self) -> None:
"""Test that OPENAI LLM works without base_url (uses default)."""
config = LLMConfig(model="gpt-4", api_key="test-api-key")

openai_llm = OPENAI(config)

self.assertEqual(openai_llm.model, "gpt-4")
self.assertEqual(openai_llm.api_key, "test-api-key")
self.assertIsNone(openai_llm.base_url)

# When base_url is None, OpenAI client should use default endpoint
# The default OpenAI base_url is "https://api.openai.com/v1"
self.assertEqual(str(openai_llm.client.base_url), "https://api.openai.com/v1/")
@@ -34,13 +34,11 @@ export DATASETS=$PWD/CybersecurityBenchmarks/datasets

Each benchmark can run tests for multiple LLMs. Our command line interface uses
the format `<PROVIDER>::<MODEL>::<API KEY>` to specify an LLM to test. We
currently support APIs from OPENAI, ANYSCALE, and TOGETHER. For OpenAI compatible endpoints,
you can also specify a custom base URL by using this format: `<PROVIDER>::<MODEL>::<API KEY>::<BASE URL>`.
The followings are a few examples:
currently support APIs from OPENAI, ANYSCALE, and TOGETHER. The following are a
few examples:

- `OPENAI::gpt-4::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>`
- `OPENAI::gpt-3.5-turbo::<API KEY>::https://api.openai.com/v1/`
- `ANYSCALE::meta-llama/Llama-2-13b-chat-hf::<API KEY>`
- `TOGETHER::togethercomputer/llama-2-7b::<API KEY>`
