Merged

63 commits
f75cb59
adding evaluators (WIP)
jp-agenta Dec 19, 2025
c2c553a
adding evaluators (WIP)
jp-agenta Dec 19, 2025
5a8dcd0
fixing evaluators
jp-agenta Dec 19, 2025
91e69e8
Merge branch 'release/v0.69.5' into chore/check-daytona-code-evaluator
jp-agenta Dec 19, 2025
a602930
testing numpy/openai/agenta
jp-agenta Dec 19, 2025
59f4797
fix typos in init
jp-agenta Dec 19, 2025
6c297c9
confirm works with localhost if public host
jp-agenta Dec 19, 2025
a717366
fix playground
jp-agenta Dec 20, 2025
b3d90f2
fix presets
jp-agenta Dec 20, 2025
5bdc802
remove blaot
jp-agenta Dec 20, 2025
5304e0f
remove bloat
jp-agenta Dec 20, 2025
00958cc
fix daytona imports
jp-agenta Dec 20, 2025
4071a3d
remove openai key from daytona
jp-agenta Dec 20, 2025
7d3ac94
WIP add runtimes
jp-agenta Dec 23, 2025
84bbdaa
Merge branch 'fix/remove-autoevals-and-rag-evaluators' into chore/che…
jp-agenta Dec 23, 2025
a4ffa8c
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Dec 23, 2025
93d7bb5
WIP
jp-agenta Dec 23, 2025
d3f2a87
Clean up extra logs
jp-agenta Dec 23, 2025
d485e2f
Add/Fix presets
jp-agenta Dec 23, 2025
d3c2af3
ruff format
jp-agenta Dec 23, 2025
a924c3c
Fix editor highlighting
jp-agenta Dec 23, 2025
cc7de34
Apply suggestion from @Copilot
junaway Dec 23, 2025
6b929d0
Apply suggestion from @Copilot
junaway Dec 23, 2025
605e9af
Minor vault fixes
jp-agenta Dec 23, 2025
e6d4803
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
08f8903
more vault fix
jp-agenta Dec 23, 2025
90a5896
more vault fixes
jp-agenta Dec 23, 2025
b4a663d
more cleanups
jp-agenta Dec 23, 2025
d960f6e
Apply suggestion from @Copilot
junaway Dec 23, 2025
3498973
Apply suggestion from @Copilot
junaway Dec 23, 2025
908531f
example fixes
jp-agenta Dec 23, 2025
83c80f9
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
8301c76
update locks
jp-agenta Dec 23, 2025
7fa102a
fix tabs/spaces conversion
jp-agenta Dec 23, 2025
4b6375e
clearer error printing with daytona
jp-agenta Dec 23, 2025
bf68e6b
apply eslint
jp-agenta Dec 23, 2025
3db392d
apply eslint
jp-agenta Dec 23, 2025
59a6e6b
apply es lint
jp-agenta Dec 23, 2025
cdf1ae0
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
51856c6
fix merge issues
jp-agenta Dec 25, 2025
3966be2
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
8a8d9df
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 26, 2025
18e2e3c
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 30, 2025
9ce5afe
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 2, 2026
d9d6858
ruff format
jp-agenta Jan 2, 2026
93ceec9
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 4, 2026
47f4ce6
fix runtime
jp-agenta Jan 4, 2026
6617b89
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 4, 2026
306e9a7
Fix runtime
jp-agenta Jan 4, 2026
7cfa317
fix blank form on reload
jp-agenta Jan 4, 2026
5c6729f
Merge branch 'release/v0.74.0' into chore/check-daytona-code-evaluator
jp-agenta Jan 5, 2026
b1e2664
added _sandbox_context
jp-agenta Jan 6, 2026
ccd6087
fix 3-tuple in secrets
jp-agenta Jan 6, 2026
f0beb20
restricted python and dependencies cleanup
jp-agenta Jan 6, 2026
710bdf9
add missing locks
jp-agenta Jan 6, 2026
840fcc1
Propagate custom code evaluator exception
jp-agenta Jan 6, 2026
ce3ba20
Error copy fix
jp-agenta Jan 6, 2026
93eee32
added docs for env vars
jp-agenta Jan 6, 2026
33ebca9
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Jan 6, 2026
9e7960f
Merge branch 'release/v0.75.0' into chore/check-daytona-code-evaluator
jp-agenta Jan 7, 2026
d9feb15
Fix poetry.lock
jp-agenta Jan 7, 2026
eef9477
fix var name in web
jp-agenta Jan 7, 2026
d8364c7
fix fallback in js/ts evaluators
jp-agenta Jan 7, 2026
3 changes: 3 additions & 0 deletions api/oss/src/core/workflows/dtos.py
@@ -181,6 +181,9 @@ class WorkflowServiceInterface(WorkflowServiceVersion):
class WorkflowServiceConfiguration(WorkflowServiceInterface):
script: Optional[Data] = None # str w/ validation
parameters: Optional[Data] = None # configuration values
runtime: Optional[str] = (
None # runtime environment (python, javascript, typescript), None = python
)


class WorkflowRevisionData(WorkflowServiceConfiguration):
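A minimal sketch of how the new runtime field might be populated, assuming WorkflowServiceConfiguration can be constructed with keyword arguments like an ordinary Pydantic-style DTO (the field names and import path are taken from the diff above; the values are illustrative only):

from oss.src.core.workflows.dtos import WorkflowServiceConfiguration

# Select the JavaScript runtime for this evaluator's code.
# Leaving runtime unset (None) falls back to Python, per the comment in the diff.
config = WorkflowServiceConfiguration(
    script=None,
    parameters={"threshold": 0.5},
    runtime="javascript",
)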
45 changes: 44 additions & 1 deletion api/oss/src/resources/evaluators/evaluators.py
@@ -298,6 +298,41 @@
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_presets": [
{
"key": "python_default",
"name": "Exact Match (Python)",
"values": {
"requires_llm_api_keys": False,
"runtime": "python",
"correct_answer_key": "correct_answer",
"code": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
},
"description": "Exact match evaluator implemented in Python.",
},
{
"key": "javascript_default",
"name": "Exact Match (JavaScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "javascript",
"correct_answer_key": "correct_answer",
"code": 'function evaluate(appParams, inputs, output, correctAnswer) {\n void appParams\n void inputs\n\n const outputStr =\n typeof output === "string" ? output : JSON.stringify(output)\n\n return outputStr === String(correctAnswer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in JavaScript.",
},
{
"key": "typescript_default",
"name": "Exact Match (TypeScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "typescript",
"correct_answer_key": "correct_answer",
"code": 'type OutputValue = string | Record<string, unknown>\n\nfunction evaluate(\n app_params: Record<string, string>,\n inputs: Record<string, string>,\n output: OutputValue,\n correct_answer: string\n): number {\n void app_params\n void inputs\n\n const outputStr =\n (typeof output === "string" ? output : JSON.stringify(output)) as string\n\n return outputStr === String(correct_answer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in TypeScript.",
},
],
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
@@ -310,10 +345,18 @@
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
"runtime": {
"label": "Runtime",
"type": "multiple_choice",
"default": "python",
"options": ["python", "javascript", "typescript"],
"advanced": True,
"description": "Runtime environment used to execute the evaluator code.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
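To illustrate how the new settings_presets block could be consumed, here is a small lookup helper. This is a sketch only: the helper name and the assumption that the registry is a list of evaluator dicts shaped like the one above are not part of this PR.

def get_preset_values(evaluators, evaluator_key, preset_key):
    """Return the 'values' dict of a named preset, or None if it does not exist."""
    for evaluator in evaluators:
        if evaluator.get("key") != evaluator_key:
            continue
        for preset in evaluator.get("settings_presets", []):
            if preset.get("key") == preset_key:
                return preset.get("values")
    return None

# Hypothetical usage: pick the TypeScript exact-match preset.
# get_preset_values(evaluators, "auto_custom_code_run", "typescript_default")
# -> {"requires_llm_api_keys": False, "runtime": "typescript", ...}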
13 changes: 7 additions & 6 deletions api/oss/src/routers/evaluators_router.py
@@ -98,15 +98,16 @@ async def evaluator_run(
workspace_id=str(request.state.workspace_id),
organization_id=str(request.state.organization_id),
)
credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = f"Secret {secret_token}"
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = credentials

with running_context_manager(RunningContext.get()):
running_ctx = RunningContext.get()
running_ctx.credentials = f"Secret {secret_token}"
ctx = RunningContext.get()
ctx.credentials = credentials

with tracing_context_manager(tracing_ctx):
with running_context_manager(ctx):
try:
result = await evaluators_service.run(
evaluator_key=evaluator_key,
87 changes: 69 additions & 18 deletions api/oss/src/services/evaluators_service.py
@@ -15,16 +15,25 @@
EvaluatorOutputInterface,
)
from oss.src.models.shared_models import Error, Result
from oss.src.services.security import sandbox

# COMMENTED OUT: autoevals dependency removed
# from autoevals.ragas import Faithfulness, ContextRelevancy
from oss.src.utils.logging import get_module_logger
from oss.src.utils.traces import (
get_field_value_from_trace_tree,
process_distributed_trace_into_trace_tree,
get_field_value_from_trace_tree,
)

from agenta.sdk.contexts.running import RunningContext
from agenta.sdk.models.workflows import (
WorkflowServiceRequest,
WorkflowServiceRequestData,
)
from agenta.sdk.workflows.builtin import (
auto_custom_code_run as sdk_auto_custom_code_run,
)


log = get_module_logger(__name__)


@@ -504,7 +513,7 @@ async def auto_webhook_test(
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] HTTP - {repr(e)}",
message=f"[webhook evaluator] HTTP - {repr(e)}",
stacktrace=traceback.format_exc(),
),
)
@@ -513,7 +522,7 @@
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] JSON - {repr(e)}",
message=f"[webhook evaluator] JSON - {repr(e)}",
stacktrace=traceback.format_exc(),
),
)
@@ -522,7 +531,7 @@
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] Exception - {repr(e)} ",
message=f"[webhook evaluator] Exception - {repr(e)} ",
stacktrace=traceback.format_exc(),
),
)
@@ -558,7 +567,7 @@ async def auto_custom_code_run(
"prediction": output,
"ground_truth": correct_answer,
}
response = await custom_code_run(
response = await sdk_custom_code_run(
input=EvaluatorInputInterface(
**{"inputs": inputs, "settings": settings_values}
)
@@ -575,16 +584,58 @@
)


async def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
result = sandbox.execute_code_safely(
app_params=input.inputs["app_config"],
inputs=input.inputs,
output=input.inputs["prediction"],
correct_answer=input.inputs["ground_truth"],
code=input.settings["code"],
datapoint=input.inputs["ground_truth"],
async def sdk_custom_code_run(
input: EvaluatorInputInterface,
) -> EvaluatorOutputInterface:
inputs = input.inputs or {}
settings = input.settings or {}

code = settings.get("code")
if code is None:
raise ValueError("Missing evaluator setting: code")

correct_answer_key = settings.get("correct_answer_key")
if not correct_answer_key:
correct_answer_key = (
"ground_truth" if "ground_truth" in inputs else "correct_answer"
)

threshold = settings.get("threshold", 0.5)
runtime = settings.get("runtime", "python")

workflow = sdk_auto_custom_code_run(
code=str(code),
correct_answer_key=str(correct_answer_key),
threshold=float(threshold),
runtime=runtime,
)

credentials = RunningContext.get().credentials

outputs = inputs.get("prediction", inputs.get("output"))
request = WorkflowServiceRequest(
data=WorkflowServiceRequestData(
inputs=inputs,
outputs=outputs,
),
credentials=credentials,
)
return {"outputs": {"score": result}}

response = await workflow.invoke(request=request)

# Check for error status and propagate it
if response.status and response.status.code and response.status.code >= 400:
error_message = response.status.message or "Custom code execution failed"
raise RuntimeError(error_message)

result = response.data.outputs if response.data else None

if isinstance(result, dict) and "score" in result:
score = result["score"]
else:
score = result

return {"outputs": {"score": score}}


async def auto_ai_critique(
Expand Down Expand Up @@ -912,7 +963,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac
if inputs and isinstance(inputs, dict) and correct_answer_key:
correct_answer = inputs[correct_answer_key]

secrets = await SecretsManager.retrieve_secrets()
secrets, _, _ = await SecretsManager.retrieve_secrets()

openai_api_key = None # secrets.get("OPENAI_API_KEY")
anthropic_api_key = None # secrets.get("ANTHROPIC_API_KEY")
@@ -1096,7 +1147,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac
if inputs and isinstance(inputs, dict) and correct_answer_key:
correct_answer = inputs[correct_answer_key]

secrets = await SecretsManager.retrieve_secrets()
secrets, _, _ = await SecretsManager.retrieve_secrets()

openai_api_key = None # secrets.get("OPENAI_API_KEY")
anthropic_api_key = None # secrets.get("ANTHROPIC_API_KEY")
@@ -2154,7 +2205,7 @@ async def auto_semantic_similarity(
"field_match_test": field_match_test,
"json_multi_field_match": json_multi_field_match,
"auto_webhook_test": webhook_test,
"auto_custom_code_run": custom_code_run,
"auto_custom_code_run": sdk_custom_code_run,
"auto_ai_critique": ai_critique,
"auto_starts_with": starts_with,
"auto_ends_with": ends_with,
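A hedged usage sketch of the new sdk_custom_code_run entry point, assuming EvaluatorInputInterface accepts inputs and settings keyword arguments (as in the auto_custom_code_run call above) and that the call happens inside a RunningContext that already carries credentials; the example values are illustrative only.

import asyncio

from oss.src.services.evaluators_service import (
    EvaluatorInputInterface,  # reachable via the module-level import shown above
    sdk_custom_code_run,
)

async def main():
    payload = EvaluatorInputInterface(
        inputs={
            "prediction": "Paris",    # the app output being scored
            "ground_truth": "Paris",  # the expected answer from the testset
        },
        settings={
            "runtime": "python",
            "correct_answer_key": "ground_truth",
            "threshold": 0.5,
            "code": (
                "def evaluate(app_params, inputs, output, correct_answer):\n"
                "    return 1.0 if output == correct_answer else 0.0\n"
            ),
        },
    )
    result = await sdk_custom_code_run(payload)
    print(result)  # expected shape: {"outputs": {"score": 1.0}}

asyncio.run(main())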
119 changes: 0 additions & 119 deletions api/oss/src/services/security/sandbox.py

This file was deleted.
