Merged

63 commits
f75cb59
adding evaluators (WIP)
jp-agenta Dec 19, 2025
c2c553a
adding evaluators (WIP)
jp-agenta Dec 19, 2025
5a8dcd0
fixing evaluators
jp-agenta Dec 19, 2025
91e69e8
Merge branch 'release/v0.69.5' into chore/check-daytona-code-evaluator
jp-agenta Dec 19, 2025
a602930
testing numpy/openai/agenta
jp-agenta Dec 19, 2025
59f4797
fix typos in init
jp-agenta Dec 19, 2025
6c297c9
confirm works with localhost if public host
jp-agenta Dec 19, 2025
a717366
fix playground
jp-agenta Dec 20, 2025
b3d90f2
fix presets
jp-agenta Dec 20, 2025
5bdc802
remove blaot
jp-agenta Dec 20, 2025
5304e0f
remove bloat
jp-agenta Dec 20, 2025
00958cc
fix daytona imports
jp-agenta Dec 20, 2025
4071a3d
remove openai key from daytona
jp-agenta Dec 20, 2025
7d3ac94
WIP add runtimes
jp-agenta Dec 23, 2025
84bbdaa
Merge branch 'fix/remove-autoevals-and-rag-evaluators' into chore/che…
jp-agenta Dec 23, 2025
a4ffa8c
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Dec 23, 2025
93d7bb5
WIP
jp-agenta Dec 23, 2025
d3f2a87
Clean up extra logs
jp-agenta Dec 23, 2025
d485e2f
Add/Fix presets
jp-agenta Dec 23, 2025
d3c2af3
ruff format
jp-agenta Dec 23, 2025
a924c3c
Fix editor highlighting
jp-agenta Dec 23, 2025
cc7de34
Apply suggestion from @Copilot
junaway Dec 23, 2025
6b929d0
Apply suggestion from @Copilot
junaway Dec 23, 2025
605e9af
Minor vault fixes
jp-agenta Dec 23, 2025
e6d4803
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
08f8903
more vault fix
jp-agenta Dec 23, 2025
90a5896
more vault fixes
jp-agenta Dec 23, 2025
b4a663d
more cleanups
jp-agenta Dec 23, 2025
d960f6e
Apply suggestion from @Copilot
junaway Dec 23, 2025
3498973
Apply suggestion from @Copilot
junaway Dec 23, 2025
908531f
example fixes
jp-agenta Dec 23, 2025
83c80f9
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
8301c76
update locks
jp-agenta Dec 23, 2025
7fa102a
fix tabs/spaces conversion
jp-agenta Dec 23, 2025
4b6375e
clearer error printing with daytona
jp-agenta Dec 23, 2025
bf68e6b
apply eslint
jp-agenta Dec 23, 2025
3db392d
apply eslint
jp-agenta Dec 23, 2025
59a6e6b
apply es lint
jp-agenta Dec 23, 2025
cdf1ae0
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
51856c6
fix merge issues
jp-agenta Dec 25, 2025
3966be2
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
8a8d9df
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 26, 2025
18e2e3c
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 30, 2025
9ce5afe
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 2, 2026
d9d6858
ruff format
jp-agenta Jan 2, 2026
93ceec9
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 4, 2026
47f4ce6
fix runtime
jp-agenta Jan 4, 2026
6617b89
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 4, 2026
306e9a7
Fix runtime
jp-agenta Jan 4, 2026
7cfa317
fix blank form on reload
jp-agenta Jan 4, 2026
5c6729f
Merge branch 'release/v0.74.0' into chore/check-daytona-code-evaluator
jp-agenta Jan 5, 2026
b1e2664
added _sandbox_context
jp-agenta Jan 6, 2026
ccd6087
fix 3-tuple in secrets
jp-agenta Jan 6, 2026
f0beb20
restricted python and dependencies cleanup
jp-agenta Jan 6, 2026
710bdf9
add missing locks
jp-agenta Jan 6, 2026
840fcc1
Propagate custom code evaluator exception
jp-agenta Jan 6, 2026
ce3ba20
Error copy fix
jp-agenta Jan 6, 2026
93eee32
added docs for env vars
jp-agenta Jan 6, 2026
33ebca9
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Jan 6, 2026
9e7960f
Merge branch 'release/v0.75.0' into chore/check-daytona-code-evaluator
jp-agenta Jan 7, 2026
d9feb15
Fix poetry.lock
jp-agenta Jan 7, 2026
eef9477
fix var name in web
jp-agenta Jan 7, 2026
d8364c7
fix fallback in js/ts evaluators
jp-agenta Jan 7, 2026
3 changes: 3 additions & 0 deletions api/oss/src/core/workflows/dtos.py
@@ -181,6 +181,9 @@ class WorkflowServiceInterface(WorkflowServiceVersion):
class WorkflowServiceConfiguration(WorkflowServiceInterface):
script: Optional[Data] = None # str w/ validation
parameters: Optional[Data] = None # configuration values
runtime: Optional[str] = (
None # runtime environment (python, javascript, typescript), None = python
)


class WorkflowRevisionData(WorkflowServiceConfiguration):
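A minimal sketch of how the new runtime field might be populated, assuming WorkflowServiceConfiguration can be constructed with keyword arguments like an ordinary Pydantic-style DTO (the field names and import path are taken from the diff above; the values are illustrative only):

from oss.src.core.workflows.dtos import WorkflowServiceConfiguration

# Select the JavaScript runtime for this evaluator's code.
# Leaving runtime unset (None) falls back to Python, per the comment in the diff.
config = WorkflowServiceConfiguration(
    script=None,
    parameters={"threshold": 0.5},
    runtime="javascript",
)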
45 changes: 44 additions & 1 deletion api/oss/src/resources/evaluators/evaluators.py
@@ -298,6 +298,41 @@
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_presets": [
{
"key": "python_default",
"name": "Exact Match (Python)",
"values": {
"requires_llm_api_keys": False,
"runtime": "python",
"correct_answer_key": "correct_answer",
"code": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
},
"description": "Exact match evaluator implemented in Python.",
},
{
"key": "javascript_default",
"name": "Exact Match (JavaScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "javascript",
"correct_answer_key": "correct_answer",
"code": 'function evaluate(appParams, inputs, output, correctAnswer) {\n void appParams\n void inputs\n\n const outputStr =\n typeof output === "string" ? output : JSON.stringify(output)\n\n return outputStr === String(correctAnswer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in JavaScript.",
},
{
"key": "typescript_default",
"name": "Exact Match (TypeScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "typescript",
"correct_answer_key": "correct_answer",
"code": 'type OutputValue = string | Record<string, unknown>\n\nfunction evaluate(\n app_params: Record<string, string>,\n inputs: Record<string, string>,\n output: OutputValue,\n correct_answer: string\n): number {\n void app_params\n void inputs\n\n const outputStr =\n (typeof output === "string" ? output : JSON.stringify(output)) as string\n\n return outputStr === String(correct_answer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in TypeScript.",
},
],
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
@@ -310,10 +345,18 @@
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
"runtime": {
"label": "Runtime",
"type": "multiple_choice",
"default": "python",
"options": ["python", "javascript", "typescript"],
"advanced": True,
"description": "Runtime environment used to execute the evaluator code.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
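To illustrate how the new settings_presets block could be consumed, here is a small lookup helper. This is a sketch only: the helper name and the assumption that the registry is a list of evaluator dicts shaped like the one above are not part of this PR.

def get_preset_values(evaluators, evaluator_key, preset_key):
    """Return the 'values' dict of a named preset, or None if it does not exist."""
    for evaluator in evaluators:
        if evaluator.get("key") != evaluator_key:
            continue
        for preset in evaluator.get("settings_presets", []):
            if preset.get("key") == preset_key:
                return preset.get("values")
    return None

# Hypothetical usage: pick the TypeScript exact-match preset.
# get_preset_values(evaluators, "auto_custom_code_run", "typescript_default")
# -> {"requires_llm_api_keys": False, "runtime": "typescript", ...}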
13 changes: 7 additions & 6 deletions api/oss/src/routers/evaluators_router.py
@@ -98,15 +98,16 @@ async def evaluator_run(
workspace_id=str(request.state.workspace_id),
organization_id=str(request.state.organization_id),
)
credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = f"Secret {secret_token}"
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = credentials

with running_context_manager(RunningContext.get()):
running_ctx = RunningContext.get()
running_ctx.credentials = f"Secret {secret_token}"
ctx = RunningContext.get()
ctx.credentials = credentials

with tracing_context_manager(tracing_ctx):
with running_context_manager(ctx):
try:
result = await evaluators_service.run(
evaluator_key=evaluator_key,
87 changes: 69 additions & 18 deletions api/oss/src/services/evaluators_service.py
@@ -15,16 +15,25 @@
EvaluatorOutputInterface,
)
from oss.src.models.shared_models import Error, Result
from oss.src.services.security import sandbox

# COMMENTED OUT: autoevals dependency removed
# from autoevals.ragas import Faithfulness, ContextRelevancy
from oss.src.utils.logging import get_module_logger
from oss.src.utils.traces import (
get_field_value_from_trace_tree,
process_distributed_trace_into_trace_tree,
get_field_value_from_trace_tree,
)

from agenta.sdk.contexts.running import RunningContext
from agenta.sdk.models.workflows import (
WorkflowServiceRequest,
WorkflowServiceRequestData,
)
from agenta.sdk.workflows.builtin import (
auto_custom_code_run as sdk_auto_custom_code_run,
)


log = get_module_logger(__name__)


@@ -504,7 +513,7 @@ async def auto_webhook_test(
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] HTTP - {repr(e)}",
message=f"[webhook evaluator] HTTP - {repr(e)}",
stacktrace=traceback.format_exc(),
),
)
@@ -513,7 +522,7 @@
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] JSON - {repr(e)}",
message=f"[webhook evaluator] JSON - {repr(e)}",
stacktrace=traceback.format_exc(),
),
)
@@ -522,7 +531,7 @@
type="error",
value=None,
error=Error(
message=f"[webhook evaluation] Exception - {repr(e)} ",
message=f"[webhook evaluator] Exception - {repr(e)} ",
stacktrace=traceback.format_exc(),
),
)
@@ -558,7 +567,7 @@ async def auto_custom_code_run(
"prediction": output,
"ground_truth": correct_answer,
}
response = await custom_code_run(
response = await sdk_custom_code_run(
input=EvaluatorInputInterface(
**{"inputs": inputs, "settings": settings_values}
)
@@ -575,16 +584,58 @@
)


async def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
result = sandbox.execute_code_safely(
app_params=input.inputs["app_config"],
inputs=input.inputs,
output=input.inputs["prediction"],
correct_answer=input.inputs["ground_truth"],
code=input.settings["code"],
datapoint=input.inputs["ground_truth"],
async def sdk_custom_code_run(
input: EvaluatorInputInterface,
) -> EvaluatorOutputInterface:
inputs = input.inputs or {}
settings = input.settings or {}

code = settings.get("code")
if code is None:
raise ValueError("Missing evaluator setting: code")

correct_answer_key = settings.get("correct_answer_key")
if not correct_answer_key:
correct_answer_key = (
"ground_truth" if "ground_truth" in inputs else "correct_answer"
)

threshold = settings.get("threshold", 0.5)
runtime = settings.get("runtime", "python")

workflow = sdk_auto_custom_code_run(
code=str(code),
correct_answer_key=str(correct_answer_key),
threshold=float(threshold),
runtime=runtime,
)

credentials = RunningContext.get().credentials

outputs = inputs.get("prediction", inputs.get("output"))
request = WorkflowServiceRequest(
data=WorkflowServiceRequestData(
inputs=inputs,
outputs=outputs,
),
credentials=credentials,
)
return {"outputs": {"score": result}}

response = await workflow.invoke(request=request)

# Check for error status and propagate it
if response.status and response.status.code and response.status.code >= 400:
error_message = response.status.message or "Custom code execution failed"
raise RuntimeError(error_message)

result = response.data.outputs if response.data else None

if isinstance(result, dict) and "score" in result:
score = result["score"]
else:
score = result

return {"outputs": {"score": score}}


async def auto_ai_critique(
Expand Down Expand Up @@ -912,7 +963,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac
if inputs and isinstance(inputs, dict) and correct_answer_key:
correct_answer = inputs[correct_answer_key]

secrets = await SecretsManager.retrieve_secrets()
secrets, _, _ = await SecretsManager.retrieve_secrets()

openai_api_key = None # secrets.get("OPENAI_API_KEY")
anthropic_api_key = None # secrets.get("ANTHROPIC_API_KEY")
@@ -1096,7 +1147,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac
if inputs and isinstance(inputs, dict) and correct_answer_key:
correct_answer = inputs[correct_answer_key]

secrets = await SecretsManager.retrieve_secrets()
secrets, _, _ = await SecretsManager.retrieve_secrets()

openai_api_key = None # secrets.get("OPENAI_API_KEY")
anthropic_api_key = None # secrets.get("ANTHROPIC_API_KEY")
@@ -2154,7 +2205,7 @@ async def auto_semantic_similarity(
"field_match_test": field_match_test,
"json_multi_field_match": json_multi_field_match,
"auto_webhook_test": webhook_test,
"auto_custom_code_run": custom_code_run,
"auto_custom_code_run": sdk_custom_code_run,
"auto_ai_critique": ai_critique,
"auto_starts_with": starts_with,
"auto_ends_with": ends_with,
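A hedged usage sketch of the new sdk_custom_code_run entry point, assuming EvaluatorInputInterface accepts inputs and settings keyword arguments (as in the auto_custom_code_run call above) and that the call happens inside a RunningContext that already carries credentials; the example values are illustrative only.

import asyncio

from oss.src.services.evaluators_service import (
    EvaluatorInputInterface,  # reachable via the module-level import shown above
    sdk_custom_code_run,
)

async def main():
    payload = EvaluatorInputInterface(
        inputs={
            "prediction": "Paris",    # the app output being scored
            "ground_truth": "Paris",  # the expected answer from the testset
        },
        settings={
            "runtime": "python",
            "correct_answer_key": "ground_truth",
            "threshold": 0.5,
            "code": (
                "def evaluate(app_params, inputs, output, correct_answer):\n"
                "    return 1.0 if output == correct_answer else 0.0\n"
            ),
        },
    )
    result = await sdk_custom_code_run(payload)
    print(result)  # expected shape: {"outputs": {"score": 1.0}}

asyncio.run(main())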
119 changes: 0 additions & 119 deletions api/oss/src/services/security/sandbox.py

This file was deleted.
