EleutherAI · tanios13 · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025
@@ -1,16 +1,21 @@
 task: mbpp
 dataset_path: google-research-datasets/mbpp
-dataset_name: full
+dataset_name: sanitized
 unsafe_code: true
 output_type: generate_until
 test_split: test
-doc_to_text: "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n"
+doc_to_text: "You are an expert Python programmer, and here is your task: {{prompt}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n"
 doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
 target_delimiter: ""
 metric_list:
   - metric: !function utils.pass_at_1
     aggregation: mean
     higher_is_better: true
+filter_list:
+  - name: "extract_code"
+    filter:
+      - function: "custom"
+        filter_fn: !function utils.build_predictions
 generation_kwargs:
   until:
     - "[DONE]"

@@ -1,10 +1,10 @@
 task: mbpp_instruct
 dataset_path: google-research-datasets/mbpp
-dataset_name: full
+dataset_name: sanitized
 unsafe_code: true
 output_type: generate_until
 test_split: test
-doc_to_text: "You are an expert Python programmer, and here is your task:\n{{text}}\nYour code should pass these tests:\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}"
+doc_to_text: "You are an expert Python programmer, and here is your task:\n{{prompt}}\nYour code should pass these tests:\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}"
 doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
 gen_prefix: "\n```python\n"
 target_delimiter: ""

@@ -3,3 +3,4 @@ task: mbpp_plus
 dataset_path: evalplus/mbppplus
 dataset_name: null
 doc_to_text: "You are an expert Python programmer, and here is your task: {{prompt if prompt is defined else text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n"
+doc_to_target: "{{test}}"
@@ -2,8 +2,8 @@ include: mbpp_instruct.yaml
 task: mbpp_plus_instruct
 dataset_path: evalplus/mbppplus
 dataset_name: null
-doc_to_text: "{{prompt if prompt is defined else text}} Your code should satisfy the following assertion:\n{{test_list[0]}}"
-doc_to_target: "{{test_list[0]}}"
+doc_to_text: "{{prompt if prompt is defined else text}} Your code should satisfy the following assertion:\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}"
+doc_to_target: "{{test}}"
 gen_prefix: "Here is a solution to this programming problem:\n```python\n"
 num_fewshot: 0
 generation_kwargs:

@@ -22,6 +22,7 @@ def pass_at_1(
         references = [references]
     if isinstance(predictions[0], str):
         predictions = [[p] for p in predictions]
+
     return pass_at_k.compute(
         references=references,
         predictions=predictions,
@@ -30,18 +31,38 @@ def pass_at_1(
 
 
 def extract_code_blocks(text: str) -> str:
-    # Pattern to match ```...``` blocks
-    pattern = r"```(?:\w+)?\n?(.*?)\n?```"
-    # (+ ```) as we add the opening "```python" to the gen_prefix
-    matches = re.findall(pattern, r"```" + text, re.DOTALL)
-    # if no matches, try to match ```...``` blocks (after removing the language)
-    if not matches:
-        text_without_lang = re.sub(r"```python", "```", text)
-        matches = re.findall(pattern, text_without_lang, re.DOTALL)
-    if not matches:
-        return ""
-    else:
-        return matches[0]
+    text = text.strip()
+
+    # 1. If starts with ```python → take everything until the next ```
+    if text.startswith("```python"):
+        end = text.find("```", len("```python"))
+        if end != -1:
+            return text[len("```python") : end].strip()
 prefix = gen_prefix + " " if gen_prefix else "" 
 prefix = gen_prefix + " " if gen_prefix else "" 
+        return text[len("```python") :].strip()
+
+    # 2. If starts with ``` but not python → take until next ```
+    if text.startswith("```"):
+        end = text.find("```", 3)
+        if end != -1:
+            return text[3:end].strip()
+        return text[3:].strip()
+
+    # 3. If the text doesn't start with ```, normalize ```python fences to plain ```
+    text = text.replace("```python", "```")
+    count_backticks = text.count("```")
+
+    # 4. If count is odd → take everything until the last ```
+    if count_backticks % 2 == 1 and count_backticks > 0:
+        last = text.rfind("```")
+        return text[:last].strip()
+
+    # 5. If count is even and >= 2 → take first complete block between ```
+    if count_backticks >= 2:
+        first = text.find("```")
+        second = text.find("```", first + 3)
+        return text[first + 3 : second].strip()
+
+    return text
 
 
 def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]: