@@ -263,6 +263,9 @@ async def grade(self, computer: ComputerInterface) -> SWELancerGrade:
263263 "pytest_logs" : pytest_outputs ,
264264 "variant" : self .variant ,
265265 "model_patch" : model_patch .decode ("utf-8" ),
266+
267+ # Placeholder: partial credit is currently set to `correct`. It could instead be derived from the percentage of tests passed, how far the e2e test progressed, etc.
268+ "partial_credit" : correct
266269 }
267270 ),
268271 patch_path = "" ,
@@ -411,6 +414,9 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
411414 "swe_manager_hard" : {"correct" : 0 , "total" : 0 },
412415 }
413416
417+ partial_credit_sum = 0.0
418+ partial_credit_count = 0
419+
414420 for task , result in results :
415421 try :
416422 grader_log = json .loads (result .grade .grader_log )
@@ -438,19 +444,24 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
438444 hard_variant = f"{ variant } _hard"
439445 if hard_variant in original_summary ["metadata_variants" ]["accuracy" ]:
440446 # Update earnings and available for hard variants
441- original_summary ["metadata_variants" ]["earnings" ][hard_variant ] += (
442- earned
443- )
444- original_summary ["metadata_variants" ]["available" ][hard_variant ] += (
445- available
446- )
447+ original_summary ["metadata_variants" ]["earnings" ][hard_variant ] += earned
448+ original_summary ["metadata_variants" ]["available" ][hard_variant ] += available
447449
448450 # Update accuracy counters for hard variants
449451 variant_counts [hard_variant ]["total" ] += 1
450452 if earned > 0 :
451453 variant_counts [hard_variant ]["correct" ] += 1
452454
453455 original_summary ["pytest_log" ] = grader_log .get ("pytest_log" , "No logs found" )
456+
457+ # Compute partial credit for each task. Use provided partial_credit if available,
458+ # otherwise derive it: if earned equals available (and available > 0), then full credit, else 0.
459+ pc = grader_log .get ("partial_credit" )
460+ if pc is None :
461+ pc = 1.0 if (available > 0 and earned == available ) else 0.0
462+ partial_credit_sum += pc
463+ partial_credit_count += 1
464+
454465 except Exception as e :
455466 print (str (e ))
456467
@@ -461,6 +472,8 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
461472 if total > 0 :
462473 original_summary ["metadata_variants" ]["accuracy" ][variant ] = correct / total
463474
475+ original_summary ["average_partial_credit" ] = partial_credit_sum / partial_credit_count if partial_credit_count else 0.0
476+
464477 return original_summary
465478
466479 except Exception as e :
0 commit comments