Skip to content
This repository was archived by the owner on Jul 18, 2025. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions swelancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ async def grade(self, computer: ComputerInterface) -> SWELancerGrade:
"pytest_logs": pytest_outputs,
"variant": self.variant,
"model_patch": model_patch.decode("utf-8"),

# Placeholder: partial credit currently just mirrors `correct`. It could instead be computed from the percentage of tests passed, how far the e2e test got, etc.
"partial_credit": correct
}
),
patch_path="",
Expand Down Expand Up @@ -411,6 +414,9 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
"swe_manager_hard": {"correct": 0, "total": 0},
}

partial_credit_sum = 0.0
partial_credit_count = 0

for task, result in results:
try:
grader_log = json.loads(result.grade.grader_log)
Expand Down Expand Up @@ -438,19 +444,24 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
hard_variant = f"{variant}_hard"
if hard_variant in original_summary["metadata_variants"]["accuracy"]:
# Update earnings and available for hard variants
original_summary["metadata_variants"]["earnings"][hard_variant] += (
earned
)
original_summary["metadata_variants"]["available"][hard_variant] += (
available
)
original_summary["metadata_variants"]["earnings"][hard_variant] += earned
original_summary["metadata_variants"]["available"][hard_variant] += available

# Update accuracy counters for hard variants
variant_counts[hard_variant]["total"] += 1
if earned > 0:
variant_counts[hard_variant]["correct"] += 1

original_summary["pytest_log"] = grader_log.get("pytest_log", "No logs found")

# Compute partial credit for each task. Use provided partial_credit if available,
# otherwise derive it: if earned equals available (and available > 0), then full credit, else 0.
pc = grader_log.get("partial_credit")
if pc is None:
pc = 1.0 if (available > 0 and earned == available) else 0.0
partial_credit_sum += pc
partial_credit_count += 1

except Exception as e:
print(str(e))

Expand All @@ -461,6 +472,8 @@ async def get_summary(self, results: list[tuple[ComputerTask, FinalResult]]) ->
if total > 0:
original_summary["metadata_variants"]["accuracy"][variant] = correct / total

original_summary["average_partial_credit"] = partial_credit_sum / partial_credit_count if partial_credit_count else 0.0

return original_summary

except Exception as e:
Expand Down