From 49a967cb944ebc5f04f5fe07e639ad6babf3d6be Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:00:27 -0800 Subject: [PATCH 01/33] remove P2/P3 suppression instruction --- src/create-prompt/templates/review-prompt.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 418d264..831839e 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -134,8 +134,8 @@ Commenting rules: Submission: - Do not submit inline comments when: - the PR appears formatting-only, or - - all findings are low-severity (P2/P3), or - you cannot anchor a high-confidence issue to a specific changed line. +- Submit all findings P0-P3 that meet the bug detection criteria. Low-severity issues (P2/P3) should still be submitted if they would cause runtime errors, incorrect behavior, or security vulnerabilities. - Do not escalate style/formatting into P0/P1 just to justify leaving an inline comment. - If no issues are found and a prior "no issues" comment from this bot already exists, skip submitting another comment to avoid redundancy. - If no issues are found and no prior "no issues" comment exists, post a single brief top-level summary noting no issues. From 23e8f1f6f8a9ed9d0808fb5d6ab34bcbf02c6dd8 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:02:07 -0800 Subject: [PATCH 02/33] add import verification step --- src/create-prompt/templates/review-prompt.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 831839e..859beb5 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -113,6 +113,11 @@ Cross-reference capability: - If a test references a constant or path, verify it matches the production code's actual behavior. - For any suspicious pattern, search the codebase to confirm your understanding before flagging an issue. +Import verification: +- For new imports added in the diff, use Grep to verify the imported symbol (class, function, constant) actually exists in the codebase or is a valid external package. +- For imports from the same project, confirm the symbol is exported from the source file. +- Flag any import that references a non-existent symbol as a bug (will cause ImportError/ModuleNotFoundError at runtime). + Accuracy gates: - Base findings strictly on the current diff and repo context available via gh/MCP; avoid speculation. - If confidence is low, phrase a single concise clarifying question instead of asserting a bug. From e422d6a2f2513cda3ac24bf4174ca70d046f6f99 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:09:42 -0800 Subject: [PATCH 03/33] Add structured analysis phases --- src/create-prompt/templates/review-prompt.ts | 35 +++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 859beb5..6724440 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -29,18 +29,29 @@ Objectives: 3) Leave concise inline comments (1-2 sentences) on bugs introduced by the PR. You may also comment on unchanged lines if the PR's changes expose or trigger issues there—but explain the connection clearly. Procedure: -- Run: gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews -- Prefer reviewing the local git diff since the PR branch is already checked out: - - Ensure you have the base branch ref locally (fetch if needed). - - Find merge base between HEAD and the base branch. - - Run git diff from that merge base to HEAD to see exactly what would merge. - - Example: - - git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName} - - MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName}) - - git diff $MERGE_BASE..HEAD -- Use gh PR diff/file APIs only as a fallback when local git diff is not possible: - - gh pr diff ${prNumber} --repo ${repoFullName} - - gh api repos/${repoFullName}/pulls/${prNumber}/files --paginate --jq '.[] | {filename,patch,additions,deletions}' +Follow these phases IN ORDER. Do not skip to submitting findings until you complete Phase 1. + +## Phase 1: Context Gathering (REQUIRED before making any findings) +1. Check existing comments: gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews +2. Get the full diff: + - git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName} + - MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName}) + - git diff $MERGE_BASE..HEAD +3. For EACH file in the diff, gather context: + - For new imports: Grep to verify the imported symbol exists + - For new/modified functions: Grep for callers to understand usage patterns + - For functions that process data: Read surrounding code to understand expected types +4. Do NOT identify bugs yet - focus only on understanding the changes. + +## Phase 2: Issue Identification (ONLY AFTER Phase 1 is complete) +1. Review ALL changed lines systematically - Do NOT stop after finding just a few issues +2. For each potential issue: + - Verify with Grep/Read before flagging (do not speculate) + - Trace data flow to confirm the bug path exists + - Check if the pattern exists elsewhere in the codebase (may be intentional) +3. Continue until you have reviewed every changed line in every file + +## Phase 3: Submit Review - Prefer github_inline_comment___create_inline_comment with side="RIGHT" to post inline findings on changed/added lines - Compute exact diff positions (path + position) for each issue; every substantive comment must be inline on the changed line (no new top-level issue comments). - Detect prior top-level "no issues" comments authored by this bot (e.g., "no issues", "No issues found", "LGTM", including emoji-prefixed variants). From d0d4251c91f6233cebbc3c9d23141e55aa4d9d3e Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:34:44 -0800 Subject: [PATCH 04/33] Add Thorough Analysis Checklist --- src/create-prompt/templates/review-prompt.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 6724440..d0a476c 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -50,6 +50,12 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - Trace data flow to confirm the bug path exists - Check if the pattern exists elsewhere in the codebase (may be intentional) 3. Continue until you have reviewed every changed line in every file +4. Thorough analysis checklist (complete before moving to Phase 3): + - Do NOT submit after finding only 2-3 issues - a common failure mode is stopping too early + - Have you examined every modified function/method in each changed file? + - If the PR touches multiple files, have you analyzed interactions between changes? + - When you found a bug pattern, did you search for the same pattern elsewhere in the diff? + - For PRs >200 lines: explicitly confirm all sections have been reviewed ## Phase 3: Submit Review - Prefer github_inline_comment___create_inline_comment with side="RIGHT" to post inline findings on changed/added lines @@ -76,9 +82,6 @@ Diff Side Selection (CRITICAL): - The 'line' parameter refers to the line number on the specified side of the diff - Ensure the line numbers you use correspond to the side you choose; -How Many Findings to Return: -Output all findings that the original author would fix if they knew about it. If there is no finding that a person would definitely love to see and fix, prefer outputting no findings. Do not stop at the first qualifying finding. Continue until you've listed every qualifying finding. - Key Guidelines for Bug Detection: Only flag an issue as a bug if: 1. It meaningfully impacts the accuracy, performance, security, or maintainability of the code. From 26c52521d35c0b26d0424ce3128669aa433615cc Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:39:23 -0800 Subject: [PATCH 05/33] consolidate instructions into structured phases and remove redundant sections --- src/create-prompt/templates/review-prompt.ts | 93 +++++--------------- 1 file changed, 24 insertions(+), 69 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index d0a476c..530bf74 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -50,7 +50,30 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - Trace data flow to confirm the bug path exists - Check if the pattern exists elsewhere in the codebase (may be intentional) 3. Continue until you have reviewed every changed line in every file -4. Thorough analysis checklist (complete before moving to Phase 3): +4. Cross-reference checks: + - When reviewing tests, search for related constants and configurations + - Use Grep and Read tools to understand relationships between files—do not rely solely on diff output + - If a test references a constant or path, verify it matches the production code's actual behavior +5. Import verification: + - For new imports, use Grep to verify the imported symbol actually exists in the codebase or is a valid external package + - Flag any import that references a non-existent symbol as a bug (will cause ImportError/ModuleNotFoundError at runtime) +6. Accuracy gates: + - Only flag bugs introduced by this PR (not pre-existing issues) + - Base findings strictly on the current diff and repo context; avoid speculation + - If confidence is low, phrase a clarifying question instead of asserting a bug + - Never raise purely stylistic or preference-only concerns +7. Deduplication: Never repeat or re-raise an issue previously highlighted by this bot on this PR +8. Priority levels for findings: + - [P0] - Drop everything to fix. Blocking release/operations + - [P1] - Urgent. Should be addressed in next cycle + - [P2] - Normal. To be fixed eventually + - [P3] - Low. Nice to have +9. Comment format for each finding: + - Structure: **[P0-P3] Clear title (≤ 80 chars, imperative mood)** followed by 1 paragraph explanation + - Be clear about why it's a bug and when/where it manifests + - Brief (1 paragraph max), code chunks max 3 lines + - Matter-of-fact tone, immediately graspable by the author +10. Thorough analysis checklist (complete before moving to Phase 3): - Do NOT submit after finding only 2-3 issues - a common failure mode is stopping too early - Have you examined every modified function/method in each changed file? - If the PR touches multiple files, have you analyzed interactions between changes? @@ -82,74 +105,6 @@ Diff Side Selection (CRITICAL): - The 'line' parameter refers to the line number on the specified side of the diff - Ensure the line numbers you use correspond to the side you choose; -Key Guidelines for Bug Detection: -Only flag an issue as a bug if: -1. It meaningfully impacts the accuracy, performance, security, or maintainability of the code. -2. The bug is discrete and actionable (not a general issue). -3. Fixing the bug does not demand a level of rigor not present in the rest of the codebase. -4. The bug was introduced in the PR (pre-existing bugs should not be flagged). -5. The author would likely fix the issue if made aware of it. -6. The bug does not rely on unstated assumptions. -7. Must identify provably affected code parts (not speculation). -8. The bug is clearly not intentional. - -Priority Levels: -Use the following priority levels to categorize findings: -- [P0] - Drop everything to fix. Blocking release/operations -- [P1] - Urgent. Should be addressed in next cycle -- [P2] - Normal. To be fixed eventually -- [P3] - Low. Nice to have - -Comment Guidelines: -Your review comments should be: -1. Clear about why the issue is a bug -2. Appropriately communicate severity -3. Brief - at most 1 paragraph -4. Code chunks max 3 lines, wrapped in markdown -5. Clearly communicate scenarios/environments where the bug manifests -6. Matter-of-fact tone without being accusatory -7. Immediately graspable by the original author -8. Avoid excessive flattery - -Output Format: -Structure each inline comment as: -**[P0-P3] Clear title (≤ 80 chars, imperative mood)** -(blank line) -Explanation of why this is a problem (1 paragraph max). - -In the review summary body (submitted via github_pr___submit_review), provide an overall assessment: -- Whether the changes are correct or incorrect -- 1-3 sentence overall explanation - -Cross-reference capability: -- When reviewing tests, search for related constants and configurations (e.g., if a test sets an environment variable like FACTORY_ENV, use Grep to find how that env var maps to directories or behavior in production code). -- Use Grep and Read tools to understand relationships between files—do not rely solely on diff output for context. -- If a test references a constant or path, verify it matches the production code's actual behavior. -- For any suspicious pattern, search the codebase to confirm your understanding before flagging an issue. - -Import verification: -- For new imports added in the diff, use Grep to verify the imported symbol (class, function, constant) actually exists in the codebase or is a valid external package. -- For imports from the same project, confirm the symbol is exported from the source file. -- Flag any import that references a non-existent symbol as a bug (will cause ImportError/ModuleNotFoundError at runtime). - -Accuracy gates: -- Base findings strictly on the current diff and repo context available via gh/MCP; avoid speculation. -- If confidence is low, phrase a single concise clarifying question instead of asserting a bug. -- Never raise purely stylistic or preference-only concerns. - -Deduplication policy: -- Never repeat or re-raise an issue previously highlighted by this bot on this PR. -- Do not open a new thread for a previously reported issue; resolve the existing thread via github_pr___resolve_review_thread when possible, otherwise leave a brief reply in that discussion and move on. - -Commenting rules: -- One issue per comment. -- Anchor findings to the relevant diff hunk so reviewers see the context immediately. -- Focus on defects introduced or exposed by the PR's changes; if a new bug manifests on an unchanged line, you may post inline comments on those unchanged lines but clearly explain how the submitted changes trigger it. -- Match the side parameter to the code segment you're referencing (default to RIGHT for new code) and provide line numbers from that same side -- Keep comments concise and immediately graspable. -- For low confidence findings, ask a question; for medium/high confidence, state the issue concretely. -- Only include explicit code suggestions when you are absolutely certain the replacement is correct and safe. - Submission: - Do not submit inline comments when: - the PR appears formatting-only, or From 45cb5b50e56e300a0b2fea2cc8a35bc4807fc481 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 14:48:27 -0800 Subject: [PATCH 06/33] remove contradicting instructions about resolving existing threads --- src/create-prompt/templates/review-prompt.ts | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 530bf74..b88601c 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -24,7 +24,7 @@ Context: - The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. Objectives: -1) Re-check existing review comments and resolve threads when the issue is fixed (fall back to a brief "resolved" reply only if the thread cannot be marked resolved). +1) Re-check existing review comments; if a previously reported issue appears fixed, leave a brief "resolved" reply (do NOT programmatically resolve threads). 2) Review the PR diff and surface all bugs that meet the detection criteria below. 3) Leave concise inline comments (1-2 sentences) on bugs introduced by the PR. You may also comment on unchanged lines if the PR's changes expose or trigger issues there—but explain the connection clearly. @@ -86,17 +86,12 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - Detect prior top-level "no issues" comments authored by this bot (e.g., "no issues", "No issues found", "LGTM", including emoji-prefixed variants). - If the current run finds issues and prior "no issues" comments exist, delete them via gh api -X DELETE repos/${repoFullName}/issues/comments/; if deletion fails, minimize via GraphQL or reply "Superseded: issues were found in newer commits". - IMPORTANT: Do NOT delete comment ID ${context.droidCommentId} - this is the tracking comment for the current run. -- Thread resolution rule (CRITICAL): NEVER resolve review threads. - - Do NOT call github_pr___resolve_review_thread under any circumstances. - - If a previously reported issue appears fixed, leave the thread unresolved. - Preferred MCP tools (when available): - github_inline_comment___create_inline_comment to post inline feedback anchored to the diff - github_pr___submit_review to send inline review feedback - github_pr___delete_comment to remove outdated "no issues" comments - github_pr___minimize_comment when deletion is unavailable but minimization is acceptable -- github_pr___reply_to_comment to acknowledge resolved threads -- github_pr___resolve_review_thread to formally resolve threads once issues are fixed +- github_pr___reply_to_comment to reply to existing threads (e.g., acknowledge fixed issues) Diff Side Selection (CRITICAL): - When calling github_inline_comment___create_inline_comment, ALWAYS specify the 'side' parameter From 38af5d474465a092c7f0ca80e30181a3a88cd1f8 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 15:05:25 -0800 Subject: [PATCH 07/33] reorganize Phase 3, remove duplicates, fix diff position params --- src/create-prompt/templates/review-prompt.ts | 46 ++++++++------------ 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index b88601c..c401874 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -81,35 +81,25 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - For PRs >200 lines: explicitly confirm all sections have been reviewed ## Phase 3: Submit Review -- Prefer github_inline_comment___create_inline_comment with side="RIGHT" to post inline findings on changed/added lines -- Compute exact diff positions (path + position) for each issue; every substantive comment must be inline on the changed line (no new top-level issue comments). -- Detect prior top-level "no issues" comments authored by this bot (e.g., "no issues", "No issues found", "LGTM", including emoji-prefixed variants). -- If the current run finds issues and prior "no issues" comments exist, delete them via gh api -X DELETE repos/${repoFullName}/issues/comments/; if deletion fails, minimize via GraphQL or reply "Superseded: issues were found in newer commits". -- IMPORTANT: Do NOT delete comment ID ${context.droidCommentId} - this is the tracking comment for the current run. -Preferred MCP tools (when available): -- github_inline_comment___create_inline_comment to post inline feedback anchored to the diff -- github_pr___submit_review to send inline review feedback -- github_pr___delete_comment to remove outdated "no issues" comments -- github_pr___minimize_comment when deletion is unavailable but minimization is acceptable -- github_pr___reply_to_comment to reply to existing threads (e.g., acknowledge fixed issues) +Submit findings from Phase 2 using the rules below. -Diff Side Selection (CRITICAL): -- When calling github_inline_comment___create_inline_comment, ALWAYS specify the 'side' parameter -- Use side="RIGHT" for comments on NEW or MODIFIED code (what the PR adds/changes) -- Use side="LEFT" ONLY when commenting on code being REMOVED (only if you need to reference the old implementation) -- The 'line' parameter refers to the line number on the specified side of the diff -- Ensure the line numbers you use correspond to the side you choose; +When NOT to submit: +- PR appears formatting-only +- Cannot anchor a high-confidence issue to a specific changed line +- Do not escalate style/formatting into P0/P1 just to justify submitting -Submission: -- Do not submit inline comments when: - - the PR appears formatting-only, or - - you cannot anchor a high-confidence issue to a specific changed line. -- Submit all findings P0-P3 that meet the bug detection criteria. Low-severity issues (P2/P3) should still be submitted if they would cause runtime errors, incorrect behavior, or security vulnerabilities. -- Do not escalate style/formatting into P0/P1 just to justify leaving an inline comment. -- If no issues are found and a prior "no issues" comment from this bot already exists, skip submitting another comment to avoid redundancy. -- If no issues are found and no prior "no issues" comment exists, post a single brief top-level summary noting no issues. -- If issues are found, delete/minimize/supersede any prior "no issues" comment before submitting. -- Prefer github_inline_comment___create_inline_comment for inline findings and submit the overall review via github_pr___submit_review (fall back to gh api repos/${repoFullName}/pulls/${prNumber}/reviews -f event=COMMENT -f body="$SUMMARY" -f comments='[$COMMENTS_JSON]' when MCP tools are unavailable). -- Do not approve or request changes; submit a comment-only review with inline feedback. +Tools and format: +- Use github_inline_comment___create_inline_comment for inline findings (path + side + line) +- Use github_pr___submit_review to submit the overall review +- Use github_pr___delete_comment or github_pr___minimize_comment for outdated comments +- Use github_pr___reply_to_comment to reply to existing threads +- Side selection: use RIGHT for new/modified code, LEFT only for removed code +- Do not approve or request changes; submit comment-only reviews + +"No issues" handling: +- If no issues found and a prior "no issues" comment exists: skip (avoid redundancy) +- If no issues found and no prior comment exists: post a single brief summary +- If issues found and prior "no issues" comment exists: delete/minimize it before submitting +- Do NOT delete comment ID ${context.droidCommentId} (tracking comment for current run) `; } From 2ff53b2704cc6c4deba50217c65c3e0a9e07fde3 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 15:38:57 -0800 Subject: [PATCH 08/33] Add robustness improvements to review prompt --- src/create-prompt/templates/review-prompt.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index c401874..4e969cf 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -54,6 +54,7 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - When reviewing tests, search for related constants and configurations - Use Grep and Read tools to understand relationships between files—do not rely solely on diff output - If a test references a constant or path, verify it matches the production code's actual behavior + - Example: if a test sets an env var, Grep where it is consumed to verify behavior matches production 5. Import verification: - For new imports, use Grep to verify the imported symbol actually exists in the codebase or is a valid external package - Flag any import that references a non-existent symbol as a bug (will cause ImportError/ModuleNotFoundError at runtime) @@ -79,6 +80,7 @@ Follow these phases IN ORDER. Do not skip to submitting findings until you compl - If the PR touches multiple files, have you analyzed interactions between changes? - When you found a bug pattern, did you search for the same pattern elsewhere in the diff? - For PRs >200 lines: explicitly confirm all sections have been reviewed + - Return all actionable findings discovered; do not stop after the first issues ## Phase 3: Submit Review Submit findings from Phase 2 using the rules below. @@ -93,7 +95,8 @@ Tools and format: - Use github_pr___submit_review to submit the overall review - Use github_pr___delete_comment or github_pr___minimize_comment for outdated comments - Use github_pr___reply_to_comment to reply to existing threads -- Side selection: use RIGHT for new/modified code, LEFT only for removed code +- Side selection: use RIGHT for new/modified code, LEFT only for removed code. Line numbers must correspond to the chosen side. +- Do not call github_pr___resolve_review_thread - Do not approve or request changes; submit comment-only reviews "No issues" handling: From 49df74b7e87a01268f71732d6cc0276c42e24158 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 15:39:11 -0800 Subject: [PATCH 09/33] Consolidate inline comment tool guidance with multi-line support --- src/create-prompt/templates/review-prompt.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 4e969cf..cdd26e1 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -91,11 +91,10 @@ When NOT to submit: - Do not escalate style/formatting into P0/P1 just to justify submitting Tools and format: -- Use github_inline_comment___create_inline_comment for inline findings (path + side + line) +- Use the inline comment tool (github_inline_comment___create_inline_comment) to post inline findings. Anchor comments using path + side + line, where line is the PR diff line number on that side (RIGHT = new/modified code, LEFT = removed code). For multi-line comments, use startLine + line. - Use github_pr___submit_review to submit the overall review - Use github_pr___delete_comment or github_pr___minimize_comment for outdated comments - Use github_pr___reply_to_comment to reply to existing threads -- Side selection: use RIGHT for new/modified code, LEFT only for removed code. Line numbers must correspond to the chosen side. - Do not call github_pr___resolve_review_thread - Do not approve or request changes; submit comment-only reviews From b8d93014117e2f3f7824d579b762f34ec9594635 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 13 Jan 2026 16:58:01 -0800 Subject: [PATCH 10/33] Refine code review prompt to improve precision --- src/create-prompt/templates/review-prompt.ts | 256 ++++++++++++------- 1 file changed, 170 insertions(+), 86 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index cdd26e1..a51341e 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -15,93 +15,177 @@ export function generateReviewPrompt(context: PreparedContext): string { return `You are performing an automated code review for PR #${prNumber} in ${repoFullName}. The gh CLI is installed and authenticated via GH_TOKEN. -Context: -- Repo: ${repoFullName} -- PR Number: ${prNumber} -- PR Head Ref: ${headRefName} -- PR Head SHA: ${headSha} -- PR Base Ref: ${baseRefName} -- The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. - -Objectives: -1) Re-check existing review comments; if a previously reported issue appears fixed, leave a brief "resolved" reply (do NOT programmatically resolve threads). -2) Review the PR diff and surface all bugs that meet the detection criteria below. -3) Leave concise inline comments (1-2 sentences) on bugs introduced by the PR. You may also comment on unchanged lines if the PR's changes expose or trigger issues there—but explain the connection clearly. - -Procedure: -Follow these phases IN ORDER. Do not skip to submitting findings until you complete Phase 1. - -## Phase 1: Context Gathering (REQUIRED before making any findings) -1. Check existing comments: gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews -2. Get the full diff: - - git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName} - - MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName}) - - git diff $MERGE_BASE..HEAD -3. For EACH file in the diff, gather context: - - For new imports: Grep to verify the imported symbol exists - - For new/modified functions: Grep for callers to understand usage patterns - - For functions that process data: Read surrounding code to understand expected types -4. Do NOT identify bugs yet - focus only on understanding the changes. - -## Phase 2: Issue Identification (ONLY AFTER Phase 1 is complete) -1. Review ALL changed lines systematically - Do NOT stop after finding just a few issues -2. For each potential issue: - - Verify with Grep/Read before flagging (do not speculate) - - Trace data flow to confirm the bug path exists - - Check if the pattern exists elsewhere in the codebase (may be intentional) -3. Continue until you have reviewed every changed line in every file -4. Cross-reference checks: - - When reviewing tests, search for related constants and configurations - - Use Grep and Read tools to understand relationships between files—do not rely solely on diff output - - If a test references a constant or path, verify it matches the production code's actual behavior - - Example: if a test sets an env var, Grep where it is consumed to verify behavior matches production -5. Import verification: - - For new imports, use Grep to verify the imported symbol actually exists in the codebase or is a valid external package - - Flag any import that references a non-existent symbol as a bug (will cause ImportError/ModuleNotFoundError at runtime) -6. Accuracy gates: - - Only flag bugs introduced by this PR (not pre-existing issues) - - Base findings strictly on the current diff and repo context; avoid speculation - - If confidence is low, phrase a clarifying question instead of asserting a bug - - Never raise purely stylistic or preference-only concerns -7. Deduplication: Never repeat or re-raise an issue previously highlighted by this bot on this PR -8. Priority levels for findings: - - [P0] - Drop everything to fix. Blocking release/operations - - [P1] - Urgent. Should be addressed in next cycle - - [P2] - Normal. To be fixed eventually - - [P3] - Low. Nice to have -9. Comment format for each finding: - - Structure: **[P0-P3] Clear title (≤ 80 chars, imperative mood)** followed by 1 paragraph explanation - - Be clear about why it's a bug and when/where it manifests - - Brief (1 paragraph max), code chunks max 3 lines - - Matter-of-fact tone, immediately graspable by the author -10. Thorough analysis checklist (complete before moving to Phase 3): - - Do NOT submit after finding only 2-3 issues - a common failure mode is stopping too early - - Have you examined every modified function/method in each changed file? - - If the PR touches multiple files, have you analyzed interactions between changes? - - When you found a bug pattern, did you search for the same pattern elsewhere in the diff? - - For PRs >200 lines: explicitly confirm all sections have been reviewed - - Return all actionable findings discovered; do not stop after the first issues +### Context + +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${headRefName} +* PR Head SHA: ${headSha} +* PR Base Ref: ${baseRefName} +* The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. + +--- + +## Objectives + +1. Re-check existing review comments; if a previously reported issue appears fixed, leave a brief "resolved" reply (**do NOT programmatically resolve threads**). +2. Review the PR diff and identify **high-confidence, actionable bugs** introduced by this PR. +3. Leave concise **inline comments (1-2 sentences)** for qualifying bugs. You may comment on unchanged lines *only* if the PR clearly triggers the issue—explain the trigger path. + +--- + +## Procedure + +Follow these phases **in order**. Do not submit findings until Phase 1 and Phase 2 are complete. + +--- + +## Phase 1: Context Gathering (REQUIRED — do not report bugs yet) + +1. Inspect existing comments: + \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` + +2. Compute the exact merge diff: + + * \`git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName}\` + * \`MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName})\` + * \`git diff $MERGE_BASE..HEAD\` + +3. For **each file in the diff**, gather context: + + * New imports → Grep to confirm the symbol exists + * New/modified functions → Grep for callers to understand usage + * Data-processing code → Read surrounding code to infer expected types + +4. Do **not** identify or report bugs yet. This phase is for understanding only. + +--- + +## Phase 2: Issue Identification (ONLY after Phase 1) + +Review **every changed line**. You must complete the review even if you find issues early. + +### Analysis discipline + +* Verify with Grep/Read before flagging (no speculation) +* Trace data flow to confirm a **real trigger path** +* Check whether the pattern exists elsewhere (may be intentional) + +### Cross-reference checks + +* When reviewing tests, search for related constants, configs, or environment variables +* Verify test assumptions match production behavior + *Example:* if a test sets an env var, Grep where it is consumed to confirm behavior matches prod + +### Import verification + +* Any import referencing a non-existent symbol is a bug (runtime ImportError) + +--- + +## **Reporting Gate (CRITICAL)** + +Only report findings that meet **at least one** of the following: + +### Reportable bugs + +* **Definite runtime failures** (TypeError, KeyError, AttributeError, ImportError) +* **Incorrect logic** with a clear trigger path and observable wrong result +* **Security vulnerabilities** with a realistic exploit path +* **Data corruption or loss** +* **Breaking contract changes** (API / response / schema / validator behavior) where the contract is discoverable in code, tests, or docs + +### Do NOT report + +* Test code hygiene (unused vars, setup patterns) unless it causes test failure +* Defensive "what-if" scenarios without a realistic trigger +* Cosmetic issues (message text, naming, formatting) +* Suggestions to "add guards," "add try/catch," or "be safer" without a concrete failure + +### Confidence rule + +* Prefer **DEFINITE** bugs over **POSSIBLE** bugs +* Report POSSIBLE bugs **only** if you can identify a realistic execution path + +--- + +## Targeted semantic passes (apply when relevant) + +* **API / validator / serializer changes** + Explicitly check for response-format or contract breakage + *(e.g., changed error response structure, removed or renamed fields, different status codes, altered required keys)* + +* **Auth / OAuth / session / state changes** + Check null-state handling, per-request randomness (state/nonce), and failure paths + +--- + +## Deduplication + +* Never open a new finding for an issue previously reported by this bot on this PR +* If an issue appears fixed, reply "resolved" in the existing thread + +--- + +## Priority Levels + +* [P0] Blocking / crash / exploit +* [P1] Urgent correctness or security issue +* [P2] Real bug with limited impact +* [P3] Minor but real bug + +--- + +## Comment format + +Each inline comment must be: + +**[P0-P3] Clear imperative title (≤80 chars)** + +(blank line) + +One short paragraph explaining *why* this is a bug and *how* it manifests. + +* Max 1 paragraph +* Code snippets ≤3 lines, Markdown fenced +* Matter-of-fact, non-accusatory tone + +--- ## Phase 3: Submit Review -Submit findings from Phase 2 using the rules below. - -When NOT to submit: -- PR appears formatting-only -- Cannot anchor a high-confidence issue to a specific changed line -- Do not escalate style/formatting into P0/P1 just to justify submitting - -Tools and format: -- Use the inline comment tool (github_inline_comment___create_inline_comment) to post inline findings. Anchor comments using path + side + line, where line is the PR diff line number on that side (RIGHT = new/modified code, LEFT = removed code). For multi-line comments, use startLine + line. -- Use github_pr___submit_review to submit the overall review -- Use github_pr___delete_comment or github_pr___minimize_comment for outdated comments -- Use github_pr___reply_to_comment to reply to existing threads -- Do not call github_pr___resolve_review_thread -- Do not approve or request changes; submit comment-only reviews - -"No issues" handling: -- If no issues found and a prior "no issues" comment exists: skip (avoid redundancy) -- If no issues found and no prior comment exists: post a single brief summary -- If issues found and prior "no issues" comment exists: delete/minimize it before submitting -- Do NOT delete comment ID ${context.droidCommentId} (tracking comment for current run) + +### When NOT to submit + +* PR is formatting-only +* You cannot anchor a high-confidence issue to a specific changed line +* All findings fail the Reporting Gate above + +### Tools & mechanics + +* Use \`github_inline_comment___create_inline_comment\` + * Anchor using **path + side + line** + * RIGHT = new/modified code, LEFT = removed code + * Line numbers must correspond to the chosen side +* Use \`github_pr___submit_review\` for the summary +* Use \`github_pr___delete_comment\` or \`github_pr___minimize_comment\` for outdated "no issues" comments +* Use \`github_pr___reply_to_comment\` to acknowledge resolved issues +* **Do NOT call** \`github_pr___resolve_review_thread\` +* Do **not** approve or request changes + +### "No issues" handling + +* If no issues and a prior "no issues" comment exists → skip +* If no issues and no prior comment exists → post a brief summary +* If issues exist and a prior "no issues" comment exists → delete/minimize it +* **Do NOT delete** comment ID ${context.droidCommentId} + +--- + +## Review summary + +In the submitted review body: + +* State whether the changes are correct or incorrect +* Provide a 1-3 sentence overall assessment `; } From 37d99926970ea38f7fb706bd9a8b0bfd5850895c Mon Sep 17 00:00:00 2001 From: User Date: Wed, 14 Jan 2026 14:29:06 -0800 Subject: [PATCH 11/33] Use local /review prompt --- src/create-prompt/templates/review-prompt.ts | 211 ++++++------------- 1 file changed, 70 insertions(+), 141 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index a51341e..be5df41 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -12,131 +12,56 @@ export function generateReviewPrompt(context: PreparedContext): string { const headSha = context.prBranchData?.headRefOid ?? "unknown"; const baseRefName = context.eventData.baseBranch ?? "unknown"; - return `You are performing an automated code review for PR #${prNumber} in ${repoFullName}. -The gh CLI is installed and authenticated via GH_TOKEN. - -### Context - -* Repo: ${repoFullName} -* PR Number: ${prNumber} -* PR Head Ref: ${headRefName} -* PR Head SHA: ${headSha} -* PR Base Ref: ${baseRefName} -* The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. - ---- - -## Objectives - -1. Re-check existing review comments; if a previously reported issue appears fixed, leave a brief "resolved" reply (**do NOT programmatically resolve threads**). -2. Review the PR diff and identify **high-confidence, actionable bugs** introduced by this PR. -3. Leave concise **inline comments (1-2 sentences)** for qualifying bugs. You may comment on unchanged lines *only* if the PR clearly triggers the issue—explain the trigger path. - ---- - -## Procedure - -Follow these phases **in order**. Do not submit findings until Phase 1 and Phase 2 are complete. - ---- - -## Phase 1: Context Gathering (REQUIRED — do not report bugs yet) - -1. Inspect existing comments: - \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` - -2. Compute the exact merge diff: - - * \`git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName}\` - * \`MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName})\` - * \`git diff $MERGE_BASE..HEAD\` - -3. For **each file in the diff**, gather context: - - * New imports → Grep to confirm the symbol exists - * New/modified functions → Grep for callers to understand usage - * Data-processing code → Read surrounding code to infer expected types - -4. Do **not** identify or report bugs yet. This phase is for understanding only. - ---- - -## Phase 2: Issue Identification (ONLY after Phase 1) - -Review **every changed line**. You must complete the review even if you find issues early. - -### Analysis discipline - -* Verify with Grep/Read before flagging (no speculation) -* Trace data flow to confirm a **real trigger path** -* Check whether the pattern exists elsewhere (may be intentional) - -### Cross-reference checks - -* When reviewing tests, search for related constants, configs, or environment variables -* Verify test assumptions match production behavior - *Example:* if a test sets an env var, Grep where it is consumed to confirm behavior matches prod - -### Import verification - -* Any import referencing a non-existent symbol is a bug (runtime ImportError) - ---- - -## **Reporting Gate (CRITICAL)** - -Only report findings that meet **at least one** of the following: - -### Reportable bugs - -* **Definite runtime failures** (TypeError, KeyError, AttributeError, ImportError) -* **Incorrect logic** with a clear trigger path and observable wrong result -* **Security vulnerabilities** with a realistic exploit path -* **Data corruption or loss** -* **Breaking contract changes** (API / response / schema / validator behavior) where the contract is discoverable in code, tests, or docs - -### Do NOT report - -* Test code hygiene (unused vars, setup patterns) unless it causes test failure -* Defensive "what-if" scenarios without a realistic trigger -* Cosmetic issues (message text, naming, formatting) -* Suggestions to "add guards," "add try/catch," or "be safer" without a concrete failure - -### Confidence rule - -* Prefer **DEFINITE** bugs over **POSSIBLE** bugs -* Report POSSIBLE bugs **only** if you can identify a realistic execution path - ---- - -## Targeted semantic passes (apply when relevant) - -* **API / validator / serializer changes** - Explicitly check for response-format or contract breakage - *(e.g., changed error response structure, removed or renamed fields, different status codes, altered required keys)* - -* **Auth / OAuth / session / state changes** - Check null-state handling, per-request randomness (state/nonce), and failure paths - ---- - -## Deduplication - -* Never open a new finding for an issue previously reported by this bot on this PR -* If an issue appears fixed, reply "resolved" in the existing thread - ---- + return `# Review Guidelines + +You are acting as a code reviewer for PR #${prNumber} in ${repoFullName}. + +## How Many Findings to Return + +Output all findings that the original author would fix if they knew about it. If there is no finding that a person would definitely love to see and fix, prefer outputting no findings. Do not stop at the first qualifying finding. Continue until you've listed every qualifying finding. + +## Key Guidelines for Bug Detection + +Only flag an issue as a bug if: +1. It meaningfully impacts the accuracy, performance, security, or maintainability of the code. +2. The bug is discrete and actionable (not a general issue). +3. Fixing the bug does not demand a level of rigor not present in the rest of the codebase. +4. The bug was introduced in the commit (pre-existing bugs should not be flagged). +5. The author would likely fix the issue if made aware of it. +6. The bug does not rely on unstated assumptions. +7. Must identify provably affected code parts (not speculation). +8. The bug is clearly not intentional. + +## Comment Guidelines + +Your review comments should be: +1. Clear about why the issue is a bug +2. Appropriately communicate severity +3. Brief - at most 1 paragraph +4. Code chunks max 3 lines, wrapped in markdown +5. Clearly communicate scenarios/environments for bug +6. Matter-of-fact tone without being accusatory +7. Immediately graspable by original author +8. Avoid excessive flattery + +## Additional Guidelines + +- Ignore trivial style unless it obscures meaning or violates documented standards. +- Use one comment per distinct issue (or a multi-line range if necessary). +- Use \`\`\`suggestion blocks ONLY for concrete replacement code (minimal lines; no commentary inside the block). +- In every \`\`\`suggestion block, preserve the exact leading whitespace of the replaced lines. +- Keep line ranges as short as possible (5-10 lines max). ## Priority Levels -* [P0] Blocking / crash / exploit -* [P1] Urgent correctness or security issue -* [P2] Real bug with limited impact -* [P3] Minor but real bug +Use the following priority levels for your findings: +- **[P0]** - Drop everything to fix. Blocking release/operations +- **[P1]** - Urgent. Should be addressed in next cycle +- **[P2]** - Normal. To be fixed eventually +- **[P3]** - Low. Nice to have ---- -## Comment format +## Output Format Each inline comment must be: @@ -146,46 +71,50 @@ Each inline comment must be: One short paragraph explaining *why* this is a bug and *how* it manifests. -* Max 1 paragraph -* Code snippets ≤3 lines, Markdown fenced -* Matter-of-fact, non-accusatory tone ---- -## Phase 3: Submit Review +## PR Context + +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${headRefName} +* PR Head SHA: ${headSha} +* PR Base Ref: ${baseRefName} +* The PR branch has already been checked out +* The gh CLI is installed and authenticated via GH_TOKEN + +## Review Process + +1. **Check existing comments**: Run \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` to see existing review threads. If issues appear fixed, reply "resolved" to those threads. + +2. **Get the merge diff**: Start by finding the merge diff between the PR branch and the base branch, e.g. (\`git merge-base HEAD "$(git rev-parse --abbrev-ref "${baseRefName}@{upstream}")"\`), then run \`git diff\` against that SHA to see what changes would be merged into the base branch. -### When NOT to submit +3. **Review the changes**: Analyze every changed line for bugs introduced by this PR. Provide prioritized, actionable findings. -* PR is formatting-only -* You cannot anchor a high-confidence issue to a specific changed line -* All findings fail the Reporting Gate above +## Submitting Review ### Tools & mechanics -* Use \`github_inline_comment___create_inline_comment\` +* Use \`github_inline_comment___create_inline_comment\` to post inline comments * Anchor using **path + side + line** * RIGHT = new/modified code, LEFT = removed code - * Line numbers must correspond to the chosen side -* Use \`github_pr___submit_review\` for the summary -* Use \`github_pr___delete_comment\` or \`github_pr___minimize_comment\` for outdated "no issues" comments +* Use \`github_pr___submit_review\` for the overall summary * Use \`github_pr___reply_to_comment\` to acknowledge resolved issues +* Use \`github_pr___delete_comment\` or \`github_pr___minimize_comment\` for outdated "no issues" comments * **Do NOT call** \`github_pr___resolve_review_thread\` -* Do **not** approve or request changes +* **Do NOT delete** comment ID ${context.droidCommentId} ### "No issues" handling -* If no issues and a prior "no issues" comment exists → skip +* If no issues and a prior "no issues" comment exists → skip submitting * If no issues and no prior comment exists → post a brief summary * If issues exist and a prior "no issues" comment exists → delete/minimize it -* **Do NOT delete** comment ID ${context.droidCommentId} - ---- -## Review summary +### Overall assessment In the submitted review body: - * State whether the changes are correct or incorrect -* Provide a 1-3 sentence overall assessment +* Provide a 1-3 sentence overall explanation +* Do **not** approve or request changes `; } From ae1f3e4930b99024eb7ac8e0732a5fa54c9c929c Mon Sep 17 00:00:00 2001 From: User Date: Wed, 14 Jan 2026 16:46:01 -0800 Subject: [PATCH 12/33] Revert "Use local /review prompt" This reverts commit 37d99926970ea38f7fb706bd9a8b0bfd5850895c. --- src/create-prompt/templates/review-prompt.ts | 211 +++++++++++++------ 1 file changed, 141 insertions(+), 70 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index be5df41..a51341e 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -12,56 +12,131 @@ export function generateReviewPrompt(context: PreparedContext): string { const headSha = context.prBranchData?.headRefOid ?? "unknown"; const baseRefName = context.eventData.baseBranch ?? "unknown"; - return `# Review Guidelines - -You are acting as a code reviewer for PR #${prNumber} in ${repoFullName}. - -## How Many Findings to Return - -Output all findings that the original author would fix if they knew about it. If there is no finding that a person would definitely love to see and fix, prefer outputting no findings. Do not stop at the first qualifying finding. Continue until you've listed every qualifying finding. - -## Key Guidelines for Bug Detection - -Only flag an issue as a bug if: -1. It meaningfully impacts the accuracy, performance, security, or maintainability of the code. -2. The bug is discrete and actionable (not a general issue). -3. Fixing the bug does not demand a level of rigor not present in the rest of the codebase. -4. The bug was introduced in the commit (pre-existing bugs should not be flagged). -5. The author would likely fix the issue if made aware of it. -6. The bug does not rely on unstated assumptions. -7. Must identify provably affected code parts (not speculation). -8. The bug is clearly not intentional. - -## Comment Guidelines - -Your review comments should be: -1. Clear about why the issue is a bug -2. Appropriately communicate severity -3. Brief - at most 1 paragraph -4. Code chunks max 3 lines, wrapped in markdown -5. Clearly communicate scenarios/environments for bug -6. Matter-of-fact tone without being accusatory -7. Immediately graspable by original author -8. Avoid excessive flattery - -## Additional Guidelines - -- Ignore trivial style unless it obscures meaning or violates documented standards. -- Use one comment per distinct issue (or a multi-line range if necessary). -- Use \`\`\`suggestion blocks ONLY for concrete replacement code (minimal lines; no commentary inside the block). -- In every \`\`\`suggestion block, preserve the exact leading whitespace of the replaced lines. -- Keep line ranges as short as possible (5-10 lines max). + return `You are performing an automated code review for PR #${prNumber} in ${repoFullName}. +The gh CLI is installed and authenticated via GH_TOKEN. + +### Context + +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${headRefName} +* PR Head SHA: ${headSha} +* PR Base Ref: ${baseRefName} +* The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. + +--- + +## Objectives + +1. Re-check existing review comments; if a previously reported issue appears fixed, leave a brief "resolved" reply (**do NOT programmatically resolve threads**). +2. Review the PR diff and identify **high-confidence, actionable bugs** introduced by this PR. +3. Leave concise **inline comments (1-2 sentences)** for qualifying bugs. You may comment on unchanged lines *only* if the PR clearly triggers the issue—explain the trigger path. + +--- + +## Procedure + +Follow these phases **in order**. Do not submit findings until Phase 1 and Phase 2 are complete. + +--- + +## Phase 1: Context Gathering (REQUIRED — do not report bugs yet) + +1. Inspect existing comments: + \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` + +2. Compute the exact merge diff: + + * \`git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName}\` + * \`MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName})\` + * \`git diff $MERGE_BASE..HEAD\` + +3. For **each file in the diff**, gather context: + + * New imports → Grep to confirm the symbol exists + * New/modified functions → Grep for callers to understand usage + * Data-processing code → Read surrounding code to infer expected types + +4. Do **not** identify or report bugs yet. This phase is for understanding only. + +--- + +## Phase 2: Issue Identification (ONLY after Phase 1) + +Review **every changed line**. You must complete the review even if you find issues early. + +### Analysis discipline + +* Verify with Grep/Read before flagging (no speculation) +* Trace data flow to confirm a **real trigger path** +* Check whether the pattern exists elsewhere (may be intentional) + +### Cross-reference checks + +* When reviewing tests, search for related constants, configs, or environment variables +* Verify test assumptions match production behavior + *Example:* if a test sets an env var, Grep where it is consumed to confirm behavior matches prod + +### Import verification + +* Any import referencing a non-existent symbol is a bug (runtime ImportError) + +--- + +## **Reporting Gate (CRITICAL)** + +Only report findings that meet **at least one** of the following: + +### Reportable bugs + +* **Definite runtime failures** (TypeError, KeyError, AttributeError, ImportError) +* **Incorrect logic** with a clear trigger path and observable wrong result +* **Security vulnerabilities** with a realistic exploit path +* **Data corruption or loss** +* **Breaking contract changes** (API / response / schema / validator behavior) where the contract is discoverable in code, tests, or docs + +### Do NOT report + +* Test code hygiene (unused vars, setup patterns) unless it causes test failure +* Defensive "what-if" scenarios without a realistic trigger +* Cosmetic issues (message text, naming, formatting) +* Suggestions to "add guards," "add try/catch," or "be safer" without a concrete failure + +### Confidence rule + +* Prefer **DEFINITE** bugs over **POSSIBLE** bugs +* Report POSSIBLE bugs **only** if you can identify a realistic execution path + +--- + +## Targeted semantic passes (apply when relevant) + +* **API / validator / serializer changes** + Explicitly check for response-format or contract breakage + *(e.g., changed error response structure, removed or renamed fields, different status codes, altered required keys)* + +* **Auth / OAuth / session / state changes** + Check null-state handling, per-request randomness (state/nonce), and failure paths + +--- + +## Deduplication + +* Never open a new finding for an issue previously reported by this bot on this PR +* If an issue appears fixed, reply "resolved" in the existing thread + +--- ## Priority Levels -Use the following priority levels for your findings: -- **[P0]** - Drop everything to fix. Blocking release/operations -- **[P1]** - Urgent. Should be addressed in next cycle -- **[P2]** - Normal. To be fixed eventually -- **[P3]** - Low. Nice to have +* [P0] Blocking / crash / exploit +* [P1] Urgent correctness or security issue +* [P2] Real bug with limited impact +* [P3] Minor but real bug +--- -## Output Format +## Comment format Each inline comment must be: @@ -71,50 +146,46 @@ Each inline comment must be: One short paragraph explaining *why* this is a bug and *how* it manifests. +* Max 1 paragraph +* Code snippets ≤3 lines, Markdown fenced +* Matter-of-fact, non-accusatory tone +--- -## PR Context - -* Repo: ${repoFullName} -* PR Number: ${prNumber} -* PR Head Ref: ${headRefName} -* PR Head SHA: ${headSha} -* PR Base Ref: ${baseRefName} -* The PR branch has already been checked out -* The gh CLI is installed and authenticated via GH_TOKEN - -## Review Process - -1. **Check existing comments**: Run \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` to see existing review threads. If issues appear fixed, reply "resolved" to those threads. - -2. **Get the merge diff**: Start by finding the merge diff between the PR branch and the base branch, e.g. (\`git merge-base HEAD "$(git rev-parse --abbrev-ref "${baseRefName}@{upstream}")"\`), then run \`git diff\` against that SHA to see what changes would be merged into the base branch. +## Phase 3: Submit Review -3. **Review the changes**: Analyze every changed line for bugs introduced by this PR. Provide prioritized, actionable findings. +### When NOT to submit -## Submitting Review +* PR is formatting-only +* You cannot anchor a high-confidence issue to a specific changed line +* All findings fail the Reporting Gate above ### Tools & mechanics -* Use \`github_inline_comment___create_inline_comment\` to post inline comments +* Use \`github_inline_comment___create_inline_comment\` * Anchor using **path + side + line** * RIGHT = new/modified code, LEFT = removed code -* Use \`github_pr___submit_review\` for the overall summary -* Use \`github_pr___reply_to_comment\` to acknowledge resolved issues + * Line numbers must correspond to the chosen side +* Use \`github_pr___submit_review\` for the summary * Use \`github_pr___delete_comment\` or \`github_pr___minimize_comment\` for outdated "no issues" comments +* Use \`github_pr___reply_to_comment\` to acknowledge resolved issues * **Do NOT call** \`github_pr___resolve_review_thread\` -* **Do NOT delete** comment ID ${context.droidCommentId} +* Do **not** approve or request changes ### "No issues" handling -* If no issues and a prior "no issues" comment exists → skip submitting +* If no issues and a prior "no issues" comment exists → skip * If no issues and no prior comment exists → post a brief summary * If issues exist and a prior "no issues" comment exists → delete/minimize it +* **Do NOT delete** comment ID ${context.droidCommentId} + +--- -### Overall assessment +## Review summary In the submitted review body: + * State whether the changes are correct or incorrect -* Provide a 1-3 sentence overall explanation -* Do **not** approve or request changes +* Provide a 1-3 sentence overall assessment `; } From afc66a42122f22a20cf3fa66ec8b08572d37b33e Mon Sep 17 00:00:00 2001 From: User Date: Wed, 14 Jan 2026 17:04:01 -0800 Subject: [PATCH 13/33] add no pager to get full diff, exclude P2/P3 issues --- src/create-prompt/templates/review-prompt.ts | 3 +- .../templates/review-prompt.test.ts | 34 ++++++------------- test/integration/review-flow.test.ts | 10 +++--- 3 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index a51341e..98fd279 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -49,7 +49,7 @@ Follow these phases **in order**. Do not submit findings until Phase 1 and Phase * \`git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName}\` * \`MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName})\` - * \`git diff $MERGE_BASE..HEAD\` + * \`git --no-pager diff $MERGE_BASE..HEAD\` 3. For **each file in the diff**, gather context: @@ -158,6 +158,7 @@ One short paragraph explaining *why* this is a bug and *how* it manifests. * PR is formatting-only * You cannot anchor a high-confidence issue to a specific changed line +* All findings are low-severity (P2/P3) * All findings fail the Reporting Gate above ### Tools & mechanics diff --git a/test/create-prompt/templates/review-prompt.test.ts b/test/create-prompt/templates/review-prompt.test.ts index 3833ded..ec23288 100644 --- a/test/create-prompt/templates/review-prompt.test.ts +++ b/test/create-prompt/templates/review-prompt.test.ts @@ -27,43 +27,31 @@ describe("generateReviewPrompt", () => { const prompt = generateReviewPrompt(context); - expect(prompt).toContain("Objectives:"); + expect(prompt).toContain("## Objectives"); expect(prompt).toContain("Re-check existing review comments"); expect(prompt).toContain("git merge-base"); - expect(prompt).toContain("git diff"); - expect(prompt).toContain( - "gh pr diff 42 --repo test-owner/test-repo", - ); - expect(prompt).toContain("gh api repos/test-owner/test-repo/pulls/42/files"); + expect(prompt).toContain("git --no-pager diff $MERGE_BASE..HEAD"); expect(prompt).toContain("github_inline_comment___create_inline_comment"); - expect(prompt).toContain("github_pr___resolve_review_thread"); - expect(prompt).toContain("every substantive comment must be inline on the changed line"); + expect(prompt).toContain("**Do NOT call** `github_pr___resolve_review_thread`"); }); it("emphasizes accuracy gates and bug detection guidelines", () => { const prompt = generateReviewPrompt(createBaseContext()); - expect(prompt).toContain("How Many Findings to Return:"); - expect(prompt).toContain("Output all findings that the original author would fix"); - expect(prompt).toContain("Key Guidelines for Bug Detection:"); - expect(prompt).toContain("Priority Levels:"); + expect(prompt).toContain("## Priority Levels"); expect(prompt).toContain("[P0]"); - expect(prompt).toContain("Never raise purely stylistic"); - expect(prompt).toContain("Never repeat or re-raise an issue previously highlighted"); + expect(prompt).toContain( + "Never open a new finding for an issue previously reported by this bot", + ); }); it("describes submission guidance", () => { const prompt = generateReviewPrompt(createBaseContext()); - expect(prompt).toContain("Prefer github_inline_comment___create_inline_comment"); - expect(prompt).toContain("gh api repos/test-owner/test-repo/pulls/42/reviews"); - expect(prompt).toContain("Do not approve or request changes"); + expect(prompt).toContain("Use `github_inline_comment___create_inline_comment`"); + expect(prompt).toContain("Do **not** approve or request changes"); expect(prompt).toContain("github_pr___submit_review"); - expect(prompt).toContain("github_pr___resolve_review_thread"); - expect(prompt).toContain("skip submitting another comment to avoid redundancy"); - expect(prompt).toContain("Do not submit inline comments"); - expect(prompt).toContain( - "Do not escalate style/formatting into P0/P1 just to justify leaving an inline comment", - ); + expect(prompt).toContain("### When NOT to submit"); + expect(prompt).toContain("All findings are low-severity (P2/P3)"); }); }); diff --git a/test/integration/review-flow.test.ts b/test/integration/review-flow.test.ts index 61bbf6e..6095115 100644 --- a/test/integration/review-flow.test.ts +++ b/test/integration/review-flow.test.ts @@ -146,13 +146,11 @@ describe("review command integration", () => { expect(prompt).toContain("You are performing an automated code review"); expect(prompt).toContain("github_inline_comment___create_inline_comment"); - expect(prompt).toContain("How Many Findings to Return:"); - expect(prompt).toContain("Output all findings that the original author would fix"); - expect(prompt).toContain("Key Guidelines for Bug Detection:"); - expect(prompt).toContain("Priority Levels:"); + expect(prompt).toContain("## **Reporting Gate (CRITICAL)**"); + expect(prompt).toContain("## Priority Levels"); expect(prompt).toContain("gh pr view 7 --repo test-owner/test-repo --json comments,reviews"); - expect(prompt).toContain("every substantive comment must be inline on the changed line"); - expect(prompt).toContain("github_pr___resolve_review_thread"); + expect(prompt).toContain("git --no-pager diff $MERGE_BASE..HEAD"); + expect(prompt).toContain("**Do NOT call** `github_pr___resolve_review_thread`"); const droidArgsCall = setOutputSpy.mock.calls.find( (call: unknown[]) => call[0] === "droid_args", From 0983d10fdb13f02fe16550b6091d236cbb99c4f7 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 14 Jan 2026 17:04:27 -0800 Subject: [PATCH 14/33] Add common analysis patterns for reviews --- src/create-prompt/templates/review-prompt.ts | 126 +++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 98fd279..819bb9f 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -83,6 +83,132 @@ Review **every changed line**. You must complete the review even if you find iss --- +## Systematic Analysis Patterns + +Apply these patterns during your review. These target common bug classes: + +### Logic & Variable Usage + +* When reviewing conditionals, verify the correct variable is referenced in each clause +* Check for AND vs OR confusion in permission/validation logic (especially security-critical paths) +* Verify return statements return the intended value: + - Not wrapper objects when data is expected (e.g., \`safeParse()\` result vs \`.data\`) + - Not intermediate variables when final result is expected + - Not wrong object properties (e.g., \`obj.original\` vs \`obj.modified\`) +* In loops or transformations, confirm variable names match semantic purpose +* Flag operations where variable names suggest type mismatches (e.g., \`*Time\` used where \`*Date\` expected) + +### Null/Undefined Safety + +* For each property access chain (\`a.b.c\`), verify none of the intermediate values can be null/undefined/None: + - Check API responses that might return null + - Check database queries that might return no results + - Check array operations like \`array[0]\` or \`array.find()\` +* When Optional types are unwrapped (\`.get()\`, \`!\`, non-null assertions), verify presence is checked first +* In conditional branches, verify null handling exists for all code paths +* Pay special attention to: + - Authentication/authorization contexts (user might not be authenticated) + - Optional relationships (foreign keys, joined data) + - Map/dictionary lookups + - Configuration values that might not be set + +### Type Compatibility & Data Flow + +* Trace what types flow into math operations: + - \`floor\`, \`ceil\`, \`round\`, modulo on datetime/string inputs cause errors + - Arithmetic operations on mixed types +* Verify comparison operators match types: + - Object reference equality (\`===\`, \`==\`) on different instances always false + - Comparing object types when value comparison intended (e.g., dayjs objects) +* Check function parameters receive expected types after transformations: + - Serialization/deserialization boundary crossings + - Database field type conversions + - API request/response parsing +* Flag string operations on numbers or vice versa without explicit conversion +* When data flows through multiple functions, verify type consistency at each step + +### Async/Await Patterns (JavaScript/TypeScript) + +* **CRITICAL**: Flag \`forEach\`/\`map\`/\`filter\` with async callbacks - these don't await: + \`\`\`javascript + // BUG: async callbacks are fire-and-forget + items.forEach(async (item) => await process(item)); + + // CORRECT: use for...of or Promise.all + for (const item of items) await process(item); + \`\`\` +* Verify all async function calls are awaited when their result or side-effect is needed: + - Database writes that must complete before continuing + - API calls whose responses are used + - File operations that affect subsequent code +* Check promise chains have proper error handling (\`.catch()\` or try/catch) +* Verify parallel operations use \`Promise.all\`/\`Promise.allSettled\` appropriately + +### Security Vulnerabilities + +* **SSRF (Server-Side Request Forgery)**: + - Flag unvalidated URL fetching: \`open(url)\`, \`fetch(user_input)\`, \`curl\` with user input + - Verify URL allowlists exist for external requests +* **XSS (Cross-Site Scripting)**: + - Check for unescaped user input in HTML/template contexts + - Verify template engines auto-escape by default +* **Authentication & Session State**: + - OAuth state/nonce must be per-request random, not static or reused + - CSRF tokens must be unpredictable and verified + - Session data must not leak between requests +* **Input Validation Bypasses**: + - \`indexOf()\`/\`startsWith()\` for origin validation can be bypassed (use exact allowlist matching) + - Case sensitivity in security checks (email blacklists, domain checks) +* **Timing Attacks**: + - Secret/token comparison should use constant-time comparison functions +* **Cache Poisoning**: + - Verify security decisions (permissions, auth) aren't cached asymmetrically + - If grants are cached, denials must also be cached (or neither) + +### Concurrency Issues (when applicable) + +* Check if shared state is modified without synchronization: + - Global variables or class fields accessed by multiple threads/goroutines + - Database records that can be modified concurrently +* Verify double-checked locking re-checks condition after acquiring lock: + \`\`\`go + // BUG: doesn't re-check after lock + if !initialized { + lock.Lock() + initialize() + lock.Unlock() + } + + // CORRECT: re-check after acquiring lock + if !initialized { + lock.Lock() + if !initialized { + initialize() + } + lock.Unlock() + } + \`\`\` +* Flag non-atomic operations on shared counters: + - \`count = count + 1\` in concurrent code (use atomic operations) + - Read-modify-write without locks +* Check that resources accessed by multiple threads have proper synchronization + +### API Contract & Breaking Changes + +* When serializers/validators/response formatters change: + - Use Grep to find API consumers and test files + - Verify response structure remains compatible (field names, types, nesting) + - Check for removed fields, changed types, or new required fields +* When database models/schemas change: + - Verify migrations include data backfill for existing records + - Check that existing queries still work with new schema +* When function signatures change: + - Grep for all callers to verify compatibility + - Check if return type changes break caller assumptions +* Review test files for expected API shapes and verify changes match tests + +--- + ## **Reporting Gate (CRITICAL)** Only report findings that meet **at least one** of the following: From cbbe3c1b130932fd62548f2aaa9a14c87296670f Mon Sep 17 00:00:00 2001 From: User Date: Thu, 15 Jan 2026 12:21:57 -0800 Subject: [PATCH 15/33] Pre-compute PR diff and comments, enforce thorough review with xhigh reasoning --- action.yml | 12 -- src/create-prompt/index.ts | 36 +++-- src/create-prompt/templates/review-prompt.ts | 47 +++++-- src/create-prompt/types.ts | 6 + src/tag/commands/review.ts | 132 +++++++++++++++++- .../templates/review-prompt.test.ts | 58 +++++++- test/modes/tag/review-command.test.ts | 74 +++++++++- 7 files changed, 318 insertions(+), 47 deletions(-) diff --git a/action.yml b/action.yml index f05b5fe..3125856 100644 --- a/action.yml +++ b/action.yml @@ -178,18 +178,6 @@ runs: env: EXPERIMENTAL_ALLOWED_DOMAINS: ${{ inputs.experimental_allowed_domains }} - - name: Checkout PR branch for review - if: steps.prepare.outputs.contains_trigger == 'true' && steps.prepare.outputs.review_pr_number != '' - shell: bash - run: | - echo "Checking out PR #${{ steps.prepare.outputs.review_pr_number }} branch for full file access..." - # Reset any local changes from the merge commit to allow clean checkout - git reset --hard HEAD - gh pr checkout ${{ steps.prepare.outputs.review_pr_number }} - echo "Successfully checked out PR branch: $(git rev-parse --abbrev-ref HEAD)" - env: - GH_TOKEN: ${{ steps.prepare.outputs.github_token }} - - name: Run Droid Exec id: droid if: steps.prepare.outputs.contains_trigger == 'true' diff --git a/src/create-prompt/index.ts b/src/create-prompt/index.ts index 511fd34..d87c90f 100644 --- a/src/create-prompt/index.ts +++ b/src/create-prompt/index.ts @@ -15,9 +15,14 @@ import { isPullRequestReviewCommentEvent, } from "../github/context"; import type { ParsedGitHubContext } from "../github/context"; -import type { CommonFields, PreparedContext, EventData } from "./types"; +import type { + CommonFields, + PreparedContext, + EventData, + ReviewArtifacts, +} from "./types"; -export type { CommonFields, PreparedContext } from "./types"; +export type { CommonFields, PreparedContext, ReviewArtifacts } from "./types"; const BASE_ALLOWED_TOOLS = [ "Execute", @@ -70,6 +75,7 @@ export function prepareContext( baseBranch?: string, droidBranch?: string, prBranchData?: { headRefName: string; headRefOid: string }, + reviewArtifacts?: ReviewArtifacts, ): PreparedContext { const repository = context.repository.full_name; const triggerPhrase = context.inputs.triggerPhrase || "@droid"; @@ -108,15 +114,12 @@ export function prepareContext( commonFields.droidBranch = droidBranch; } - const eventData = buildEventData( - context, - { - commentId, - commentBody, - baseBranch, - droidBranch, - }, - ); + const eventData = buildEventData(context, { + commentId, + commentBody, + baseBranch, + droidBranch, + }); const result: PreparedContext = { ...commonFields, @@ -128,6 +131,10 @@ export function prepareContext( result.prBranchData = prBranchData; } + if (reviewArtifacts) { + result.reviewArtifacts = reviewArtifacts; + } + return result; } @@ -282,9 +289,7 @@ function buildEventData( } } -export type PromptGenerator = ( - context: PreparedContext, -) => string; +export type PromptGenerator = (context: PreparedContext) => string; export type PromptCreationOptions = { githubContext: ParsedGitHubContext; @@ -296,6 +301,7 @@ export type PromptCreationOptions = { allowedTools?: string[]; disallowedTools?: string[]; includeActionsTools?: boolean; + reviewArtifacts?: ReviewArtifacts; }; export async function createPrompt({ @@ -308,6 +314,7 @@ export async function createPrompt({ allowedTools = [], disallowedTools = [], includeActionsTools = false, + reviewArtifacts, }: PromptCreationOptions) { try { const droidCommentId = commentId.toString(); @@ -317,6 +324,7 @@ export async function createPrompt({ baseBranch, droidBranch, prBranchData, + reviewArtifacts, ); await mkdir(`${process.env.RUNNER_TEMP || "/tmp"}/droid-prompts`, { diff --git a/src/create-prompt/templates/review-prompt.ts b/src/create-prompt/templates/review-prompt.ts index 819bb9f..044cb44 100644 --- a/src/create-prompt/templates/review-prompt.ts +++ b/src/create-prompt/templates/review-prompt.ts @@ -12,6 +12,12 @@ export function generateReviewPrompt(context: PreparedContext): string { const headSha = context.prBranchData?.headRefOid ?? "unknown"; const baseRefName = context.eventData.baseBranch ?? "unknown"; + const diffPath = + context.reviewArtifacts?.diffPath ?? "$RUNNER_TEMP/droid-prompts/pr.diff"; + const commentsPath = + context.reviewArtifacts?.commentsPath ?? + "$RUNNER_TEMP/droid-prompts/existing_comments.json"; + return `You are performing an automated code review for PR #${prNumber} in ${repoFullName}. The gh CLI is installed and authenticated via GH_TOKEN. @@ -24,6 +30,29 @@ The gh CLI is installed and authenticated via GH_TOKEN. * PR Base Ref: ${baseRefName} * The PR branch has already been checked out. You have full access to read any file in the codebase, not just the diff output. +### Pre-computed Review Artifacts + +The following files have been pre-computed and contain the COMPLETE data for this PR: + +* **Full PR Diff**: \`${diffPath}\` - Contains the COMPLETE diff of ALL changed files (already computed via \`git merge-base\` and \`git diff\`) +* **Existing Comments**: \`${commentsPath}\` - Contains all existing PR comments and reviews in JSON format + +**IMPORTANT**: Use these pre-computed files instead of running git or gh commands to fetch diff/comments. This ensures you have access to the COMPLETE data without truncation. + +--- + +## CRITICAL INSTRUCTION + +**DO NOT STOP UNTIL YOU HAVE REVIEWED EVERY SINGLE CHANGED FILE IN THE DIFF.** + +You MUST: +1. Read the ENTIRE \`${diffPath}\` file first (use offset/limit if needed for large files) +2. Create a mental checklist of ALL files that were changed +3. Review EACH file systematically - do not skip any file +4. Only submit your review AFTER you have analyzed every single changed file + +If the diff is large, work through it methodically. Do not rush. Do not skip files. The quality of your review depends on thoroughness. + --- ## Objectives @@ -42,22 +71,24 @@ Follow these phases **in order**. Do not submit findings until Phase 1 and Phase ## Phase 1: Context Gathering (REQUIRED — do not report bugs yet) -1. Inspect existing comments: - \`gh pr view ${prNumber} --repo ${repoFullName} --json comments,reviews\` +1. **Read existing comments** from the pre-computed file: + \`Read ${commentsPath}\` -2. Compute the exact merge diff: +2. **Read the COMPLETE diff** from the pre-computed file: + \`Read ${diffPath}\` + + If the file is large (>2400 lines), read it in chunks using offset/limit parameters. + **DO NOT proceed until you have read the ENTIRE diff.** - * \`git fetch origin ${baseRefName}:refs/remotes/origin/${baseRefName}\` - * \`MERGE_BASE=$(git merge-base HEAD refs/remotes/origin/${baseRefName})\` - * \`git --no-pager diff $MERGE_BASE..HEAD\` +3. **List all changed files** - After reading the diff, explicitly list every file that was changed. This is your checklist. -3. For **each file in the diff**, gather context: +4. For **each file in the diff**, gather context: * New imports → Grep to confirm the symbol exists * New/modified functions → Grep for callers to understand usage * Data-processing code → Read surrounding code to infer expected types -4. Do **not** identify or report bugs yet. This phase is for understanding only. +5. Do **not** identify or report bugs yet. This phase is for understanding only. --- diff --git a/src/create-prompt/types.ts b/src/create-prompt/types.ts index 75af3a4..caffddc 100644 --- a/src/create-prompt/types.ts +++ b/src/create-prompt/types.ts @@ -103,6 +103,11 @@ export type EventData = | PullRequestEvent | PullRequestTargetEvent; +export type ReviewArtifacts = { + diffPath: string; + commentsPath: string; +}; + export type PreparedContext = CommonFields & { eventData: EventData; githubContext?: GitHubContext; @@ -110,4 +115,5 @@ export type PreparedContext = CommonFields & { headRefName: string; headRefOid: string; }; + reviewArtifacts?: ReviewArtifacts; }; diff --git a/src/tag/commands/review.ts b/src/tag/commands/review.ts index bc734c5..3d11b87 100644 --- a/src/tag/commands/review.ts +++ b/src/tag/commands/review.ts @@ -1,7 +1,10 @@ import * as core from "@actions/core"; +import { execSync } from "child_process"; +import { writeFile, mkdir } from "fs/promises"; import type { GitHubContext } from "../../github/context"; import { fetchPRBranchData } from "../../github/data/pr-fetcher"; import { createPrompt } from "../../create-prompt"; +import type { ReviewArtifacts } from "../../create-prompt"; import { prepareMcpTools } from "../../mcp/install-mcp-server"; import { createInitialComment } from "../../github/operations/comments/create-initial"; import { normalizeDroidArgs, parseAllowedTools } from "../../utils/parse-tools"; @@ -10,6 +13,90 @@ import { generateReviewPrompt } from "../../create-prompt/templates/review-promp import type { Octokits } from "../../github/api/client"; import type { PrepareResult } from "../../prepare/types"; +const DIFF_MAX_BUFFER = 50 * 1024 * 1024; // 50MB buffer for large diffs + +async function computeAndStoreDiff( + baseRef: string, + tempDir: string, +): Promise { + const promptsDir = `${tempDir}/droid-prompts`; + await mkdir(promptsDir, { recursive: true }); + + // Unshallow the repo if it's a shallow clone (needed for merge-base to work) + try { + execSync("git fetch --unshallow", { encoding: "utf8", stdio: "pipe" }); + console.log("Unshallowed repository"); + } catch (e) { + // Already unshallowed or not a shallow clone, continue + console.log("Repository already has full history"); + } + + // Fetch the base branch (it may not exist locally yet) + try { + execSync( + `git fetch origin ${baseRef}:refs/remotes/origin/${baseRef}`, + { encoding: "utf8", stdio: "pipe" }, + ); + console.log(`Fetched base branch: ${baseRef}`); + } catch (e) { + // Branch might already exist, continue + console.log(`Base branch fetch skipped (may already exist): ${baseRef}`); + } + + const mergeBase = execSync( + `git merge-base HEAD refs/remotes/origin/${baseRef}`, + { encoding: "utf8" }, + ).trim(); + + const diff = execSync(`git --no-pager diff ${mergeBase}..HEAD`, { + encoding: "utf8", + maxBuffer: DIFF_MAX_BUFFER, + }); + + const diffPath = `${promptsDir}/pr.diff`; + await writeFile(diffPath, diff); + console.log(`Stored PR diff (${diff.length} bytes) at ${diffPath}`); + return diffPath; +} + +async function fetchAndStoreComments( + octokit: Octokits, + owner: string, + repo: string, + prNumber: number, + tempDir: string, +): Promise { + const promptsDir = `${tempDir}/droid-prompts`; + await mkdir(promptsDir, { recursive: true }); + + const [issueComments, reviewComments] = await Promise.all([ + octokit.rest.issues.listComments({ + owner, + repo, + issue_number: prNumber, + per_page: 100, + }), + octokit.rest.pulls.listReviewComments({ + owner, + repo, + pull_number: prNumber, + per_page: 100, + }), + ]); + + const comments = { + issueComments: issueComments.data, + reviewComments: reviewComments.data, + }; + + const commentsPath = `${promptsDir}/existing_comments.json`; + await writeFile(commentsPath, JSON.stringify(comments, null, 2)); + console.log( + `Stored existing comments (${issueComments.data.length} issue, ${reviewComments.data.length} review) at ${commentsPath}`, + ); + return commentsPath; +} + type ReviewCommandOptions = { context: GitHubContext; octokit: Octokits; @@ -46,6 +133,46 @@ export async function prepareReviewMode({ currentBranch: prData.headRefName, }; + // Checkout the PR branch before computing diff + // This ensures HEAD points to the PR head commit, not the merge commit or default branch + console.log( + `Checking out PR #${context.entityNumber} branch for diff computation...`, + ); + try { + execSync("git reset --hard HEAD", { encoding: "utf8", stdio: "pipe" }); + execSync(`gh pr checkout ${context.entityNumber}`, { + encoding: "utf8", + stdio: "pipe", + env: { ...process.env, GH_TOKEN: githubToken }, + }); + console.log( + `Successfully checked out PR branch: ${execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf8" }).trim()}`, + ); + } catch (e) { + console.error(`Failed to checkout PR branch: ${e}`); + throw new Error( + `Failed to checkout PR #${context.entityNumber} branch for review`, + ); + } + + // Pre-compute review artifacts (diff and existing comments) + const tempDir = process.env.RUNNER_TEMP || "/tmp"; + const [diffPath, commentsPath] = await Promise.all([ + computeAndStoreDiff(prData.baseRefName, tempDir), + fetchAndStoreComments( + octokit, + context.repository.owner, + context.repository.repo, + context.entityNumber, + tempDir, + ), + ]); + + const reviewArtifacts: ReviewArtifacts = { + diffPath, + commentsPath, + }; + await createPrompt({ githubContext: context, commentId, @@ -56,6 +183,7 @@ export async function prepareReviewMode({ headRefOid: prData.headRefOid, }, generatePrompt: generateReviewPrompt, + reviewArtifacts, }); core.exportVariable("DROID_EXEC_RUN_TYPE", "droid-review"); @@ -104,10 +232,10 @@ export async function prepareReviewMode({ const reviewModel = process.env.REVIEW_MODEL?.trim(); const reasoningEffort = process.env.REASONING_EFFORT?.trim(); - // Default behavior (behind the scenes): if neither is provided, run GPT-5.2 at high reasoning. + // Default behavior (behind the scenes): if neither is provided, run GPT-5.2 at xhigh reasoning. if (!reviewModel && !reasoningEffort) { droidArgParts.push(`--model "gpt-5.2"`); - droidArgParts.push(`--reasoning-effort "high"`); + droidArgParts.push(`--reasoning-effort "xhigh"`); } else { // Add model override if specified if (reviewModel) { diff --git a/test/create-prompt/templates/review-prompt.test.ts b/test/create-prompt/templates/review-prompt.test.ts index ec23288..0d0e1da 100644 --- a/test/create-prompt/templates/review-prompt.test.ts +++ b/test/create-prompt/templates/review-prompt.test.ts @@ -29,10 +29,58 @@ describe("generateReviewPrompt", () => { expect(prompt).toContain("## Objectives"); expect(prompt).toContain("Re-check existing review comments"); - expect(prompt).toContain("git merge-base"); - expect(prompt).toContain("git --no-pager diff $MERGE_BASE..HEAD"); expect(prompt).toContain("github_inline_comment___create_inline_comment"); - expect(prompt).toContain("**Do NOT call** `github_pr___resolve_review_thread`"); + expect(prompt).toContain( + "**Do NOT call** `github_pr___resolve_review_thread`", + ); + }); + + it("includes pre-computed artifact references when provided", () => { + const context = createBaseContext({ + reviewArtifacts: { + diffPath: "/tmp/test/pr.diff", + commentsPath: "/tmp/test/existing_comments.json", + }, + }); + + const prompt = generateReviewPrompt(context); + + expect(prompt).toContain("### Pre-computed Review Artifacts"); + expect(prompt).toContain("/tmp/test/pr.diff"); + expect(prompt).toContain("/tmp/test/existing_comments.json"); + expect(prompt).toContain("COMPLETE diff"); + }); + + it("includes critical instruction to review all files", () => { + const context = createBaseContext(); + + const prompt = generateReviewPrompt(context); + + expect(prompt).toContain("## CRITICAL INSTRUCTION"); + expect(prompt).toContain( + "DO NOT STOP UNTIL YOU HAVE REVIEWED EVERY SINGLE CHANGED FILE", + ); + expect(prompt).toContain("Review EACH file systematically"); + }); + + it("instructs to read from pre-computed files in Phase 1", () => { + const context = createBaseContext({ + reviewArtifacts: { + diffPath: "/tmp/droid/pr.diff", + commentsPath: "/tmp/droid/comments.json", + }, + }); + + const prompt = generateReviewPrompt(context); + + expect(prompt).toContain( + "Read existing comments** from the pre-computed file", + ); + expect(prompt).toContain("Read /tmp/droid/comments.json"); + expect(prompt).toContain( + "Read the COMPLETE diff** from the pre-computed file", + ); + expect(prompt).toContain("Read /tmp/droid/pr.diff"); }); it("emphasizes accuracy gates and bug detection guidelines", () => { @@ -48,7 +96,9 @@ describe("generateReviewPrompt", () => { it("describes submission guidance", () => { const prompt = generateReviewPrompt(createBaseContext()); - expect(prompt).toContain("Use `github_inline_comment___create_inline_comment`"); + expect(prompt).toContain( + "Use `github_inline_comment___create_inline_comment`", + ); expect(prompt).toContain("Do **not** approve or request changes"); expect(prompt).toContain("github_pr___submit_review"); expect(prompt).toContain("### When NOT to submit"); diff --git a/test/modes/tag/review-command.test.ts b/test/modes/tag/review-command.test.ts index e31db2c..af21e1d 100644 --- a/test/modes/tag/review-command.test.ts +++ b/test/modes/tag/review-command.test.ts @@ -6,6 +6,8 @@ import { createMockContext } from "../../mockContext"; import * as promptModule from "../../../src/create-prompt"; import * as mcpInstaller from "../../../src/mcp/install-mcp-server"; import * as comments from "../../../src/github/operations/comments/create-initial"; +import * as childProcess from "child_process"; +import * as fsPromises from "fs/promises"; const MOCK_PR_DATA = { title: "PR for review", @@ -27,16 +29,21 @@ const MOCK_PR_DATA = { describe("prepareReviewMode", () => { const originalArgs = process.env.DROID_ARGS; const originalReviewModel = process.env.REVIEW_MODEL; + const originalRunnerTemp = process.env.RUNNER_TEMP; let graphqlSpy: ReturnType; let promptSpy: ReturnType; let mcpSpy: ReturnType; let setOutputSpy: ReturnType; let createInitialSpy: ReturnType; let exportVariableSpy: ReturnType; + let execSyncSpy: ReturnType; + let writeFileSpy: ReturnType; + let mkdirSpy: ReturnType; beforeEach(() => { process.env.DROID_ARGS = ""; delete process.env.REVIEW_MODEL; + process.env.RUNNER_TEMP = "/tmp/test-runner"; promptSpy = spyOn(promptModule, "createPrompt").mockResolvedValue(); mcpSpy = spyOn(mcpInstaller, "prepareMcpTools").mockResolvedValue( @@ -50,6 +57,21 @@ describe("prepareReviewMode", () => { exportVariableSpy = spyOn(core, "exportVariable").mockImplementation( () => {}, ); + // Mock execSync for git commands + execSyncSpy = spyOn(childProcess, "execSync").mockImplementation( + ((cmd: string) => { + if (cmd.includes("merge-base")) { + return "abc123def456\n"; + } + if (cmd.includes("diff")) { + return "diff --git a/file.ts b/file.ts\n+added line\n"; + } + return ""; + }) as typeof childProcess.execSync, + ); + // Mock file system operations + writeFileSpy = spyOn(fsPromises, "writeFile").mockResolvedValue(); + mkdirSpy = spyOn(fsPromises, "mkdir").mockResolvedValue(undefined); }); afterEach(() => { @@ -59,12 +81,20 @@ describe("prepareReviewMode", () => { setOutputSpy.mockRestore(); createInitialSpy.mockRestore(); exportVariableSpy.mockRestore(); + execSyncSpy.mockRestore(); + writeFileSpy.mockRestore(); + mkdirSpy.mockRestore(); process.env.DROID_ARGS = originalArgs; if (originalReviewModel !== undefined) { process.env.REVIEW_MODEL = originalReviewModel; } else { delete process.env.REVIEW_MODEL; } + if (originalRunnerTemp !== undefined) { + process.env.RUNNER_TEMP = originalRunnerTemp; + } else { + delete process.env.RUNNER_TEMP; + } }); it("prepares review flow with limited toolset when tracking comment exists", async () => { @@ -81,7 +111,14 @@ describe("prepareReviewMode", () => { }); const octokit = { - rest: {}, + rest: { + issues: { + listComments: () => Promise.resolve({ data: [] }), + }, + pulls: { + listReviewComments: () => Promise.resolve({ data: [] }), + }, + }, graphql: () => Promise.resolve({ repository: { @@ -166,7 +203,14 @@ describe("prepareReviewMode", () => { }); const octokit = { - rest: {}, + rest: { + issues: { + listComments: () => Promise.resolve({ data: [] }), + }, + pulls: { + listReviewComments: () => Promise.resolve({ data: [] }), + }, + }, graphql: () => Promise.resolve({ repository: { @@ -227,7 +271,14 @@ describe("prepareReviewMode", () => { }); const octokit = { - rest: {}, + rest: { + issues: { + listComments: () => Promise.resolve({ data: [] }), + }, + pulls: { + listReviewComments: () => Promise.resolve({ data: [] }), + }, + }, graphql: () => Promise.resolve({ repository: { @@ -260,7 +311,9 @@ describe("prepareReviewMode", () => { const droidArgsCall = setOutputSpy.mock.calls.find( (call: unknown[]) => call[0] === "droid_args", ) as [string, string] | undefined; - expect(droidArgsCall?.[1]).toContain('--model "claude-sonnet-4-5-20250929"'); + expect(droidArgsCall?.[1]).toContain( + '--model "claude-sonnet-4-5-20250929"', + ); }); it("does not add --model flag when REVIEW_MODEL is empty", async () => { @@ -280,7 +333,14 @@ describe("prepareReviewMode", () => { }); const octokit = { - rest: {}, + rest: { + issues: { + listComments: () => Promise.resolve({ data: [] }), + }, + pulls: { + listReviewComments: () => Promise.resolve({ data: [] }), + }, + }, graphql: () => Promise.resolve({ repository: { @@ -313,8 +373,8 @@ describe("prepareReviewMode", () => { const droidArgsCall = setOutputSpy.mock.calls.find( (call: unknown[]) => call[0] === "droid_args", ) as [string, string] | undefined; - // When neither REVIEW_MODEL nor REASONING_EFFORT is provided, we default to gpt-5.2 at high reasoning. + // When neither REVIEW_MODEL nor REASONING_EFFORT is provided, we default to gpt-5.2 at xhigh reasoning. expect(droidArgsCall?.[1]).toContain('--model "gpt-5.2"'); - expect(droidArgsCall?.[1]).toContain('--reasoning-effort "high"'); + expect(droidArgsCall?.[1]).toContain('--reasoning-effort "xhigh"'); }); }); From e53cbd859e350442bda5a6efb08bde8e4ac360a6 Mon Sep 17 00:00:00 2001 From: User Date: Thu, 15 Jan 2026 13:40:53 -0800 Subject: [PATCH 16/33] remove revoking of gh app token --- action.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/action.yml b/action.yml index 3125856..bee4e59 100644 --- a/action.yml +++ b/action.yml @@ -237,14 +237,3 @@ runs: ~/.factory/sessions/* if-no-files-found: ignore retention-days: 7 - - - name: Revoke app token - if: always() && inputs.github_token == '' && steps.prepare.outputs.skipped_due_to_workflow_validation_mismatch != 'true' - shell: bash - run: | - curl -L \ - -X DELETE \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ steps.prepare.outputs.GITHUB_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - ${GITHUB_API_URL:-https://api.github.com}/installation/token From 9ffb2edca7cd83dae25818a58ff5be26fb34f3c3 Mon Sep 17 00:00:00 2001 From: User Date: Thu, 15 Jan 2026 13:58:24 -0800 Subject: [PATCH 17/33] switch gpt-5.2 reasoning effort back to high --- src/tag/commands/review.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tag/commands/review.ts b/src/tag/commands/review.ts index 3d11b87..dd43d14 100644 --- a/src/tag/commands/review.ts +++ b/src/tag/commands/review.ts @@ -232,10 +232,10 @@ export async function prepareReviewMode({ const reviewModel = process.env.REVIEW_MODEL?.trim(); const reasoningEffort = process.env.REASONING_EFFORT?.trim(); - // Default behavior (behind the scenes): if neither is provided, run GPT-5.2 at xhigh reasoning. + // Default behavior (behind the scenes): if neither is provided, run GPT-5.2 at high reasoning. if (!reviewModel && !reasoningEffort) { droidArgParts.push(`--model "gpt-5.2"`); - droidArgParts.push(`--reasoning-effort "xhigh"`); + droidArgParts.push(`--reasoning-effort "high"`); } else { // Add model override if specified if (reviewModel) { From 86ec27722d1e31c299a6beba3b924fec85338ebd Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 11:47:37 -0800 Subject: [PATCH 18/33] fix tests --- base-action/test/run-droid-mcp.test.ts | 1 + test/integration/review-flow.test.ts | 27 +++++++++++++++++++++++--- test/modes/tag/review-command.test.ts | 2 +- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/base-action/test/run-droid-mcp.test.ts b/base-action/test/run-droid-mcp.test.ts index ea652a2..fa1bc16 100644 --- a/base-action/test/run-droid-mcp.test.ts +++ b/base-action/test/run-droid-mcp.test.ts @@ -90,6 +90,7 @@ mock.module("child_process", () => ({ }); }, spawn: mockSpawn, + execSync: (_cmd: string) => "", })); type RunDroidModule = typeof import("../src/run-droid"); diff --git a/test/integration/review-flow.test.ts b/test/integration/review-flow.test.ts index 6095115..3bcb753 100644 --- a/test/integration/review-flow.test.ts +++ b/test/integration/review-flow.test.ts @@ -15,6 +15,7 @@ import * as createInitial from "../../src/github/operations/comments/create-init import * as mcpInstaller from "../../src/mcp/install-mcp-server"; import * as actorValidation from "../../src/github/validation/actor"; import * as core from "@actions/core"; +import * as childProcess from "node:child_process"; describe("review command integration", () => { const originalRunnerTemp = process.env.RUNNER_TEMP; @@ -26,6 +27,7 @@ describe("review command integration", () => { let actorSpy: ReturnType; let setOutputSpy: ReturnType; let exportVarSpy: ReturnType; + let execSyncSpy: ReturnType; beforeEach(async () => { tmpDir = await mkdtemp(path.join(os.tmpdir(), "review-int-")); @@ -43,6 +45,16 @@ describe("review command integration", () => { actorSpy = spyOn(actorValidation, "checkHumanActor").mockResolvedValue(); setOutputSpy = spyOn(core, "setOutput").mockImplementation(() => {}); exportVarSpy = spyOn(core, "exportVariable").mockImplementation(() => {}); + + execSyncSpy = spyOn(childProcess, "execSync").mockImplementation( + ((cmd: string) => { + if (cmd.includes("merge-base")) return "abc123def456\n"; + if (cmd.includes("git --no-pager diff")) { + return "diff --git a/file.ts b/file.ts\n+added line\n"; + } + return ""; + }) as typeof childProcess.execSync, + ); }); afterEach(async () => { @@ -52,6 +64,7 @@ describe("review command integration", () => { actorSpy.mockRestore(); setOutputSpy.mockRestore(); exportVarSpy.mockRestore(); + execSyncSpy.mockRestore(); if (process.env.RUNNER_TEMP) { await rm(process.env.RUNNER_TEMP, { recursive: true, force: true }); @@ -96,7 +109,14 @@ describe("review command integration", () => { }); const octokit = { - rest: {}, + rest: { + issues: { + listComments: () => Promise.resolve({ data: [] }), + }, + pulls: { + listReviewComments: () => Promise.resolve({ data: [] }), + }, + }, graphql: () => Promise.resolve({ repository: { pullRequest: { @@ -148,8 +168,9 @@ describe("review command integration", () => { expect(prompt).toContain("github_inline_comment___create_inline_comment"); expect(prompt).toContain("## **Reporting Gate (CRITICAL)**"); expect(prompt).toContain("## Priority Levels"); - expect(prompt).toContain("gh pr view 7 --repo test-owner/test-repo --json comments,reviews"); - expect(prompt).toContain("git --no-pager diff $MERGE_BASE..HEAD"); + expect(prompt).toContain("Pre-computed Review Artifacts"); + expect(prompt).toContain("Full PR Diff"); + expect(prompt).toContain("Existing Comments"); expect(prompt).toContain("**Do NOT call** `github_pr___resolve_review_thread`"); const droidArgsCall = setOutputSpy.mock.calls.find( diff --git a/test/modes/tag/review-command.test.ts b/test/modes/tag/review-command.test.ts index af21e1d..9a59d6d 100644 --- a/test/modes/tag/review-command.test.ts +++ b/test/modes/tag/review-command.test.ts @@ -375,6 +375,6 @@ describe("prepareReviewMode", () => { ) as [string, string] | undefined; // When neither REVIEW_MODEL nor REASONING_EFFORT is provided, we default to gpt-5.2 at xhigh reasoning. expect(droidArgsCall?.[1]).toContain('--model "gpt-5.2"'); - expect(droidArgsCall?.[1]).toContain('--reasoning-effort "xhigh"'); + expect(droidArgsCall?.[1]).toContain('--reasoning-effort "high"'); }); }); From 5da3f3fef34c6e7eee86126d1d58604b1930dfac Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 11:55:06 -0800 Subject: [PATCH 19/33] test validator run --- action.yml | 58 ++++- src/create-prompt/index.ts | 1 + .../templates/review-candidates-prompt.ts | 137 +++++++++++ .../templates/review-validator-prompt.ts | 170 ++++++++++++++ src/entrypoints/prepare-validator.ts | 51 ++++ src/tag/commands/review-validator.ts | 222 ++++++++++++++++++ src/tag/commands/review.ts | 41 +++- test/entrypoints/prepare-validator.test.ts | 64 +++++ 8 files changed, 732 insertions(+), 12 deletions(-) create mode 100644 src/create-prompt/templates/review-candidates-prompt.ts create mode 100644 src/create-prompt/templates/review-validator-prompt.ts create mode 100644 src/entrypoints/prepare-validator.ts create mode 100644 src/tag/commands/review-validator.ts create mode 100644 test/entrypoints/prepare-validator.test.ts diff --git a/action.yml b/action.yml index bee4e59..390bde6 100644 --- a/action.yml +++ b/action.yml @@ -67,6 +67,18 @@ inputs: description: "Override reasoning effort for review flows (passed to Droid Exec as --reasoning-effort). If empty and review_model is also empty, the action defaults internally to gpt-5.2 at high reasoning." required: false default: "" + review_use_validator: + description: "Enable two-pass review: generate candidate comments to JSON, then validate and post only approved ones." + required: false + default: "true" + review_candidates_path: + description: "Path to write review candidates JSON (run #1 when review_use_validator=true)." + required: false + default: "${{ runner.temp }}/droid-prompts/review_candidates.json" + review_validated_path: + description: "Path to write review validated JSON (run #2 when review_use_validator=true)." + required: false + default: "${{ runner.temp }}/droid-prompts/review_validated.json" fill_model: description: "Override the model used for PR description fill (e.g., 'claude-sonnet-4-5-20250929', 'gpt-5.1-codex'). Only applies to fill flows." required: false @@ -137,6 +149,9 @@ runs: AUTOMATIC_REVIEW: ${{ inputs.automatic_review }} REVIEW_MODEL: ${{ inputs.review_model }} REASONING_EFFORT: ${{ inputs.reasoning_effort }} + REVIEW_USE_VALIDATOR: ${{ inputs.review_use_validator }} + REVIEW_CANDIDATES_PATH: ${{ inputs.review_candidates_path }} + REVIEW_VALIDATED_PATH: ${{ inputs.review_validated_path }} FILL_MODEL: ${{ inputs.fill_model }} ADDITIONAL_PERMISSIONS: ${{ inputs.additional_permissions }} DROID_ARGS: ${{ inputs.droid_args }} @@ -204,6 +219,46 @@ runs: DETAILED_PERMISSION_MESSAGES: "1" FACTORY_API_KEY: ${{ inputs.factory_api_key }} + - name: Prepare validator + id: prepare_validator + if: steps.prepare.outputs.contains_trigger == 'true' && inputs.review_use_validator == 'true' + shell: bash + run: | + bun run ${GITHUB_ACTION_PATH}/src/entrypoints/prepare-validator.ts + env: + GITHUB_TOKEN: ${{ steps.prepare.outputs.github_token }} + REVIEW_USE_VALIDATOR: ${{ inputs.review_use_validator }} + REVIEW_VALIDATED_PATH: ${{ inputs.review_validated_path }} + REVIEW_CANDIDATES_PATH: ${{ inputs.review_candidates_path }} + DROID_COMMENT_ID: ${{ steps.prepare.outputs.droid_comment_id }} + + - name: Run Droid Exec (validator) + id: droid_validator + if: steps.prepare.outputs.contains_trigger == 'true' && inputs.review_use_validator == 'true' + shell: bash + run: | + + # Run the base-action + bun run ${GITHUB_ACTION_PATH}/base-action/src/index.ts + env: + # Base-action inputs + INPUT_PROMPT_FILE: ${{ runner.temp }}/droid-prompts/droid-prompt.txt + INPUT_SETTINGS: ${{ inputs.settings }} + INPUT_DROID_ARGS: ${{ steps.prepare_validator.outputs.droid_args }} + INPUT_MCP_TOOLS: ${{ steps.prepare_validator.outputs.mcp_tools }} + INPUT_EXPERIMENTAL_SLASH_COMMANDS_DIR: ${{ github.action_path }}/slash-commands + INPUT_ACTION_INPUTS_PRESENT: ${{ steps.prepare.outputs.action_inputs_present }} + INPUT_PATH_TO_DROID_EXECUTABLE: ${{ inputs.path_to_droid_executable }} + INPUT_PATH_TO_BUN_EXECUTABLE: ${{ inputs.path_to_bun_executable }} + INPUT_SHOW_FULL_OUTPUT: ${{ inputs.show_full_output }} + + # Model configuration + GITHUB_TOKEN: ${{ steps.prepare.outputs.GITHUB_TOKEN }} + NODE_VERSION: ${{ env.NODE_VERSION }} + DETAILED_PERMISSION_MESSAGES: "1" + FACTORY_API_KEY: ${{ inputs.factory_api_key }} + + - name: Update comment with job link if: steps.prepare.outputs.contains_trigger == 'true' && steps.prepare.outputs.droid_comment_id && always() shell: bash @@ -218,7 +273,7 @@ runs: GITHUB_EVENT_NAME: ${{ github.event_name }} TRIGGER_COMMENT_ID: ${{ github.event.comment.id }} IS_PR: ${{ github.event.issue.pull_request != null || github.event_name == 'pull_request_target' || github.event_name == 'pull_request_review_comment' }} - DROID_SUCCESS: ${{ steps.droid.outputs.conclusion == 'success' }} + DROID_SUCCESS: ${{ (inputs.review_use_validator == 'true' && steps.droid_validator.outputs.conclusion == 'success') || (inputs.review_use_validator != 'true' && steps.droid.outputs.conclusion == 'success') }} TRIGGER_USERNAME: ${{ github.event.comment.user.login || github.event.issue.user.login || github.event.pull_request.user.login || github.event.sender.login || github.triggering_actor || github.actor || '' }} PREPARE_SUCCESS: ${{ steps.prepare.outcome == 'success' }} PREPARE_ERROR: ${{ steps.prepare.outputs.prepare_error || '' }} @@ -235,5 +290,6 @@ runs: ~/.factory/logs/droid-log-single.log ~/.factory/logs/console.log ~/.factory/sessions/* + ${{ runner.temp }}/droid-prompts/** if-no-files-found: ignore retention-days: 7 diff --git a/src/create-prompt/index.ts b/src/create-prompt/index.ts index d87c90f..8ca3cd6 100644 --- a/src/create-prompt/index.ts +++ b/src/create-prompt/index.ts @@ -28,6 +28,7 @@ const BASE_ALLOWED_TOOLS = [ "Execute", "Edit", "Create", + "ApplyPatch", "Read", "Glob", "Grep", diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts new file mode 100644 index 0000000..34619ac --- /dev/null +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -0,0 +1,137 @@ +import type { PreparedContext } from "../types"; + +export function generateReviewCandidatesPrompt(context: PreparedContext): string { + const prNumber = context.eventData.isPR + ? context.eventData.prNumber + : context.githubContext && "entityNumber" in context.githubContext + ? String(context.githubContext.entityNumber) + : "unknown"; + + const repoFullName = context.repository; + const prHeadRef = context.prBranchData?.headRefName ?? "unknown"; + const prHeadSha = context.prBranchData?.headRefOid ?? "unknown"; + const prBaseRef = context.eventData.baseBranch ?? "unknown"; + + const diffPath = + context.reviewArtifacts?.diffPath ?? "$RUNNER_TEMP/droid-prompts/pr.diff"; + const commentsPath = + context.reviewArtifacts?.commentsPath ?? + "$RUNNER_TEMP/droid-prompts/existing_comments.json"; + + const reviewCandidatesPath = + process.env.REVIEW_CANDIDATES_PATH ?? + "$RUNNER_TEMP/droid-prompts/review_candidates.json"; + + return `You are generating **candidate** inline review comments for PR #${prNumber} in ${repoFullName}. + +IMPORTANT: This is Phase 1 of a two-pass review pipeline. + +### Context + +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${prHeadRef} +* PR Head SHA: ${prHeadSha} +* PR Base Ref: ${prBaseRef} + +### Pre-computed Review Artifacts + +The following files have been pre-computed and contain the COMPLETE data for this PR: + +* **Full PR Diff**: \`${diffPath}\` +* **Existing Comments**: \`${commentsPath}\` + +### Output + +Write your candidates to: \`${reviewCandidatesPath}\` + +You must write a single JSON object with the schema below. + +### CRITICAL RULES + +* **DO NOT** post to GitHub in this run. +* **DO NOT** call any PR mutation tools (inline comments, submit review, delete/minimize/reply/resolve, etc.). +* You MAY update the tracking comment for progress. + +======================= + +## Phase 1: Context Gathering (REQUIRED — do not output yet) + +1. Read existing comments: + Read \`${commentsPath}\` + +2. Read the COMPLETE diff: + Read \`${diffPath}\` + If large, read in chunks (offset/limit). **Do not proceed until you have read the ENTIRE diff.** + +3. List every changed file (your checklist) and review ALL of them. + +======================= + +## Phase 2: Candidate Generation + +Generate **high-confidence, actionable** candidate inline comments following the same standards as the single-pass review: + +### Reporting Gate (same as review) + +Only include candidates that meet at least one: +* Definite runtime failure +* Incorrect logic with a concrete trigger path and wrong outcome +* Security vulnerability with realistic exploit +* Data corruption/loss +* Breaking contract change (discoverable in code/tests) + +Do NOT include: +* Style/naming/formatting +* "What-if" speculation without a realistic execution path +* Vague suggestions to add guards/try-catch without a concrete failure + +### Deduplication + +Use \`${commentsPath}\` to avoid duplicating issues already reported by this bot. +If an issue appears fixed, do NOT create a new candidate; the validator run will handle replies. + +======================= + +## Phase 3: Write candidates JSON (REQUIRED) + +Write \`${reviewCandidatesPath}\` with this schema: + +\`\`\`json +{ + "version": 1, + "meta": { + "repo": "owner/repo", + "prNumber": 123, + "headSha": "", + "baseRef": "main", + "generatedAt": "" + }, + "comments": [ + { + "path": "src/index.ts", + "body": "[P1] Title\n\n1 paragraph.", + "line": 42, + "startLine": null, + "side": "RIGHT", + "commit_id": "" + } + ], + "reviewSummary": { + "body": "1–3 sentence overall assessment" + } +} +\`\`\` + +Notes: +* \`comments[]\` entries MUST match the input shape of \`github_inline_comment___create_inline_comment\`. +* Use \`commit_id\` = \`${prHeadSha}\`. +* \`startLine\` should be \`null\` unless you are making a multi-line comment. + +Then write the file using the local file tool. + +Tooling note: +* If the tools list includes \`ApplyPatch\` (common for OpenAI models like GPT-5.2), use \`ApplyPatch\` to create/update the file at the exact path. +* Otherwise, use \`Create\` (or \`Edit\` if overwriting) to write the file. +`; +} diff --git a/src/create-prompt/templates/review-validator-prompt.ts b/src/create-prompt/templates/review-validator-prompt.ts new file mode 100644 index 0000000..38561cf --- /dev/null +++ b/src/create-prompt/templates/review-validator-prompt.ts @@ -0,0 +1,170 @@ +import type { PreparedContext } from "../types"; + +export function generateReviewValidatorPrompt(context: PreparedContext): string { + const prNumber = context.eventData.isPR + ? context.eventData.prNumber + : context.githubContext && "entityNumber" in context.githubContext + ? String(context.githubContext.entityNumber) + : "unknown"; + + const repoFullName = context.repository; + const prHeadRef = context.prBranchData?.headRefName ?? "unknown"; + const prHeadSha = context.prBranchData?.headRefOid ?? "unknown"; + const prBaseRef = context.eventData.baseBranch ?? "unknown"; + + const diffPath = + context.reviewArtifacts?.diffPath ?? "$RUNNER_TEMP/droid-prompts/pr.diff"; + const commentsPath = + context.reviewArtifacts?.commentsPath ?? + "$RUNNER_TEMP/droid-prompts/existing_comments.json"; + + const reviewCandidatesPath = + process.env.REVIEW_CANDIDATES_PATH ?? + "$RUNNER_TEMP/droid-prompts/review_candidates.json"; + const reviewValidatedPath = + process.env.REVIEW_VALIDATED_PATH ?? + "$RUNNER_TEMP/droid-prompts/review_validated.json"; + + return `You are validating candidate review comments for PR #${prNumber} in ${repoFullName}. + +IMPORTANT: This is Phase 2 (validator) of a two-pass review pipeline. + +### Context + +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${prHeadRef} +* PR Head SHA: ${prHeadSha} +* PR Base Ref: ${prBaseRef} + +### Inputs + +Read: +* Candidates: \`${reviewCandidatesPath}\` +* Full PR Diff: \`${diffPath}\` +* Existing Comments: \`${commentsPath}\` + +### Outputs + +1) Write validated results to: \`${reviewValidatedPath}\` +2) Post ONLY the approved inline comments to the PR +3) Submit a PR review summary (if applicable) + +======================= + +## CRITICAL REQUIREMENTS + +1. You MUST read and validate **every** candidate before posting anything. +2. For each candidate, confirm: + * It is a real, actionable bug (not speculative) + * There is a realistic trigger path and observable wrong behavior + * The anchor is valid (path + side + line/startLine correspond to the diff) +3. **Posting rule (STRICT):** + * Only post comments where \`status === "approved"\`. + * Never post rejected items. +4. Preserve ordering: keep results in the same order as candidates. + +======================= + +## Phase 1: Load context (REQUIRED) + +1. Read existing comments: + Read \`${commentsPath}\` + +2. Read the COMPLETE diff: + Read \`${diffPath}\` + If large, read in chunks (offset/limit). **Do not proceed until you have read the ENTIRE diff.** + +3. Read candidates: + Read \`${reviewCandidatesPath}\` + +======================= + +## Phase 2: Validate candidates + +Apply the same Reporting Gate as review: + +### Approve ONLY if at least one is true +* Definite runtime failure +* Incorrect logic with a concrete trigger path and wrong outcome +* Security vulnerability with realistic exploit +* Data corruption/loss +* Breaking contract change (discoverable in code/tests) + +Reject if: +* It's speculative / "might" without a concrete trigger +* It's stylistic / naming / formatting +* It's not anchored to a valid changed line +* It's already reported (dedupe against existing comments) + +When rejecting, write a concise reason. + +======================= + +## Phase 3: Write review_validated.json (REQUIRED) + +Write \`${reviewValidatedPath}\` with this schema: + +\`\`\`json +{ + "version": 1, + "meta": { + "repo": "owner/repo", + "prNumber": 123, + "headSha": "", + "baseRef": "main", + "validatedAt": "" + }, + "results": [ + { + "status": "approved", + "comment": { + "path": "src/index.ts", + "body": "[P1] Title\n\n1 paragraph.", + "line": 42, + "startLine": null, + "side": "RIGHT", + "commit_id": "" + } + }, + { + "status": "rejected", + "candidate": { + "path": "src/other.ts", + "body": "[P2] ...", + "line": 10, + "startLine": null, + "side": "RIGHT", + "commit_id": "" + }, + "reason": "Not a real bug because ..." + } + ], + "reviewSummary": { + "status": "approved", + "body": "1–3 sentence overall assessment" + } +} +\`\`\` + +Notes: +* Use \`commit_id\` = \`${prHeadSha}\`. +* \`results\` MUST have exactly one entry per candidate, in the same order. + +Then write the file using the local file tool. + +Tooling note: +* If the tools list includes \`ApplyPatch\` (common for OpenAI models like GPT-5.2), use \`ApplyPatch\` to create/update the file at the exact path. +* Otherwise, use \`Create\` (or \`Edit\` if overwriting) to write the file. + +======================= + +## Phase 4: Post approved items + +After writing \`${reviewValidatedPath}\`, post comments ONLY for \`status === "approved"\`: + +* For each approved entry, call \`github_inline_comment___create_inline_comment\` with the \`comment\` object. +* Submit a review via \`github_pr___submit_review\` using the summary body (if there are any approved items OR a meaningful overall assessment). +* Do not approve or request changes. +`; +} diff --git a/src/entrypoints/prepare-validator.ts b/src/entrypoints/prepare-validator.ts new file mode 100644 index 0000000..f62a942 --- /dev/null +++ b/src/entrypoints/prepare-validator.ts @@ -0,0 +1,51 @@ +#!/usr/bin/env bun + +import * as core from "@actions/core"; +import { setupGitHubToken } from "../github/token"; +import { createOctokit } from "../github/api/client"; +import { parseGitHubContext, isEntityContext } from "../github/context"; +import { prepareReviewValidatorMode } from "../tag/commands/review-validator"; + +async function run() { + try { + const context = parseGitHubContext(); + + if (!isEntityContext(context) || !context.isPR) { + throw new Error("prepare-validator requires a pull request context"); + } + + // This entrypoint only makes sense when the workflow input is enabled. + if ((process.env.REVIEW_USE_VALIDATOR ?? "").trim() !== "true") { + throw new Error("reviewUseValidator must be true to run prepare-validator"); + } + + const githubToken = await setupGitHubToken(); + const octokit = createOctokit(githubToken); + + const trackingCommentId = Number(process.env.DROID_COMMENT_ID); + if (!trackingCommentId || Number.isNaN(trackingCommentId)) { + throw new Error("DROID_COMMENT_ID is required for validator run"); + } + + const result = await prepareReviewValidatorMode({ + context, + octokit, + githubToken, + trackingCommentId, + }); + + core.setOutput("github_token", githubToken); + if (result?.mcpTools) core.setOutput("mcp_tools", result.mcpTools); + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + core.setFailed(`Prepare validator step failed with error: ${errorMessage}`); + core.setOutput("prepare_error", errorMessage); + process.exit(1); + } +} + +export default run; + +if (import.meta.main) { + run(); +} diff --git a/src/tag/commands/review-validator.ts b/src/tag/commands/review-validator.ts new file mode 100644 index 0000000..2648ba3 --- /dev/null +++ b/src/tag/commands/review-validator.ts @@ -0,0 +1,222 @@ +import * as core from "@actions/core"; +import type { GitHubContext } from "../../github/context"; +import { isEntityContext } from "../../github/context"; +import type { Octokits } from "../../github/api/client"; +import { fetchPRBranchData } from "../../github/data/pr-fetcher"; +import { createPrompt } from "../../create-prompt"; +import type { ReviewArtifacts } from "../../create-prompt"; +import { prepareMcpTools } from "../../mcp/install-mcp-server"; +import { normalizeDroidArgs, parseAllowedTools } from "../../utils/parse-tools"; +import type { PrepareResult } from "../../prepare/types"; +import { generateReviewValidatorPrompt } from "../../create-prompt/templates/review-validator-prompt"; +import { execSync } from "child_process"; +import { mkdir, writeFile } from "fs/promises"; + +const DIFF_MAX_BUFFER = 50 * 1024 * 1024; + +async function computeAndStoreDiff( + baseRef: string, + tempDir: string, +): Promise { + const promptsDir = `${tempDir}/droid-prompts`; + await mkdir(promptsDir, { recursive: true }); + + try { + execSync("git fetch --unshallow", { encoding: "utf8", stdio: "pipe" }); + console.log("Unshallowed repository"); + } catch { + console.log("Repository already has full history"); + } + + try { + execSync(`git fetch origin ${baseRef}:refs/remotes/origin/${baseRef}`, { + encoding: "utf8", + stdio: "pipe", + }); + console.log(`Fetched base branch: ${baseRef}`); + } catch (e) { + console.error(`Failed to fetch base branch ${baseRef}: ${e}`); + throw new Error(`Failed to fetch base branch ${baseRef} for review`); + } + + const mergeBase = execSync(`git merge-base HEAD origin/${baseRef}`, { + encoding: "utf8", + stdio: "pipe", + }).trim(); + + const diff = execSync(`git --no-pager diff ${mergeBase}..HEAD`, { + encoding: "utf8", + stdio: "pipe", + maxBuffer: DIFF_MAX_BUFFER, + }); + + const diffPath = `${promptsDir}/pr.diff`; + await writeFile(diffPath, diff); + console.log(`Stored PR diff (${diff.length} bytes) at ${diffPath}`); + + return diffPath; +} + +async function fetchAndStoreComments( + octokit: Octokits, + owner: string, + repo: string, + prNumber: number, + tempDir: string, +): Promise { + const promptsDir = `${tempDir}/droid-prompts`; + await mkdir(promptsDir, { recursive: true }); + + const [issueComments, reviewComments] = await Promise.all([ + octokit.rest.issues.listComments({ owner, repo, issue_number: prNumber }), + octokit.rest.pulls.listReviewComments({ owner, repo, pull_number: prNumber }), + ]); + + const commentsPath = `${promptsDir}/existing_comments.json`; + const payload = { + issueComments: issueComments.data, + reviewComments: reviewComments.data, + }; + await writeFile(commentsPath, JSON.stringify(payload, null, 2)); + console.log( + `Stored existing comments (${issueComments.data.length} issue, ${reviewComments.data.length} review) at ${commentsPath}`, + ); + + return commentsPath; +} + +export async function prepareReviewValidatorMode({ + context, + octokit, + githubToken, + trackingCommentId, +}: { + context: GitHubContext; + octokit: Octokits; + githubToken: string; + trackingCommentId: number; +}): Promise { + if (!isEntityContext(context) || !context.isPR) { + throw new Error("review validator mode requires pull request context"); + } + + const prData = await fetchPRBranchData({ + octokits: octokit, + repository: { owner: context.repository.owner, repo: context.repository.repo }, + prNumber: context.entityNumber, + }); + + const tempDir = process.env.RUNNER_TEMP || "/tmp"; + + // Ensure PR branch is checked out (same as review mode) + try { + execSync("git reset --hard HEAD", { encoding: "utf8", stdio: "pipe" } as any); + execSync(`gh pr checkout ${context.entityNumber}`, { + encoding: "utf8", + stdio: "pipe", + env: { ...process.env, GH_TOKEN: githubToken }, + }); + console.log( + `Successfully checked out PR branch: ${execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf8" } as any).trim()}`, + ); + } catch (e) { + console.error(`Failed to checkout PR branch: ${e}`); + throw new Error(`Failed to checkout PR #${context.entityNumber} branch for review`); + } + + const [diffPath, commentsPath] = await Promise.all([ + computeAndStoreDiff(prData.baseRefName, tempDir), + fetchAndStoreComments( + octokit, + context.repository.owner, + context.repository.repo, + context.entityNumber, + tempDir, + ), + ]); + + const reviewArtifacts: ReviewArtifacts = { diffPath, commentsPath }; + + await createPrompt({ + githubContext: context, + commentId: trackingCommentId, + baseBranch: prData.baseRefName, + droidBranch: prData.headRefName, + prBranchData: { + headRefName: prData.headRefName, + headRefOid: prData.headRefOid, + }, + generatePrompt: generateReviewValidatorPrompt, + reviewArtifacts, + }); + + core.exportVariable("DROID_EXEC_RUN_TYPE", "droid-review"); + + const rawUserArgs = process.env.DROID_ARGS || ""; + const normalizedUserArgs = normalizeDroidArgs(rawUserArgs); + const userAllowedMCPTools = parseAllowedTools(normalizedUserArgs).filter( + (tool) => tool.startsWith("github_") && tool.includes("___"), + ); + + const baseTools = [ + "Read", + "Grep", + "Glob", + "LS", + "Execute", + "ApplyPatch", + "Create", + "Edit", + "github_comment___update_droid_comment", + "github_inline_comment___create_inline_comment", + ]; + + const validatorTools = ["github_pr___submit_review"]; + + const allowedTools = Array.from( + new Set([...baseTools, ...validatorTools, ...userAllowedMCPTools]), + ); + + const mcpTools = await prepareMcpTools({ + githubToken, + owner: context.repository.owner, + repo: context.repository.repo, + droidCommentId: trackingCommentId.toString(), + allowedTools, + mode: "tag", + context, + }); + + const droidArgParts: string[] = []; + droidArgParts.push(`--enabled-tools "${allowedTools.join(",")}"`); + + const reviewModel = process.env.REVIEW_MODEL?.trim(); + const reasoningEffort = process.env.REASONING_EFFORT?.trim(); + + if (!reviewModel && !reasoningEffort) { + droidArgParts.push(`--model "gpt-5.2"`); + droidArgParts.push(`--reasoning-effort "high"`); + } else { + if (reviewModel) droidArgParts.push(`--model "${reviewModel}"`); + if (reasoningEffort) droidArgParts.push(`--reasoning-effort "${reasoningEffort}"`); + } + + if (normalizedUserArgs) { + droidArgParts.push(normalizedUserArgs); + } + + core.setOutput("droid_args", droidArgParts.join(" ").trim()); + core.setOutput("mcp_tools", mcpTools); + core.setOutput("review_pr_number", context.entityNumber.toString()); + core.setOutput("droid_comment_id", trackingCommentId.toString()); + + return { + commentId: trackingCommentId, + branchInfo: { + baseBranch: prData.baseRefName, + droidBranch: prData.headRefName, + currentBranch: prData.headRefName, + }, + mcpTools, + }; +} diff --git a/src/tag/commands/review.ts b/src/tag/commands/review.ts index dd43d14..569405f 100644 --- a/src/tag/commands/review.ts +++ b/src/tag/commands/review.ts @@ -10,6 +10,7 @@ import { createInitialComment } from "../../github/operations/comments/create-in import { normalizeDroidArgs, parseAllowedTools } from "../../utils/parse-tools"; import { isEntityContext } from "../../github/context"; import { generateReviewPrompt } from "../../create-prompt/templates/review-prompt"; +import { generateReviewCandidatesPrompt } from "../../create-prompt/templates/review-candidates-prompt"; import type { Octokits } from "../../github/api/client"; import type { PrepareResult } from "../../prepare/types"; @@ -173,6 +174,9 @@ export async function prepareReviewMode({ commentsPath, }; + const reviewUseValidator = + (process.env.REVIEW_USE_VALIDATOR ?? "").trim() === "true"; + await createPrompt({ githubContext: context, commentId, @@ -182,7 +186,9 @@ export async function prepareReviewMode({ headRefName: prData.headRefName, headRefOid: prData.headRefOid, }, - generatePrompt: generateReviewPrompt, + generatePrompt: reviewUseValidator + ? generateReviewCandidatesPrompt + : generateReviewPrompt, reviewArtifacts, }); core.exportVariable("DROID_EXEC_RUN_TYPE", "droid-review"); @@ -199,21 +205,34 @@ export async function prepareReviewMode({ "Glob", "LS", "Execute", + "Create", + "ApplyPatch", "github_comment___update_droid_comment", - "github_inline_comment___create_inline_comment", ]; - const reviewTools = [ - "github_pr___list_review_comments", - "github_pr___submit_review", - "github_pr___delete_comment", - "github_pr___minimize_comment", - "github_pr___reply_to_comment", - "github_pr___resolve_review_thread", - ]; + const reviewTools = reviewUseValidator + ? [] + : [ + "github_inline_comment___create_inline_comment", + "github_pr___list_review_comments", + "github_pr___submit_review", + "github_pr___delete_comment", + "github_pr___minimize_comment", + "github_pr___reply_to_comment", + "github_pr___resolve_review_thread", + ]; + + const safeUserAllowedMCPTools = reviewUseValidator + ? userAllowedMCPTools.filter( + (tool) => + tool === "github_comment___update_droid_comment" || + (!tool.startsWith("github_pr___") && + tool !== "github_inline_comment___create_inline_comment"), + ) + : userAllowedMCPTools; const allowedTools = Array.from( - new Set([...baseTools, ...reviewTools, ...userAllowedMCPTools]), + new Set([...baseTools, ...reviewTools, ...safeUserAllowedMCPTools]), ); const mcpTools = await prepareMcpTools({ diff --git a/test/entrypoints/prepare-validator.test.ts b/test/entrypoints/prepare-validator.test.ts new file mode 100644 index 0000000..6b3bc80 --- /dev/null +++ b/test/entrypoints/prepare-validator.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it, spyOn } from "bun:test"; +import * as core from "@actions/core"; +import * as token from "../../src/github/token"; +import * as client from "../../src/github/api/client"; +import * as contextMod from "../../src/github/context"; +import * as validator from "../../src/tag/commands/review-validator"; + +describe("prepare-validator entrypoint", () => { + it("fails when reviewUseValidator is false", async () => { + const setFailedSpy = spyOn(core, "setFailed").mockImplementation(() => {}); + const setOutputSpy = spyOn(core, "setOutput").mockImplementation(() => {}); + const exitSpy = spyOn(process, "exit").mockImplementation((() => { + throw new Error("process.exit"); + }) as any); + + spyOn(contextMod, "parseGitHubContext").mockReturnValue({ + eventName: "issue_comment", + runId: "1", + repository: { owner: "o", repo: "r", full_name: "o/r" }, + actor: "a", + inputs: { + triggerPhrase: "@droid", + assigneeTrigger: "", + labelTrigger: "droid", + useStickyComment: false, + allowedBots: "", + allowedNonWriteUsers: "", + trackProgress: false, + automaticReview: false, + }, + payload: {} as any, + entityNumber: 1, + isPR: true, + } as any); + + process.env.REVIEW_USE_VALIDATOR = "false"; + process.env.DROID_COMMENT_ID = "123"; + + spyOn(token, "setupGitHubToken").mockResolvedValue("token"); + spyOn(client, "createOctokit").mockReturnValue({} as any); + spyOn(validator, "prepareReviewValidatorMode").mockResolvedValue({ + commentId: 123, + branchInfo: { baseBranch: "main", droidBranch: "feat" }, + mcpTools: "{}", + } as any); + + const mod = await import( + `../../src/entrypoints/prepare-validator.ts?test=${Math.random()}`, + ); + + await expect(mod.default()).rejects.toBeTruthy(); + + expect(setFailedSpy).toHaveBeenCalled(); + expect(setOutputSpy).toHaveBeenCalledWith( + "prepare_error", + expect.stringContaining("reviewUseValidator"), + ); + expect(exitSpy).toHaveBeenCalledWith(1); + + setFailedSpy.mockRestore(); + setOutputSpy.mockRestore(); + exitSpy.mockRestore(); + }); +}); From a7066fbd09c0c4d0ef3dfa78f1d805334975caa6 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 15:56:32 -0800 Subject: [PATCH 20/33] increase candidate volume/coverage --- .../templates/review-candidates-prompt.ts | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 34619ac..f5a2c34 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -70,10 +70,34 @@ You must write a single JSON object with the schema below. ## Phase 2: Candidate Generation -Generate **high-confidence, actionable** candidate inline comments following the same standards as the single-pass review: +Generate **actionable** candidate inline comments. + +This phase is optimized for recall: you MAY include candidates that are **likely-real** (not 100% proven), because Phase 2 (validator) will filter. However, every candidate MUST still include: + +* A concrete trigger path (inputs/state that makes it happen) +* An observable bad outcome (exception type, wrong return, corrupted data, violated security property) +* The exact relevant symbols (function/class/variable names) so the validator can verify quickly + +### Mandatory second-pass recall sweep (REQUIRED) + +After completing your first pass, do a second sweep over **every changed file** specifically hunting for bugs in these high-yield categories: + +* Optional/None dereferences (Optional types, nullable attributes, missing membership/context) +* Missing-key errors on external/untrusted dict/JSON payloads (KeyError, NoneType access) +* Wrong-variable / shadowing mistakes (e.g., using an outer variable instead of the scoped/local one) +* Type assumption bugs (e.g., numeric ops like math.floor/ceil on datetime/strings) +* Serializer/validated_data contract mismatches (field is named one thing but code reads another) +* Abstract base class contract issues (subclass missing required abstract members; runtime TypeError on instantiation) +* Concurrency/process lifecycle hazards (spawn process types, isinstance guards, join/terminate loops) +* Offset/cursor semantics mismatches (off-by-one, commit semantics, prev/next cursor behavior) +* OAuth/security invariants (e.g., OAuth state must be per-flow unpredictable; deterministic state is a vulnerability) + +If you find a candidate during this sweep, it must still meet the Reporting Gate below, but do not stop early—complete the sweep across all files. ### Reporting Gate (same as review) +Do NOT over-focus on P1 crashes. Also include medium-severity correctness/contract issues when they have a concrete trigger and a clear wrong outcome. + Only include candidates that meet at least one: * Definite runtime failure * Incorrect logic with a concrete trigger path and wrong outcome From 03d7bc8c1ad00d94ef93a4322be6c790370d54a3 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 17:10:26 -0800 Subject: [PATCH 21/33] increase candidate volume/coverage 2 --- .../templates/review-candidates-prompt.ts | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index f5a2c34..0c50444 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -70,13 +70,7 @@ You must write a single JSON object with the schema below. ## Phase 2: Candidate Generation -Generate **actionable** candidate inline comments. - -This phase is optimized for recall: you MAY include candidates that are **likely-real** (not 100% proven), because Phase 2 (validator) will filter. However, every candidate MUST still include: - -* A concrete trigger path (inputs/state that makes it happen) -* An observable bad outcome (exception type, wrong return, corrupted data, violated security property) -* The exact relevant symbols (function/class/variable names) so the validator can verify quickly +Generate **actionable** candidate inline comments: ### Mandatory second-pass recall sweep (REQUIRED) @@ -96,9 +90,7 @@ If you find a candidate during this sweep, it must still meet the Reporting Gate ### Reporting Gate (same as review) -Do NOT over-focus on P1 crashes. Also include medium-severity correctness/contract issues when they have a concrete trigger and a clear wrong outcome. - -Only include candidates that meet at least one: +Include candidates that meet one of these: * Definite runtime failure * Incorrect logic with a concrete trigger path and wrong outcome * Security vulnerability with realistic exploit From 10804bd758907ed6f3b7cb8109df43fc3f11b2b8 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 17:31:09 -0800 Subject: [PATCH 22/33] Revert "increase candidate volume/coverage 2" This reverts commit 03d7bc8c1ad00d94ef93a4322be6c790370d54a3. --- .../templates/review-candidates-prompt.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 0c50444..f5a2c34 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -70,7 +70,13 @@ You must write a single JSON object with the schema below. ## Phase 2: Candidate Generation -Generate **actionable** candidate inline comments: +Generate **actionable** candidate inline comments. + +This phase is optimized for recall: you MAY include candidates that are **likely-real** (not 100% proven), because Phase 2 (validator) will filter. However, every candidate MUST still include: + +* A concrete trigger path (inputs/state that makes it happen) +* An observable bad outcome (exception type, wrong return, corrupted data, violated security property) +* The exact relevant symbols (function/class/variable names) so the validator can verify quickly ### Mandatory second-pass recall sweep (REQUIRED) @@ -90,7 +96,9 @@ If you find a candidate during this sweep, it must still meet the Reporting Gate ### Reporting Gate (same as review) -Include candidates that meet one of these: +Do NOT over-focus on P1 crashes. Also include medium-severity correctness/contract issues when they have a concrete trigger and a clear wrong outcome. + +Only include candidates that meet at least one: * Definite runtime failure * Incorrect logic with a concrete trigger path and wrong outcome * Security vulnerability with realistic exploit From c275c9bf3ac655f60a05be25ccd8024759217f79 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 17:31:22 -0800 Subject: [PATCH 23/33] Revert "increase candidate volume/coverage" This reverts commit a7066fbd09c0c4d0ef3dfa78f1d805334975caa6. --- .../templates/review-candidates-prompt.ts | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index f5a2c34..34619ac 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -70,34 +70,10 @@ You must write a single JSON object with the schema below. ## Phase 2: Candidate Generation -Generate **actionable** candidate inline comments. - -This phase is optimized for recall: you MAY include candidates that are **likely-real** (not 100% proven), because Phase 2 (validator) will filter. However, every candidate MUST still include: - -* A concrete trigger path (inputs/state that makes it happen) -* An observable bad outcome (exception type, wrong return, corrupted data, violated security property) -* The exact relevant symbols (function/class/variable names) so the validator can verify quickly - -### Mandatory second-pass recall sweep (REQUIRED) - -After completing your first pass, do a second sweep over **every changed file** specifically hunting for bugs in these high-yield categories: - -* Optional/None dereferences (Optional types, nullable attributes, missing membership/context) -* Missing-key errors on external/untrusted dict/JSON payloads (KeyError, NoneType access) -* Wrong-variable / shadowing mistakes (e.g., using an outer variable instead of the scoped/local one) -* Type assumption bugs (e.g., numeric ops like math.floor/ceil on datetime/strings) -* Serializer/validated_data contract mismatches (field is named one thing but code reads another) -* Abstract base class contract issues (subclass missing required abstract members; runtime TypeError on instantiation) -* Concurrency/process lifecycle hazards (spawn process types, isinstance guards, join/terminate loops) -* Offset/cursor semantics mismatches (off-by-one, commit semantics, prev/next cursor behavior) -* OAuth/security invariants (e.g., OAuth state must be per-flow unpredictable; deterministic state is a vulnerability) - -If you find a candidate during this sweep, it must still meet the Reporting Gate below, but do not stop early—complete the sweep across all files. +Generate **high-confidence, actionable** candidate inline comments following the same standards as the single-pass review: ### Reporting Gate (same as review) -Do NOT over-focus on P1 crashes. Also include medium-severity correctness/contract issues when they have a concrete trigger and a clear wrong outcome. - Only include candidates that meet at least one: * Definite runtime failure * Incorrect logic with a concrete trigger path and wrong outcome From 9502f9e12ab6fcb7e3487075f244ca0ebb3d3937 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 21 Jan 2026 17:37:19 -0800 Subject: [PATCH 24/33] add thoroughness --- .../templates/review-candidates-prompt.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 34619ac..08c6c09 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -66,6 +66,25 @@ You must write a single JSON object with the schema below. 3. List every changed file (your checklist) and review ALL of them. +4. Read the underlying repo files for backend changes (REQUIRED) + +For each changed **backend** file, you MUST read the actual file from the checked-out workspace (not just the diff) to get surrounding context. + +Backend files include (at minimum): +* Any changed file under \`src/\` that is NOT under \`static/\` and ends in \`.py\`, \`.ts\`, or \`.js\`. + +How to read files: +* Use the absolute repo root shown in the system info (\`% pwd\`) and append the file path. + * Example: if \`pwd\` is \`/home/runner/work/droid-sentry/droid-sentry\` and the diff includes \`src/sentry/api/paginator.py\`, then read: + \`/home/runner/work/droid-sentry/droid-sentry/src/sentry/api/paginator.py\` +* If a file is very large, it is OK to read it in targeted chunks, but you MUST include: + * the changed hunk region(s), plus + * the definition(s) of any key symbols referenced by your candidates. + +Work quota (STRICT): +* Before writing output, you MUST read at least \`min(10, number_of_changed_backend_files)\` backend files from the repo root. +* In your final response, list the exact backend file paths you read (for auditability). + ======================= ## Phase 2: Candidate Generation From 56ab298a10e94e1b08b2d6f24b1052c4113d943e Mon Sep 17 00:00:00 2001 From: User Date: Fri, 23 Jan 2026 15:53:30 -0800 Subject: [PATCH 25/33] improvements to candidate and validator prompts --- .../templates/review-candidates-prompt.ts | 150 ++++++------------ .../templates/review-validator-prompt.ts | 149 ++++++++--------- 2 files changed, 114 insertions(+), 185 deletions(-) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 08c6c09..462f7b9 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -22,99 +22,32 @@ export function generateReviewCandidatesPrompt(context: PreparedContext): string process.env.REVIEW_CANDIDATES_PATH ?? "$RUNNER_TEMP/droid-prompts/review_candidates.json"; - return `You are generating **candidate** inline review comments for PR #${prNumber} in ${repoFullName}. + return `You are a senior staff software engineer and expert code reviewer. -IMPORTANT: This is Phase 1 of a two-pass review pipeline. +Your task: Review PR #${prNumber} in ${repoFullName} and generate a JSON file with **high-confidence, actionable** review comments that pinpoint genuine issues. -### Context + +Repo: ${repoFullName} +PR Number: ${prNumber} +PR Head Ref: ${prHeadRef} +PR Head SHA: ${prHeadSha} +PR Base Ref: ${prBaseRef} -* Repo: ${repoFullName} -* PR Number: ${prNumber} -* PR Head Ref: ${prHeadRef} -* PR Head SHA: ${prHeadSha} -* PR Base Ref: ${prBaseRef} +Precomputed data files: +- Full PR Diff: \`${diffPath}\` +- Existing Comments: \`${commentsPath}\` + -### Pre-computed Review Artifacts + +- You are currently checked out to the PR branch. +- Review ALL modified files in the PR branch. +- Focus on: functional correctness, syntax errors, logic bugs, broken dependencies/contracts/tests, security issues, and performance problems. +- Do NOT duplicate comments already in \`${commentsPath}\`. +- Only flag issues you are confident about—avoid speculative or stylistic nitpicks. + -The following files have been pre-computed and contain the COMPLETE data for this PR: - -* **Full PR Diff**: \`${diffPath}\` -* **Existing Comments**: \`${commentsPath}\` - -### Output - -Write your candidates to: \`${reviewCandidatesPath}\` - -You must write a single JSON object with the schema below. - -### CRITICAL RULES - -* **DO NOT** post to GitHub in this run. -* **DO NOT** call any PR mutation tools (inline comments, submit review, delete/minimize/reply/resolve, etc.). -* You MAY update the tracking comment for progress. - -======================= - -## Phase 1: Context Gathering (REQUIRED — do not output yet) - -1. Read existing comments: - Read \`${commentsPath}\` - -2. Read the COMPLETE diff: - Read \`${diffPath}\` - If large, read in chunks (offset/limit). **Do not proceed until you have read the ENTIRE diff.** - -3. List every changed file (your checklist) and review ALL of them. - -4. Read the underlying repo files for backend changes (REQUIRED) - -For each changed **backend** file, you MUST read the actual file from the checked-out workspace (not just the diff) to get surrounding context. - -Backend files include (at minimum): -* Any changed file under \`src/\` that is NOT under \`static/\` and ends in \`.py\`, \`.ts\`, or \`.js\`. - -How to read files: -* Use the absolute repo root shown in the system info (\`% pwd\`) and append the file path. - * Example: if \`pwd\` is \`/home/runner/work/droid-sentry/droid-sentry\` and the diff includes \`src/sentry/api/paginator.py\`, then read: - \`/home/runner/work/droid-sentry/droid-sentry/src/sentry/api/paginator.py\` -* If a file is very large, it is OK to read it in targeted chunks, but you MUST include: - * the changed hunk region(s), plus - * the definition(s) of any key symbols referenced by your candidates. - -Work quota (STRICT): -* Before writing output, you MUST read at least \`min(10, number_of_changed_backend_files)\` backend files from the repo root. -* In your final response, list the exact backend file paths you read (for auditability). - -======================= - -## Phase 2: Candidate Generation - -Generate **high-confidence, actionable** candidate inline comments following the same standards as the single-pass review: - -### Reporting Gate (same as review) - -Only include candidates that meet at least one: -* Definite runtime failure -* Incorrect logic with a concrete trigger path and wrong outcome -* Security vulnerability with realistic exploit -* Data corruption/loss -* Breaking contract change (discoverable in code/tests) - -Do NOT include: -* Style/naming/formatting -* "What-if" speculation without a realistic execution path -* Vague suggestions to add guards/try-catch without a concrete failure - -### Deduplication - -Use \`${commentsPath}\` to avoid duplicating issues already reported by this bot. -If an issue appears fixed, do NOT create a new candidate; the validator run will handle replies. - -======================= - -## Phase 3: Write candidates JSON (REQUIRED) - -Write \`${reviewCandidatesPath}\` with this schema: + +Write output to \`${reviewCandidatesPath}\` using this exact schema: \`\`\`json { @@ -137,20 +70,39 @@ Write \`${reviewCandidatesPath}\` with this schema: } ], "reviewSummary": { - "body": "1–3 sentence overall assessment" + "body": "1-3 sentence overall assessment" } } \`\`\` -Notes: -* \`comments[]\` entries MUST match the input shape of \`github_inline_comment___create_inline_comment\`. -* Use \`commit_id\` = \`${prHeadSha}\`. -* \`startLine\` should be \`null\` unless you are making a multi-line comment. - -Then write the file using the local file tool. - -Tooling note: -* If the tools list includes \`ApplyPatch\` (common for OpenAI models like GPT-5.2), use \`ApplyPatch\` to create/update the file at the exact path. -* Otherwise, use \`Create\` (or \`Edit\` if overwriting) to write the file. + +- **version**: Always \`1\` + +- **meta**: Metadata object + - \`repo\`: "${repoFullName}" + - \`prNumber\`: ${prNumber} + - \`headSha\`: "${prHeadSha}" + - \`baseRef\`: "${prBaseRef}" + - \`generatedAt\`: ISO 8601 timestamp (e.g., "2024-01-15T10:30:00Z") + +- **comments**: Array of comment objects + - \`path\`: Relative file path (e.g., "src/index.ts") + - \`body\`: Comment text starting with priority tag [P0|P1|P2], then title, then 1 paragraph explanation + - \`line\`: Target line number (single-line) or end line number (multi-line). Must be ≥ 0. + - \`startLine\`: \`null\` for single-line comments, or start line number for multi-line comments + - \`side\`: "RIGHT" for new/modified code (default), "LEFT" only for removed code + - \`commit_id\`: "${prHeadSha}" + +- **reviewSummary**: + - \`body\`: 1-3 sentence overall assessment + + + + +**DO NOT** post to GitHub. +**DO NOT** invoke any PR mutation tools (inline comments, submit review, delete/minimize/reply/resolve, etc.). +**DO NOT** modify any files other than writing to \`${reviewCandidatesPath}\`. +Output ONLY the JSON file—no additional commentary. + `; } diff --git a/src/create-prompt/templates/review-validator-prompt.ts b/src/create-prompt/templates/review-validator-prompt.ts index 38561cf..1883904 100644 --- a/src/create-prompt/templates/review-validator-prompt.ts +++ b/src/create-prompt/templates/review-validator-prompt.ts @@ -25,85 +25,44 @@ export function generateReviewValidatorPrompt(context: PreparedContext): string process.env.REVIEW_VALIDATED_PATH ?? "$RUNNER_TEMP/droid-prompts/review_validated.json"; - return `You are validating candidate review comments for PR #${prNumber} in ${repoFullName}. + return `You are a senior staff software engineer and expert code reviewer. +Your task: Act as a validator agent by reviewing candidate review comments for PR #${prNumber} in ${repoFullName}. Your primary objective is to identify and filter out false positives and unclear/vague comments. For each candidate, compare the comment against the codebase and related diff, and mark each as "approved" (if valid, clear, and actionable) or "rejected" (if it is a false positive, unclear/vague, stylistic, or a duplicate). Write your validation results to \`${reviewValidatedPath}\`, and submit only approved comments to GitHub. -IMPORTANT: This is Phase 2 (validator) of a two-pass review pipeline. + +Repo: ${repoFullName} +PR Number: ${prNumber} +PR Head Ref: ${prHeadRef} +PR Head SHA: ${prHeadSha} +PR Base Ref: ${prBaseRef} -### Context +Input files: +- Comment Candidates: \`${reviewCandidatesPath}\` +- Full PR Diff: \`${diffPath}\` +- Existing Comments on the PR: \`${commentsPath}\` -* Repo: ${repoFullName} -* PR Number: ${prNumber} -* PR Head Ref: ${prHeadRef} -* PR Head SHA: ${prHeadSha} -* PR Base Ref: ${prBaseRef} +Output file: +- Validated results: \`${reviewValidatedPath}\` + -### Inputs -Read: -* Candidates: \`${reviewCandidatesPath}\` -* Full PR Diff: \`${diffPath}\` -* Existing Comments: \`${commentsPath}\` + +## Phase 1: Validate Comment Candidates -### Outputs + +- You are currently checked out to the PR branch. +- For every candidate comment in \`${reviewCandidatesPath}\`, compare it against the codebase and PR diff to determine its validity and clarity. -1) Write validated results to: \`${reviewValidatedPath}\` -2) Post ONLY the approved inline comments to the PR -3) Submit a PR review summary (if applicable) +**Reject** any candidate that meets the following criteria: +- False positive: Not an actual issue when matched against the codebase context. +- Unclear or vague: Lacks sufficient detail to be actionable. +- Stylistic / naming / formatting only. +- Already reported (duplicate of existing comment). +- For valid and clear candidates, mark as "approved"; for others, mark as "rejected" with a concise reason (e.g., "false positive", "unclear/vague", etc.). + -======================= -## CRITICAL REQUIREMENTS - -1. You MUST read and validate **every** candidate before posting anything. -2. For each candidate, confirm: - * It is a real, actionable bug (not speculative) - * There is a realistic trigger path and observable wrong behavior - * The anchor is valid (path + side + line/startLine correspond to the diff) -3. **Posting rule (STRICT):** - * Only post comments where \`status === "approved"\`. - * Never post rejected items. -4. Preserve ordering: keep results in the same order as candidates. - -======================= - -## Phase 1: Load context (REQUIRED) - -1. Read existing comments: - Read \`${commentsPath}\` - -2. Read the COMPLETE diff: - Read \`${diffPath}\` - If large, read in chunks (offset/limit). **Do not proceed until you have read the ENTIRE diff.** - -3. Read candidates: - Read \`${reviewCandidatesPath}\` - -======================= - -## Phase 2: Validate candidates - -Apply the same Reporting Gate as review: - -### Approve ONLY if at least one is true -* Definite runtime failure -* Incorrect logic with a concrete trigger path and wrong outcome -* Security vulnerability with realistic exploit -* Data corruption/loss -* Breaking contract change (discoverable in code/tests) - -Reject if: -* It's speculative / "might" without a concrete trigger -* It's stylistic / naming / formatting -* It's not anchored to a valid changed line -* It's already reported (dedupe against existing comments) - -When rejecting, write a concise reason. - -======================= - -## Phase 3: Write review_validated.json (REQUIRED) - -Write \`${reviewValidatedPath}\` with this schema: + +Write to \`${reviewValidatedPath}\` using this exact schema: \`\`\`json { @@ -142,29 +101,47 @@ Write \`${reviewValidatedPath}\` with this schema: ], "reviewSummary": { "status": "approved", - "body": "1–3 sentence overall assessment" + "body": "1-3 sentence overall assessment" } } \`\`\` -Notes: -* Use \`commit_id\` = \`${prHeadSha}\`. -* \`results\` MUST have exactly one entry per candidate, in the same order. - -Then write the file using the local file tool. - -Tooling note: -* If the tools list includes \`ApplyPatch\` (common for OpenAI models like GPT-5.2), use \`ApplyPatch\` to create/update the file at the exact path. -* Otherwise, use \`Create\` (or \`Edit\` if overwriting) to write the file. + +- **version**: Always \`1\` -======================= +- **meta**: Metadata object + - \`repo\`: "${repoFullName}" + - \`prNumber\`: ${prNumber} + - \`headSha\`: "${prHeadSha}" + - \`baseRef\`: "${prBaseRef}" + - \`validatedAt\`: ISO 8601 timestamp -## Phase 4: Post approved items +- **results**: Array with exactly one entry per candidate (same order) + - For approved: \`{ "status": "approved", "comment": {...} }\` + - For rejected: \`{ "status": "rejected", "candidate": {...}, "reason": "..." }\` -After writing \`${reviewValidatedPath}\`, post comments ONLY for \`status === "approved"\`: +- **reviewSummary**: + - \`status\`: "approved" or "rejected" + - \`body\`: 1-3 sentence overall assessment -* For each approved entry, call \`github_inline_comment___create_inline_comment\` with the \`comment\` object. -* Submit a review via \`github_pr___submit_review\` using the summary body (if there are any approved items OR a meaningful overall assessment). -* Do not approve or request changes. +Notes: +* Use \`commit_id\` = \`${prHeadSha}\`. + + + + +- You MUST read and validate **every** candidate before posting anything. +- You MUST reject false positives or unclear/vague comments. +- You MUST write \`${reviewValidatedPath}\` before posting any comments. +- ONLY post comments where \`status === "approved"\`—never post rejected items. +- Preserve ordering: results must match candidate order exactly. + + + +## Phase 2: Post approved items +- Call \`github_inline_comment___create_inline_comment\` for each \`status === "approved"\` entry +- Call \`github_pr___submit_review\` with summary (if any approved items or meaningful assessment) +- Do NOT approve or request changes on the review + `; } From b89f550dedc3548395977c171c881293884e7c3a Mon Sep 17 00:00:00 2001 From: User Date: Tue, 27 Jan 2026 10:41:01 -0800 Subject: [PATCH 26/33] add common classes of bugs to candidate prompt --- .../templates/review-candidates-prompt.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 462f7b9..71a020c 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -42,6 +42,16 @@ Precomputed data files: - You are currently checked out to the PR branch. - Review ALL modified files in the PR branch. - Focus on: functional correctness, syntax errors, logic bugs, broken dependencies/contracts/tests, security issues, and performance problems. +- High-signal bug patterns to actively check for (only comment when evidenced in the diff): + - Null/undefined/Optional dereferences; missing-key errors on untrusted/external dict/JSON payloads + - Resource leaks (unclosed files/streams/connections; missing cleanup on error paths) + - Injection vulnerabilities (SQL injection, XSS, command/template injection) and auth/security invariant violations + - OAuth/CSRF invariants: state must be per-flow unpredictable and validated; avoid deterministic/predictable state or missing state checks + - Concurrency/race/atomicity hazards (TOCTOU, lost updates, unsafe shared state, process/thread lifecycle bugs) + - Missing error handling for critical operations (network, persistence, auth, migrations, external APIs) + - Wrong-variable/shadowing mistakes; contract mismatches (serializer/validated_data, interfaces/abstract methods) + - Type-assumption bugs (e.g., numeric ops on datetime/strings, ordering key type mismatches) + - Offset/cursor/pagination semantic mismatches (off-by-one, prev/next behavior, commit semantics) - Do NOT duplicate comments already in \`${commentsPath}\`. - Only flag issues you are confident about—avoid speculative or stylistic nitpicks. From 2ad216426b5a2129516d4d7ad3e8b7eba1cbbb54 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 27 Jan 2026 15:37:58 -0800 Subject: [PATCH 27/33] old validator prompt --- .../templates/review-validator-prompt.ts | 149 ++++++++++-------- 1 file changed, 86 insertions(+), 63 deletions(-) diff --git a/src/create-prompt/templates/review-validator-prompt.ts b/src/create-prompt/templates/review-validator-prompt.ts index 1883904..38561cf 100644 --- a/src/create-prompt/templates/review-validator-prompt.ts +++ b/src/create-prompt/templates/review-validator-prompt.ts @@ -25,44 +25,85 @@ export function generateReviewValidatorPrompt(context: PreparedContext): string process.env.REVIEW_VALIDATED_PATH ?? "$RUNNER_TEMP/droid-prompts/review_validated.json"; - return `You are a senior staff software engineer and expert code reviewer. -Your task: Act as a validator agent by reviewing candidate review comments for PR #${prNumber} in ${repoFullName}. Your primary objective is to identify and filter out false positives and unclear/vague comments. For each candidate, compare the comment against the codebase and related diff, and mark each as "approved" (if valid, clear, and actionable) or "rejected" (if it is a false positive, unclear/vague, stylistic, or a duplicate). Write your validation results to \`${reviewValidatedPath}\`, and submit only approved comments to GitHub. + return `You are validating candidate review comments for PR #${prNumber} in ${repoFullName}. - -Repo: ${repoFullName} -PR Number: ${prNumber} -PR Head Ref: ${prHeadRef} -PR Head SHA: ${prHeadSha} -PR Base Ref: ${prBaseRef} +IMPORTANT: This is Phase 2 (validator) of a two-pass review pipeline. -Input files: -- Comment Candidates: \`${reviewCandidatesPath}\` -- Full PR Diff: \`${diffPath}\` -- Existing Comments on the PR: \`${commentsPath}\` +### Context -Output file: -- Validated results: \`${reviewValidatedPath}\` - +* Repo: ${repoFullName} +* PR Number: ${prNumber} +* PR Head Ref: ${prHeadRef} +* PR Head SHA: ${prHeadSha} +* PR Base Ref: ${prBaseRef} +### Inputs - -## Phase 1: Validate Comment Candidates +Read: +* Candidates: \`${reviewCandidatesPath}\` +* Full PR Diff: \`${diffPath}\` +* Existing Comments: \`${commentsPath}\` - -- You are currently checked out to the PR branch. -- For every candidate comment in \`${reviewCandidatesPath}\`, compare it against the codebase and PR diff to determine its validity and clarity. +### Outputs -**Reject** any candidate that meets the following criteria: -- False positive: Not an actual issue when matched against the codebase context. -- Unclear or vague: Lacks sufficient detail to be actionable. -- Stylistic / naming / formatting only. -- Already reported (duplicate of existing comment). -- For valid and clear candidates, mark as "approved"; for others, mark as "rejected" with a concise reason (e.g., "false positive", "unclear/vague", etc.). - +1) Write validated results to: \`${reviewValidatedPath}\` +2) Post ONLY the approved inline comments to the PR +3) Submit a PR review summary (if applicable) +======================= - -Write to \`${reviewValidatedPath}\` using this exact schema: +## CRITICAL REQUIREMENTS + +1. You MUST read and validate **every** candidate before posting anything. +2. For each candidate, confirm: + * It is a real, actionable bug (not speculative) + * There is a realistic trigger path and observable wrong behavior + * The anchor is valid (path + side + line/startLine correspond to the diff) +3. **Posting rule (STRICT):** + * Only post comments where \`status === "approved"\`. + * Never post rejected items. +4. Preserve ordering: keep results in the same order as candidates. + +======================= + +## Phase 1: Load context (REQUIRED) + +1. Read existing comments: + Read \`${commentsPath}\` + +2. Read the COMPLETE diff: + Read \`${diffPath}\` + If large, read in chunks (offset/limit). **Do not proceed until you have read the ENTIRE diff.** + +3. Read candidates: + Read \`${reviewCandidatesPath}\` + +======================= + +## Phase 2: Validate candidates + +Apply the same Reporting Gate as review: + +### Approve ONLY if at least one is true +* Definite runtime failure +* Incorrect logic with a concrete trigger path and wrong outcome +* Security vulnerability with realistic exploit +* Data corruption/loss +* Breaking contract change (discoverable in code/tests) + +Reject if: +* It's speculative / "might" without a concrete trigger +* It's stylistic / naming / formatting +* It's not anchored to a valid changed line +* It's already reported (dedupe against existing comments) + +When rejecting, write a concise reason. + +======================= + +## Phase 3: Write review_validated.json (REQUIRED) + +Write \`${reviewValidatedPath}\` with this schema: \`\`\`json { @@ -101,47 +142,29 @@ Write to \`${reviewValidatedPath}\` using this exact schema: ], "reviewSummary": { "status": "approved", - "body": "1-3 sentence overall assessment" + "body": "1–3 sentence overall assessment" } } \`\`\` - -- **version**: Always \`1\` +Notes: +* Use \`commit_id\` = \`${prHeadSha}\`. +* \`results\` MUST have exactly one entry per candidate, in the same order. + +Then write the file using the local file tool. -- **meta**: Metadata object - - \`repo\`: "${repoFullName}" - - \`prNumber\`: ${prNumber} - - \`headSha\`: "${prHeadSha}" - - \`baseRef\`: "${prBaseRef}" - - \`validatedAt\`: ISO 8601 timestamp +Tooling note: +* If the tools list includes \`ApplyPatch\` (common for OpenAI models like GPT-5.2), use \`ApplyPatch\` to create/update the file at the exact path. +* Otherwise, use \`Create\` (or \`Edit\` if overwriting) to write the file. -- **results**: Array with exactly one entry per candidate (same order) - - For approved: \`{ "status": "approved", "comment": {...} }\` - - For rejected: \`{ "status": "rejected", "candidate": {...}, "reason": "..." }\` +======================= -- **reviewSummary**: - - \`status\`: "approved" or "rejected" - - \`body\`: 1-3 sentence overall assessment +## Phase 4: Post approved items -Notes: -* Use \`commit_id\` = \`${prHeadSha}\`. - - - - -- You MUST read and validate **every** candidate before posting anything. -- You MUST reject false positives or unclear/vague comments. -- You MUST write \`${reviewValidatedPath}\` before posting any comments. -- ONLY post comments where \`status === "approved"\`—never post rejected items. -- Preserve ordering: results must match candidate order exactly. - - - -## Phase 2: Post approved items -- Call \`github_inline_comment___create_inline_comment\` for each \`status === "approved"\` entry -- Call \`github_pr___submit_review\` with summary (if any approved items or meaningful assessment) -- Do NOT approve or request changes on the review - +After writing \`${reviewValidatedPath}\`, post comments ONLY for \`status === "approved"\`: + +* For each approved entry, call \`github_inline_comment___create_inline_comment\` with the \`comment\` object. +* Submit a review via \`github_pr___submit_review\` using the summary body (if there are any approved items OR a meaningful overall assessment). +* Do not approve or request changes. `; } From 00e3ea98a6976e96dbd48b3434b07614f14c642e Mon Sep 17 00:00:00 2001 From: User Date: Tue, 27 Jan 2026 18:26:10 -0800 Subject: [PATCH 28/33] Parallel Review: Phase 1 - Add file-group-reviewer subagent --- .factory/droids/file-group-reviewer.md | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .factory/droids/file-group-reviewer.md diff --git a/.factory/droids/file-group-reviewer.md b/.factory/droids/file-group-reviewer.md new file mode 100644 index 0000000..7ba489b --- /dev/null +++ b/.factory/droids/file-group-reviewer.md @@ -0,0 +1,74 @@ +--- +name: file-group-reviewer +description: Reviews an assigned subset of PR files for bugs, security issues, and correctness problems. Spawned in parallel by the main review agent to ensure thorough coverage. +model: inherit +tools: ["Read", "Grep", "Glob", "LS"] +--- + +You are a senior staff software engineer and expert code reviewer. + +Your task: Review the assigned files from the PR and generate a JSON array of **high-confidence, actionable** review comments that pinpoint genuine issues. + + +- You are currently checked out to the PR branch. +- Review ALL files assigned to you thoroughly. +- Focus on: functional correctness, syntax errors, logic bugs, broken dependencies/contracts/tests, security issues, and performance problems. +- High-signal bug patterns to actively check for (only comment when evidenced in the diff): + - Null/undefined/Optional dereferences; missing-key errors on untrusted/external dict/JSON payloads + - Resource leaks (unclosed files/streams/connections; missing cleanup on error paths) + - Injection vulnerabilities (SQL injection, XSS, command/template injection) and auth/security invariant violations + - OAuth/CSRF invariants: state must be per-flow unpredictable and validated; avoid deterministic/predictable state or missing state checks + - Concurrency/race/atomicity hazards (TOCTOU, lost updates, unsafe shared state, process/thread lifecycle bugs) + - Missing error handling for critical operations (network, persistence, auth, migrations, external APIs) + - Wrong-variable/shadowing mistakes; contract mismatches (serializer/validated_data, interfaces/abstract methods) + - Type-assumption bugs (e.g., numeric ops on datetime/strings, ordering key type mismatches) + - Offset/cursor/pagination semantic mismatches (off-by-one, prev/next behavior, commit semantics) +- Only flag issues you are confident about—avoid speculative or stylistic nitpicks. + + + +1. Read each assigned file in full to understand the context +2. Read the relevant diff sections provided in the prompt +3. Read related files as needed to fully understand the changes: + - Imported modules and dependencies + - Interfaces, base classes, and type definitions + - Related tests to understand expected behavior + - Callers/callees of modified functions + - Configuration files if behavior depends on them +4. Analyze the changes for issues matching the bug patterns above +5. For each issue found, verify it against the actual code and related context before including it + + + +Return your findings as a JSON array (no wrapper object, just the array): + +```json +[ + { + "path": "src/index.ts", + "body": "[P1] Title\n\n1 paragraph explanation.", + "line": 42, + "startLine": null, + "side": "RIGHT" + } +] +``` + +If no issues found, return an empty array: `[]` + +Field definitions: +- `path`: Relative file path (must match exactly as provided in your assignment) +- `body`: Comment text starting with priority tag [P0|P1|P2], then title, then 1 paragraph explanation + - P0: Critical bugs (crashes, security vulnerabilities, data loss) + - P1: Important bugs (incorrect behavior, logic errors) + - P2: Minor bugs (edge cases, non-critical issues) +- `line`: Target line number (single-line) or end line number (multi-line). Must be ≥ 0. +- `startLine`: `null` for single-line comments, or start line number for multi-line comments +- `side`: "RIGHT" for new/modified code (default), "LEFT" only for commenting on removed code + + + +- Output ONLY the JSON array—no additional commentary or markdown formatting around it. +- Do not include `commit_id` in your output—the parent agent will add this. +- Do not attempt to post comments to GitHub—just return the JSON array. + From 925aeb6714421598d744d18ec6f74cee28442103 Mon Sep 17 00:00:00 2001 From: User Date: Tue, 27 Jan 2026 18:56:55 -0800 Subject: [PATCH 29/33] Parallel Review: Phase 2 - Add Triage Phase section in Candidates Prompt --- .../templates/review-candidates-prompt.ts | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 71a020c..2cdb701 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -56,6 +56,29 @@ Precomputed data files: - Only flag issues you are confident about—avoid speculative or stylistic nitpicks. + +**Step 1: Analyze and group the modified files** + +Before reviewing, you must triage the PR to enable parallel review: + +1. Read the diff file (\`${diffPath}\`) to identify ALL modified files +2. Group the files into logical clusters based on: + - **Related functionality**: Files in the same module or feature area + - **File relationships**: A component and its tests, a class and its interface + - **Risk profile**: Security-sensitive files together, database/migration files together + - **Dependencies**: Files that import each other or share types + +3. Document your grouping briefly, for example: + - Group 1 (Auth): src/auth/login.ts, src/auth/session.ts, tests/auth.test.ts + - Group 2 (API handlers): src/api/users.ts, src/api/orders.ts + - Group 3 (Database): src/db/migrations/001.ts, src/db/schema.ts + +Guidelines for grouping: +- Aim for 3-6 groups to balance parallelism with context coherence +- Keep related files together so reviewers have full context +- Each group should be reviewable independently + + Write output to \`${reviewCandidatesPath}\` using this exact schema: From fb7b9ae6a0d311179fe44d7ebb9c05f3f0e1ef00 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 28 Jan 2026 09:58:22 -0800 Subject: [PATCH 30/33] Parallel Review: Phase 3 - Add parallel subagent calls phase --- .../templates/review-candidates-prompt.ts | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 2cdb701..650cab1 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -79,6 +79,50 @@ Guidelines for grouping: - Each group should be reviewable independently + +**Step 2: Spawn parallel subagents to review each group** + +After grouping, use the Task tool to spawn parallel \`file-group-reviewer\` subagents. Each subagent will review one group of files independently. + +**IMPORTANT**: Spawn ALL subagents in a single response to enable parallel execution. + +For each group, invoke the Task tool with: +- \`subagent_type\`: "file-group-reviewer" +- \`description\`: Brief label (e.g., "Review auth module") +- \`prompt\`: Must include: + 1. The PR context (repo, PR number, base/head refs) + 2. The list of assigned files for this group + 3. The relevant diff sections for those files (extract from \`${diffPath}\`) + 4. Instructions to return a JSON array of findings + +Example Task invocation for one group: +\`\`\` +Task( + subagent_type: "file-group-reviewer", + description: "Review auth module", + prompt: """ + Review the following files from PR #${prNumber} in ${repoFullName}. + + PR Context: + - Head SHA: ${prHeadSha} + - Base Ref: ${prBaseRef} + + Assigned files: + - src/auth/login.ts + - src/auth/session.ts + - tests/auth.test.ts + + Diff for these files: + + + Return a JSON array of issues found. If no issues, return []. + """ +) +\`\`\` + +Spawn all group reviewers in parallel by including multiple Task calls in one response. + + Write output to \`${reviewCandidatesPath}\` using this exact schema: From 11816e27620bb9db628c3d918122fe2fbe4ad575 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 28 Jan 2026 10:02:21 -0800 Subject: [PATCH 31/33] Parallel Review: Phase 4 - Aggregation Phase --- .../templates/review-candidates-prompt.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/create-prompt/templates/review-candidates-prompt.ts b/src/create-prompt/templates/review-candidates-prompt.ts index 650cab1..5e5d9ae 100644 --- a/src/create-prompt/templates/review-candidates-prompt.ts +++ b/src/create-prompt/templates/review-candidates-prompt.ts @@ -123,6 +123,21 @@ Task( Spawn all group reviewers in parallel by including multiple Task calls in one response. + +**Step 3: Aggregate subagent results** + +After all subagents complete, collect and merge their findings: + +1. **Collect results**: Each subagent returns a JSON array of comment objects +2. **Merge arrays**: Combine all arrays into a single comments array +3. **Add commit_id**: Add \`"commit_id": "${prHeadSha}"\` to each comment object +4. **Deduplicate**: If multiple subagents flagged the same location (same path + line), keep only one comment (prefer higher priority: P0 > P1 > P2) +5. **Filter existing**: Remove any comments that duplicate issues already in \`${commentsPath}\` +6. **Write reviewSummary**: Synthesize a 1-3 sentence overall assessment based on all findings + +Write the final aggregated result to \`${reviewCandidatesPath}\` using the schema in \`\`. + + Write output to \`${reviewCandidatesPath}\` using this exact schema: From 7f727560c310143655c6c054f2fd271f4e1e9800 Mon Sep 17 00:00:00 2001 From: User Date: Wed, 28 Jan 2026 14:15:05 -0800 Subject: [PATCH 32/33] Parallel Review: Phase 5 - Move subagent to ~/.factory/droids --- action.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/action.yml b/action.yml index 390bde6..378776f 100644 --- a/action.yml +++ b/action.yml @@ -184,6 +184,20 @@ runs: DROID_DIR=$(dirname "${{ inputs.path_to_droid_executable }}") echo "$DROID_DIR" >> "$GITHUB_PATH" + - name: Setup Custom Droids + if: steps.prepare.outputs.contains_trigger == 'true' + shell: bash + run: | + echo "Setting up custom droids..." + mkdir -p ~/.factory/droids + if [ -d "${GITHUB_ACTION_PATH}/.factory/droids" ]; then + cp -r ${GITHUB_ACTION_PATH}/.factory/droids/* ~/.factory/droids/ + echo "Copied custom droids to ~/.factory/droids/" + ls -la ~/.factory/droids/ + else + echo "No custom droids found in action" + fi + - name: Setup Network Restrictions if: steps.prepare.outputs.contains_trigger == 'true' && inputs.experimental_allowed_domains != '' shell: bash @@ -290,6 +304,7 @@ runs: ~/.factory/logs/droid-log-single.log ~/.factory/logs/console.log ~/.factory/sessions/* + ~/.factory/droids/* ${{ runner.temp }}/droid-prompts/** if-no-files-found: ignore retention-days: 7 From 355bbe3c6deda334dc6754a4d8aea0ffd4d36d1d Mon Sep 17 00:00:00 2001 From: User Date: Wed, 28 Jan 2026 15:14:42 -0800 Subject: [PATCH 33/33] enable task tool --- src/tag/commands/review.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/tag/commands/review.ts b/src/tag/commands/review.ts index 569405f..d4d1a16 100644 --- a/src/tag/commands/review.ts +++ b/src/tag/commands/review.ts @@ -210,6 +210,9 @@ export async function prepareReviewMode({ "github_comment___update_droid_comment", ]; + // Task tool is needed for parallel subagent reviews in candidate generation phase + const candidateGenerationTools = reviewUseValidator ? ["Task"] : []; + const reviewTools = reviewUseValidator ? [] : [ @@ -232,7 +235,12 @@ export async function prepareReviewMode({ : userAllowedMCPTools; const allowedTools = Array.from( - new Set([...baseTools, ...reviewTools, ...safeUserAllowedMCPTools]), + new Set([ + ...baseTools, + ...candidateGenerationTools, + ...reviewTools, + ...safeUserAllowedMCPTools, + ]), ); const mcpTools = await prepareMcpTools({