Improve GitHub issue deduplication reliability by introducing a stage… (#11769)
…d two-pass Codex search strategy with deterministic fallback behavior,
and remove an obsolete prompt file that was no longer used.
### Changes
- Updated `workflows/issue-deduplicator.yml`:
- Added richer issue input fields (`state`, `updatedAt`, `labels`) for
model context.
- Added two candidate pools:
- `codex-existing-issues-all.json` (`--state all`)
- `codex-existing-issues-open.json` (`--state open`)
- Added body truncation during JSON preparation to reduce prompt noise.
- Added **Pass 1** Codex run over all issues.
- Added normalization/validation step for Pass 1 output:
- tolerant JSON parsing
- self-issue filtering
- deduplication
- cap to 5 results
- Added **Pass 2 fallback** Codex run over open issues only, triggered
only when Pass 1 has no usable matches.
- Added normalization/validation step for Pass 2 output (same
filtering/dedup/cap behavior).
- Added final deterministic selector:
- prefer pass 2 if it finds matches
- otherwise use pass 1
- otherwise return no matches
- Added observability logs:
- pool sizes
- per-pass parse/match status
- final pass selected and final duplicate count
- Kept public issue-comment format unchanged.
- Added comment documenting that prompt text now lives inline in
workflow.
- Deleted obsolete file:
- `/prompts/issue-deduplicator.txt`
### Behavior Impact
- Better duplicate recall when broad search fails by retrying against
active issues only.
- More deterministic/noise-resistant output handling.
- No change to workflow trigger conditions, permissions, or issue
comment structure.
This commit is contained in:
parent
e71760fc64
commit
ffef5ce5de
2 changed files with 232 additions and 34 deletions
18
.github/prompts/issue-deduplicator.txt
vendored
18
.github/prompts/issue-deduplicator.txt
vendored
|
|
@ -1,18 +0,0 @@
|
|||
You are an assistant that triages new GitHub issues by identifying potential duplicates.
|
||||
|
||||
You will receive the following JSON files located in the current working directory:
|
||||
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
||||
- `codex-existing-issues.json`: JSON array of recent issues (each element includes number, title, body, createdAt).
|
||||
|
||||
Instructions:
|
||||
- Load both files as JSON and review their contents carefully. The codex-existing-issues.json file is large, ensure you explore all of it.
|
||||
- Compare the current issue against the existing issues to find up to five that appear to describe the same underlying problem or request.
|
||||
- Only consider an issue a potential duplicate if there is a clear overlap in symptoms, feature requests, reproduction steps, or error messages.
|
||||
- Prioritize newer issues when similarity is comparable.
|
||||
- Ignore pull requests and issues whose similarity is tenuous.
|
||||
- When unsure, prefer returning fewer matches.
|
||||
|
||||
Output requirements:
|
||||
- Respond with a JSON array of issue numbers (integers), ordered from most likely duplicate to least.
|
||||
- Include at most five numbers.
|
||||
- If you find no plausible duplicates, respond with `[]`.
|
||||
248
.github/workflows/issue-deduplicator.yml
vendored
248
.github/workflows/issue-deduplicator.yml
vendored
|
|
@ -15,34 +15,68 @@ jobs:
|
|||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
codex_output: ${{ steps.codex.outputs.final-message }}
|
||||
codex_output: ${{ steps.select-final.outputs.codex_output }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Prepare Codex inputs
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
REPO: ${{ github.repository }}
|
||||
ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||
run: |
|
||||
set -eo pipefail
|
||||
|
||||
CURRENT_ISSUE_FILE=codex-current-issue.json
|
||||
EXISTING_ISSUES_FILE=codex-existing-issues.json
|
||||
EXISTING_ALL_FILE=codex-existing-issues-all.json
|
||||
EXISTING_OPEN_FILE=codex-existing-issues-open.json
|
||||
|
||||
gh issue list --repo "${{ github.repository }}" \
|
||||
--json number,title,body,createdAt \
|
||||
gh issue list --repo "$REPO" \
|
||||
--json number,title,body,createdAt,updatedAt,state,labels \
|
||||
--limit 1000 \
|
||||
--state all \
|
||||
--search "sort:created-desc" \
|
||||
| jq '.' \
|
||||
> "$EXISTING_ISSUES_FILE"
|
||||
| jq '[.[] | {
|
||||
number,
|
||||
title,
|
||||
body: ((.body // "")[0:4000]),
|
||||
createdAt,
|
||||
updatedAt,
|
||||
state,
|
||||
labels: ((.labels // []) | map(.name))
|
||||
}]' \
|
||||
> "$EXISTING_ALL_FILE"
|
||||
|
||||
gh issue view "${{ github.event.issue.number }}" \
|
||||
--repo "${{ github.repository }}" \
|
||||
gh issue list --repo "$REPO" \
|
||||
--json number,title,body,createdAt,updatedAt,state,labels \
|
||||
--limit 1000 \
|
||||
--state open \
|
||||
--search "sort:created-desc" \
|
||||
| jq '[.[] | {
|
||||
number,
|
||||
title,
|
||||
body: ((.body // "")[0:4000]),
|
||||
createdAt,
|
||||
updatedAt,
|
||||
state,
|
||||
labels: ((.labels // []) | map(.name))
|
||||
}]' \
|
||||
> "$EXISTING_OPEN_FILE"
|
||||
|
||||
gh issue view "$ISSUE_NUMBER" \
|
||||
--repo "$REPO" \
|
||||
--json number,title,body \
|
||||
| jq '.' \
|
||||
| jq '{number, title, body: ((.body // "")[0:4000])}' \
|
||||
> "$CURRENT_ISSUE_FILE"
|
||||
|
||||
- id: codex
|
||||
echo "Prepared duplicate detection input files."
|
||||
echo "all_issue_count=$(jq 'length' "$EXISTING_ALL_FILE")"
|
||||
echo "open_issue_count=$(jq 'length' "$EXISTING_OPEN_FILE")"
|
||||
|
||||
# Prompt instructions are intentionally inline in this workflow. The old
|
||||
# .github/prompts/issue-deduplicator.txt file is obsolete and removed.
|
||||
- id: codex-all
|
||||
name: Find duplicates (pass 1, all issues)
|
||||
uses: openai/codex-action@main
|
||||
with:
|
||||
openai-api-key: ${{ secrets.CODEX_OPENAI_API_KEY }}
|
||||
|
|
@ -52,14 +86,17 @@ jobs:
|
|||
|
||||
You will receive the following JSON files located in the current working directory:
|
||||
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
||||
- `codex-existing-issues.json`: JSON array of recent issues (each element includes number, title, body, createdAt).
|
||||
- `codex-existing-issues-all.json`: JSON array of recent issues with states, timestamps, and labels.
|
||||
|
||||
Instructions:
|
||||
- Compare the current issue against the existing issues to find up to five that appear to describe the same underlying problem or request.
|
||||
- Focus on the underlying intent and context of each issue—such as reported symptoms, feature requests, reproduction steps, or error messages—rather than relying solely on string similarity or synthetic metrics.
|
||||
- After your analysis, validate your results in 1-2 lines explaining your decision to return the selected matches.
|
||||
- When unsure, prefer returning fewer matches.
|
||||
- Include at most five numbers.
|
||||
- Prioritize concrete overlap in symptoms, reproduction details, error signatures, and user intent.
|
||||
- Prefer active unresolved issues when confidence is similar.
|
||||
- Closed issues can still be valid duplicates if they clearly match.
|
||||
- Return fewer matches rather than speculative ones.
|
||||
- If confidence is low, return an empty list.
|
||||
- Include at most five issue numbers.
|
||||
- After analysis, provide a short reason for your decision.
|
||||
|
||||
output-schema: |
|
||||
{
|
||||
|
|
@ -77,6 +114,179 @@ jobs:
|
|||
"additionalProperties": false
|
||||
}
|
||||
|
||||
- id: normalize-all
|
||||
name: Normalize pass 1 output
|
||||
env:
|
||||
CODEX_OUTPUT: ${{ steps.codex-all.outputs.final-message }}
|
||||
CURRENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||
run: |
|
||||
set -eo pipefail
|
||||
|
||||
raw=${CODEX_OUTPUT//$'\r'/}
|
||||
parsed=false
|
||||
issues='[]'
|
||||
reason=''
|
||||
|
||||
if [ -n "$raw" ] && printf '%s' "$raw" | jq -e 'type == "object" and (.issues | type == "array")' >/dev/null 2>&1; then
|
||||
parsed=true
|
||||
issues=$(printf '%s' "$raw" | jq -c '[.issues[] | tostring]')
|
||||
reason=$(printf '%s' "$raw" | jq -r '.reason // ""')
|
||||
else
|
||||
reason='Pass 1 output was empty or invalid JSON.'
|
||||
fi
|
||||
|
||||
filtered=$(jq -cn --argjson issues "$issues" --arg current "$CURRENT_ISSUE_NUMBER" '[
|
||||
$issues[]
|
||||
| tostring
|
||||
| select(. != $current)
|
||||
] | reduce .[] as $issue ([]; if index($issue) then . else . + [$issue] end) | .[:5]')
|
||||
|
||||
has_matches=false
|
||||
if [ "$(jq 'length' <<< "$filtered")" -gt 0 ]; then
|
||||
has_matches=true
|
||||
fi
|
||||
|
||||
echo "Pass 1 parsed: $parsed"
|
||||
echo "Pass 1 matches after filtering: $(jq 'length' <<< "$filtered")"
|
||||
echo "Pass 1 reason: $reason"
|
||||
|
||||
{
|
||||
echo "issues_json=$filtered"
|
||||
echo "reason<<EOF"
|
||||
echo "$reason"
|
||||
echo "EOF"
|
||||
echo "has_matches=$has_matches"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- id: codex-open
|
||||
name: Find duplicates (pass 2, open issues)
|
||||
if: ${{ steps.normalize-all.outputs.has_matches != 'true' }}
|
||||
uses: openai/codex-action@main
|
||||
with:
|
||||
openai-api-key: ${{ secrets.CODEX_OPENAI_API_KEY }}
|
||||
allow-users: "*"
|
||||
prompt: |
|
||||
You are an assistant that triages new GitHub issues by identifying potential duplicates.
|
||||
|
||||
This is a fallback pass because a broad search did not find convincing matches.
|
||||
|
||||
You will receive the following JSON files located in the current working directory:
|
||||
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
||||
- `codex-existing-issues-open.json`: JSON array of open issues only.
|
||||
|
||||
Instructions:
|
||||
- Search only these active unresolved issues for duplicates of the current issue.
|
||||
- Prioritize concrete overlap in symptoms, reproduction details, error signatures, and user intent.
|
||||
- Prefer fewer, higher-confidence matches.
|
||||
- If confidence is low, return an empty list.
|
||||
- Include at most five issue numbers.
|
||||
- After analysis, provide a short reason for your decision.
|
||||
|
||||
output-schema: |
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"issues": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"reason": { "type": "string" }
|
||||
},
|
||||
"required": ["issues", "reason"],
|
||||
"additionalProperties": false
|
||||
}
|
||||
|
||||
- id: normalize-open
|
||||
name: Normalize pass 2 output
|
||||
if: ${{ steps.normalize-all.outputs.has_matches != 'true' }}
|
||||
env:
|
||||
CODEX_OUTPUT: ${{ steps.codex-open.outputs.final-message }}
|
||||
CURRENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||
run: |
|
||||
set -eo pipefail
|
||||
|
||||
raw=${CODEX_OUTPUT//$'\r'/}
|
||||
parsed=false
|
||||
issues='[]'
|
||||
reason=''
|
||||
|
||||
if [ -n "$raw" ] && printf '%s' "$raw" | jq -e 'type == "object" and (.issues | type == "array")' >/dev/null 2>&1; then
|
||||
parsed=true
|
||||
issues=$(printf '%s' "$raw" | jq -c '[.issues[] | tostring]')
|
||||
reason=$(printf '%s' "$raw" | jq -r '.reason // ""')
|
||||
else
|
||||
reason='Pass 2 output was empty or invalid JSON.'
|
||||
fi
|
||||
|
||||
filtered=$(jq -cn --argjson issues "$issues" --arg current "$CURRENT_ISSUE_NUMBER" '[
|
||||
$issues[]
|
||||
| tostring
|
||||
| select(. != $current)
|
||||
] | reduce .[] as $issue ([]; if index($issue) then . else . + [$issue] end) | .[:5]')
|
||||
|
||||
has_matches=false
|
||||
if [ "$(jq 'length' <<< "$filtered")" -gt 0 ]; then
|
||||
has_matches=true
|
||||
fi
|
||||
|
||||
echo "Pass 2 parsed: $parsed"
|
||||
echo "Pass 2 matches after filtering: $(jq 'length' <<< "$filtered")"
|
||||
echo "Pass 2 reason: $reason"
|
||||
|
||||
{
|
||||
echo "issues_json=$filtered"
|
||||
echo "reason<<EOF"
|
||||
echo "$reason"
|
||||
echo "EOF"
|
||||
echo "has_matches=$has_matches"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- id: select-final
|
||||
name: Select final duplicate set
|
||||
env:
|
||||
PASS1_ISSUES: ${{ steps.normalize-all.outputs.issues_json }}
|
||||
PASS1_REASON: ${{ steps.normalize-all.outputs.reason }}
|
||||
PASS2_ISSUES: ${{ steps.normalize-open.outputs.issues_json }}
|
||||
PASS2_REASON: ${{ steps.normalize-open.outputs.reason }}
|
||||
PASS1_HAS_MATCHES: ${{ steps.normalize-all.outputs.has_matches }}
|
||||
PASS2_HAS_MATCHES: ${{ steps.normalize-open.outputs.has_matches }}
|
||||
run: |
|
||||
set -eo pipefail
|
||||
|
||||
selected_issues='[]'
|
||||
selected_reason='No plausible duplicates found.'
|
||||
selected_pass='none'
|
||||
|
||||
if [ "$PASS1_HAS_MATCHES" = "true" ]; then
|
||||
selected_issues=${PASS1_ISSUES:-'[]'}
|
||||
selected_reason=${PASS1_REASON:-'Pass 1 found duplicates.'}
|
||||
selected_pass='all'
|
||||
fi
|
||||
|
||||
if [ "$PASS2_HAS_MATCHES" = "true" ]; then
|
||||
selected_issues=${PASS2_ISSUES:-'[]'}
|
||||
selected_reason=${PASS2_REASON:-'Pass 2 found duplicates.'}
|
||||
selected_pass='open-fallback'
|
||||
fi
|
||||
|
||||
final_json=$(jq -cn \
|
||||
--argjson issues "$selected_issues" \
|
||||
--arg reason "$selected_reason" \
|
||||
--arg pass "$selected_pass" \
|
||||
'{issues: $issues, reason: $reason, pass: $pass}')
|
||||
|
||||
echo "Final pass used: $selected_pass"
|
||||
echo "Final duplicate count: $(jq '.issues | length' <<< "$final_json")"
|
||||
echo "Final reason: $(jq -r '.reason' <<< "$final_json")"
|
||||
|
||||
{
|
||||
echo "codex_output<<EOF"
|
||||
echo "$final_json"
|
||||
echo "EOF"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
comment-on-issue:
|
||||
name: Comment with potential duplicates
|
||||
needs: gather-duplicates
|
||||
|
|
@ -105,11 +315,17 @@ jobs:
|
|||
|
||||
const issues = Array.isArray(parsed?.issues) ? parsed.issues : [];
|
||||
const currentIssueNumber = String(context.payload.issue.number);
|
||||
const passUsed = typeof parsed?.pass === 'string' ? parsed.pass : 'unknown';
|
||||
const reason = typeof parsed?.reason === 'string' ? parsed.reason : '';
|
||||
|
||||
console.log(`Current issue number: ${currentIssueNumber}`);
|
||||
console.log(`Pass used: ${passUsed}`);
|
||||
if (reason) {
|
||||
console.log(`Reason: ${reason}`);
|
||||
}
|
||||
console.log(issues);
|
||||
|
||||
const filteredIssues = issues.filter((value) => String(value) !== currentIssueNumber);
|
||||
const filteredIssues = [...new Set(issues.map((value) => String(value)))].filter((value) => value !== currentIssueNumber).slice(0, 5);
|
||||
|
||||
if (filteredIssues.length === 0) {
|
||||
core.info('Codex reported no potential duplicates.');
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue