diff --git a/.github/scripts/pr_labeler.py b/.github/scripts/pr_labeler.py index a614fd7..75875cc 100644 --- a/.github/scripts/pr_labeler.py +++ b/.github/scripts/pr_labeler.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""PR labeler: compute size, risk, and checkbox labels for one or more PRs. +"""PR labeler: compute size, risk, and template-field labels for one or more PRs. Inputs come from environment variables set by the calling workflow: GITHUB_REPOSITORY e.g. "owner/repo" (always set on Actions) @@ -11,7 +11,13 @@ 1. Fetching additions, deletions, body, and current labels from the GitHub API. 2. Computing the size bucket from additions+deletions. 3. Parsing the Bugbot CURSOR_SUMMARY block for a risk level. - 4. Parsing the PR template checkboxes for `urgent` and `high complexity`. + 4. Parsing the PR template fields for `urgent` and `high complexity`. + Two formats are supported: + - Current: ``- **Urgent** (...): yes`` / ``: no`` + - Legacy: ``- [x] **Urgent** ...`` / ``- [ ] **Urgent** ...`` + The current format is preferred; the legacy format is matched as a + fallback so PRs opened before the template change keep working until + the queue rolls over. 5. Reconciling with current labels and applying adds/removes via `gh pr edit`. For backfill mode (PR_NUMBER == "all"), per-PR failures are logged but do not @@ -32,6 +38,9 @@ URGENT_LABEL = "review/urgent" COMPLEXITY_LABEL = "complexity/high" +URGENT_KEYWORD = "urgent" +COMPLEXITY_KEYWORD = "high complexity" + CURSOR_SUMMARY_MARKER = "" RISK_REGEX = re.compile(r"\*\*(\w+)\s+Risk\*\*", re.IGNORECASE) RISK_MAP = { @@ -43,21 +52,44 @@ RISK_FALLBACK = "risk/high" -# Match a markdown checkbox followed (with whitespace and optional bold/markdown -# punctuation) by a target keyword. The `[xX ]` part captures the state. +def yesno_regex(keyword: str) -> re.Pattern[str]: + """Match the current template format and capture ``yes`` or ``no``. + + Examples that match (state captured): + - **Urgent** (needs same-day review): yes + - **High complexity** (non-obvious logic, careful review): no + * **urgent**: YES + + The bullet must appear at the start of a line so that an inline ``*`` + from markdown bold syntax (e.g. ``**Urgent**`` inside a legacy checkbox + line ``- [x] **Urgent**: no further action``) cannot be mistaken for a + list bullet -- otherwise ``: no`` from the description would be captured + and flip a checked legacy box from ``on`` to ``off``. + """ + return re.compile( + rf"^\s*[-*]\s*[*_`]*\s*{re.escape(keyword)}\b[^:\n]*:\s*(yes|no)\b", + re.IGNORECASE | re.MULTILINE, + ) + + def checkbox_regex(keyword: str) -> re.Pattern[str]: - # Examples that should match (state captured): - # - [x] **Urgent**: needs same-day review - # - [ ] **High complexity**: ... - # * [X] urgent + """Match the legacy template format and capture the checkbox state. + + Examples that match (state captured): + - [x] **Urgent**: needs same-day review + - [ ] **High complexity**: ... + * [X] urgent + """ return re.compile( rf"[-*]\s*\[\s*([xX ])\s*\]\s*[*_`]*\s*{re.escape(keyword)}", re.IGNORECASE, ) -URGENT_REGEX = checkbox_regex("urgent") -COMPLEXITY_REGEX = checkbox_regex("high complexity") +URGENT_YESNO_REGEX = yesno_regex(URGENT_KEYWORD) +COMPLEXITY_YESNO_REGEX = yesno_regex(COMPLEXITY_KEYWORD) +URGENT_CHECKBOX_REGEX = checkbox_regex(URGENT_KEYWORD) +COMPLEXITY_CHECKBOX_REGEX = checkbox_regex(COMPLEXITY_KEYWORD) @dataclass @@ -154,12 +186,36 @@ def risk_from_body(body: str, plan: LabelPlan) -> str | None: return label -def checkbox_state(body: str, regex: re.Pattern[str]) -> str | None: - """Return 'on', 'off', or None (not present).""" - match = regex.search(body) - if not match: - return None - return "on" if match.group(1).lower() == "x" else "off" +def field_state( + body: str, + *, + yesno: re.Pattern[str], + checkbox: re.Pattern[str], +) -> str | None: + """Return ``'on'``, ``'off'``, or ``None`` for a template field. + + Tries the current ``**Field**: yes/no`` syntax first and falls back to the + legacy ``- [x] **Field**`` syntax. The legacy regex is retained so PRs + opened before the template change keep being labeled correctly until the + queue rolls over (~2 weeks). It will be removed in a follow-up. + + Defense in depth: skip any yes/no match whose enclosing line is itself a + legacy checkbox line. The yes/no regex is anchored to the start of a + line, so this shouldn't happen today, but a stray ``: no`` in a + checkbox description must never preempt the checkbox result and flip a + checked ``[x]`` from ``on`` to ``off``. + """ + for match in yesno.finditer(body): + line_start = body.rfind("\n", 0, match.start()) + 1 + newline = body.find("\n", match.end()) + line = body[line_start : newline if newline != -1 else len(body)] + if checkbox.search(line): + continue + return "on" if match.group(1).lower() == "yes" else "off" + match = checkbox.search(body) + if match: + return "on" if match.group(1).lower() == "x" else "off" + return None def reconcile( @@ -192,12 +248,12 @@ def reconcile( # strip a manually-set risk label just because Bugbot didn't comment. plan.remove.append(label) - # Checkboxes: three-state (on/off/absent). - for regex, label in [ - (URGENT_REGEX, URGENT_LABEL), - (COMPLEXITY_REGEX, COMPLEXITY_LABEL), + # Template fields: three-state (on/off/absent). + for yesno_re, checkbox_re, label in [ + (URGENT_YESNO_REGEX, URGENT_CHECKBOX_REGEX, URGENT_LABEL), + (COMPLEXITY_YESNO_REGEX, COMPLEXITY_CHECKBOX_REGEX, COMPLEXITY_LABEL), ]: - state = checkbox_state(body, regex) + state = field_state(body, yesno=yesno_re, checkbox=checkbox_re) if state == "on" and label not in current_labels: plan.add.append(label) elif state == "off" and label in current_labels: diff --git a/.github/scripts/test_pr_labeler.py b/.github/scripts/test_pr_labeler.py index 7615dbf..df1b404 100644 --- a/.github/scripts/test_pr_labeler.py +++ b/.github/scripts/test_pr_labeler.py @@ -8,6 +8,8 @@ import sys from pathlib import Path +import pytest + sys.path.insert(0, str(Path(__file__).parent)) import pr_labeler # noqa: E402 @@ -88,45 +90,185 @@ def test_text_before_marker_ignored(self): assert pr_labeler.risk_from_body(body, _plan()) == "risk/high" -# ---- checkbox_state -------------------------------------------------------- +# ---- field_state ----------------------------------------------------------- + + +def _urgent_state(body: str) -> str | None: + return pr_labeler.field_state( + body, + yesno=pr_labeler.URGENT_YESNO_REGEX, + checkbox=pr_labeler.URGENT_CHECKBOX_REGEX, + ) + + +def _complexity_state(body: str) -> str | None: + return pr_labeler.field_state( + body, + yesno=pr_labeler.COMPLEXITY_YESNO_REGEX, + checkbox=pr_labeler.COMPLEXITY_CHECKBOX_REGEX, + ) + + +class TestFieldStateYesNo: + """Current template format: ``- **Field** (...): yes|no``.""" + + @pytest.mark.parametrize( + "body", + [ + "- **Urgent** (needs same-day review): yes", + "- **Urgent**: yes", + "* **urgent**: YES", + "- **Urgent** (needs same-day review): yes, plus extra context", + "- **Urgent** (needs same-day review) : yes", + ], + ) + def test_urgent_yes_variants(self, body): + assert _urgent_state(body) == "on" + + @pytest.mark.parametrize( + "body", + [ + "- **Urgent** (needs same-day review): no", + "- **Urgent**: no", + "* **urgent**: NO", + ], + ) + def test_urgent_no_variants(self, body): + assert _urgent_state(body) == "off" + + def test_complexity_yes(self): + body = "- **High complexity** (non-obvious logic, careful review): yes" + assert _complexity_state(body) == "on" + def test_complexity_no(self): + body = "- **High complexity** (non-obvious logic, careful review): no" + assert _complexity_state(body) == "off" + + def test_value_must_be_yes_or_no(self): + # "maybe" is not yes/no; field is treated as absent. + body = "- **Urgent**: maybe" + assert _urgent_state(body) is None + + def test_value_word_boundary(self): + # "nothing" must not be parsed as "no". + body = "- **Urgent**: nothing here" + assert _urgent_state(body) is None + + def test_yes_or_no_inside_parenthetical_is_ignored(self): + # The inline "yes or no" hint inside the parenthetical is descriptive; + # only the value after the colon counts. + body = "- **Urgent** (answer yes or no): no" + assert _urgent_state(body) == "off" + + def test_absent_returns_none(self): + assert _urgent_state("body with no template") is None + + +class TestFieldStateLegacyCheckbox: + """Legacy template format: ``- [x] **Field**`` (in-flight PRs).""" -class TestCheckboxState: def test_urgent_checked(self): body = "- [x] **Urgent**: needs same-day review" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "on" + assert _urgent_state(body) == "on" def test_urgent_unchecked(self): body = "- [ ] **Urgent**: needs same-day review" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "off" + assert _urgent_state(body) == "off" def test_urgent_capital_x(self): body = "- [X] **Urgent**" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "on" + assert _urgent_state(body) == "on" def test_urgent_absent(self): body = "no template here" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) is None + assert _urgent_state(body) is None def test_urgent_without_bold(self): body = "- [x] urgent: needs same-day review" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "on" + assert _urgent_state(body) == "on" def test_complexity_checked(self): body = "- [x] **High complexity**: non-obvious logic" - assert pr_labeler.checkbox_state(body, pr_labeler.COMPLEXITY_REGEX) == "on" + assert _complexity_state(body) == "on" def test_complexity_unchecked(self): body = "- [ ] **High complexity**: non-obvious logic" - assert pr_labeler.checkbox_state(body, pr_labeler.COMPLEXITY_REGEX) == "off" + assert _complexity_state(body) == "off" def test_extra_whitespace(self): body = "- [ x ] **Urgent**: needs same-day review" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "on" + assert _urgent_state(body) == "on" def test_asterisk_bullet(self): body = "* [x] urgent" - assert pr_labeler.checkbox_state(body, pr_labeler.URGENT_REGEX) == "on" + assert _urgent_state(body) == "on" + + +class TestFieldStatePrecedence: + """When both formats appear in the same body, yes/no wins.""" + + def test_yesno_wins_over_legacy_checkbox(self): + body = ( + "- [ ] **Urgent**: stale legacy line\n" + "- **Urgent** (needs same-day review): yes" + ) + assert _urgent_state(body) == "on" + + def test_yesno_no_wins_over_legacy_checked(self): + body = ( + "- [x] **Urgent**: stale legacy line\n" + "- **Urgent** (needs same-day review): no" + ) + assert _urgent_state(body) == "off" + + def test_legacy_checked_with_yesno_in_description_is_on(self): + # Regression: the yes/no regex must not treat the ``*`` from + # ``**Urgent**`` inside a legacy checkbox line as a list bullet, + # which would let it capture ``no`` from the description and + # incorrectly flip a checked box from ``on`` to ``off``. + body = "- [x] **Urgent**: no further action" + assert _urgent_state(body) == "on" + + def test_legacy_checked_with_yes_in_description_is_on(self): + body = "- [x] **Urgent**: yes please review today" + assert _urgent_state(body) == "on" + + def test_legacy_unchecked_with_yes_in_description_is_off(self): + body = "- [ ] **Urgent**: yes please review today" + assert _urgent_state(body) == "off" + + def test_field_state_ignores_yesno_match_on_checkbox_line(self): + # Defense in depth: even if the yes/no regex regresses to the old + # unanchored form and matches inside a legacy checkbox line, + # ``field_state`` must drop that match and use the checkbox. + import re as _re + + unanchored = _re.compile( + r"[-*]\s*[*_`]*\s*urgent\b[^:\n]*:\s*(yes|no)\b", + _re.IGNORECASE, + ) + body = "- [x] **Urgent**: no further action" + # Sanity: the regressed regex really would mis-capture "no". + assert unanchored.search(body).group(1) == "no" + # field_state must still return "on" via the checkbox fallback. + assert ( + pr_labeler.field_state( + body, + yesno=unanchored, + checkbox=pr_labeler.URGENT_CHECKBOX_REGEX, + ) + == "on" + ) + + def test_field_state_keeps_yesno_on_separate_line_from_checkbox(self): + # The defensive filter must only ignore yes/no matches whose + # enclosing line is itself a checkbox line. A real yes/no entry + # on its own line still wins over an unrelated legacy line. + body = ( + "- [x] **Urgent**: stale legacy line\n" + "- **Urgent** (needs same-day review): no" + ) + assert _urgent_state(body) == "off" # ---- reconcile ------------------------------------------------------------- @@ -185,13 +327,28 @@ def test_swaps_risk_label_when_bugbot_changes(self): assert "risk/low" in plan.add assert "risk/high" in plan.remove - def test_urgent_checkbox_on_adds_label(self): + def test_urgent_yesno_yes_adds_label(self): + plan = _plan() + body = "- **Urgent** (needs same-day review): yes" + pr_labeler.reconcile(_pr(additions=5, body=body), plan=plan) + assert pr_labeler.URGENT_LABEL in plan.add + + def test_urgent_yesno_no_removes_label(self): + plan = _plan() + body = "- **Urgent** (needs same-day review): no" + pr_labeler.reconcile( + _pr(additions=5, body=body, labels=(pr_labeler.URGENT_LABEL,)), + plan=plan, + ) + assert pr_labeler.URGENT_LABEL in plan.remove + + def test_urgent_legacy_checked_adds_label(self): plan = _plan() body = "- [x] **Urgent**: needs same-day review" pr_labeler.reconcile(_pr(additions=5, body=body), plan=plan) assert pr_labeler.URGENT_LABEL in plan.add - def test_urgent_checkbox_off_removes_label(self): + def test_urgent_legacy_unchecked_removes_label(self): plan = _plan() body = "- [ ] **Urgent**: needs same-day review" pr_labeler.reconcile( @@ -200,7 +357,7 @@ def test_urgent_checkbox_off_removes_label(self): ) assert pr_labeler.URGENT_LABEL in plan.remove - def test_urgent_checkbox_absent_leaves_manual_label(self): + def test_urgent_absent_leaves_manual_label(self): plan = _plan() pr_labeler.reconcile( _pr(additions=5, body="no template", labels=(pr_labeler.URGENT_LABEL,)), @@ -208,6 +365,12 @@ def test_urgent_checkbox_absent_leaves_manual_label(self): ) assert pr_labeler.URGENT_LABEL not in plan.remove + def test_complexity_yesno_yes_adds_label(self): + plan = _plan() + body = "- **High complexity** (non-obvious logic, careful review): yes" + pr_labeler.reconcile(_pr(additions=5, body=body), plan=plan) + assert pr_labeler.COMPLEXITY_LABEL in plan.add + # ---- determine_targets ------------------------------------------------------