Files
lean4/script/lean-bisect-test.lean
Kim Morrison b87d2c0fb9 feat: add lean-bisect script for bisecting toolchain regressions (#11727)
This PR adds a Python script that helps find which commit introduced a
behavior change in Lean. It supports multiple bisection modes and
automatically downloads CI artifacts when available.

- [x] depends on: #11735

## Usage

```
usage: lean-bisect [-h] [--timeout SEC] [--ignore-messages] [--verbose]
                   [--selftest] [--clear-cache] [--nightly-only]
                   [file] [RANGE]

Bisect Lean toolchain versions to find where behavior changes.

positional arguments:
  file               Lean file to test (must only import Lean.* or Std.*)
  RANGE              Range to bisect: FROM..TO, FROM, or ..TO

options:
  -h, --help         show this help message and exit
  --timeout SEC      Timeout in seconds for each test run
  --ignore-messages  Compare only exit codes, ignore stdout/stderr differences
  --verbose, -v      Show stdout/stderr from each test
  --selftest         Run built-in selftest to verify lean-bisect works
  --clear-cache      Clear CI artifact cache (~600MB per commit) and exit
  --nightly-only     Stop after finding nightly range (don't bisect individual
                     commits)

Range Syntax:

  FROM..TO                Bisect between FROM and TO
  FROM                    Start from FROM, bisect to latest nightly
  ..TO                    Bisect to TO, search backwards for regression start

  If no range given, searches backwards from latest nightly to find regression.

Identifier Formats:

  nightly-YYYY-MM-DD    Nightly build date (e.g., nightly-2024-06-15)
                        Uses pre-built toolchains from leanprover/lean4-nightly.
                        Fast: downloads via elan (~30s each).

  v4.X.Y or v4.X.Y-rcN  Version tag (e.g., v4.8.0, v4.9.0-rc1)
                        Converts to equivalent nightly range.

  Commit SHA            Git commit hash (short or full, e.g., abc123def)
                        Bisects individual commits between two points.
                        Tries CI artifacts first (~30s), falls back to building (~2-5min).
                        Commits with failed CI builds are automatically skipped.
                        Artifacts cached in ~/.cache/lean-bisect/artifacts/

Bisection Modes:

  Nightly mode:   Both endpoints are nightly dates.
                  Binary search through nightlies to find the day behavior changed.
                  Then automatically continues to bisect individual commits.
                  Use --nightly-only to stop after finding the nightly range.

  Version mode:   Either endpoint is a version tag.
                  Converts to equivalent nightly range and bisects.

  Commit mode:    Both endpoints are commit SHAs.
                  Binary search through individual commits on master.
                  Output: "Behavior change introduced in commit abc123"

Examples:

  # Simplest: just provide the file, finds the regression automatically
  lean-bisect test.lean

  # Specify an endpoint if you know roughly when it broke
  lean-bisect test.lean ..nightly-2024-06-01

  # Full manual control over the range
  lean-bisect test.lean nightly-2024-01-01..nightly-2024-06-01

  # Only find the nightly range, don't continue to commit bisection
  lean-bisect test.lean nightly-2024-01-01..nightly-2024-06-01 --nightly-only

  # Add a timeout (kills slow/hanging tests)
  lean-bisect test.lean --timeout 30

  # Bisect commits directly (if you already know the commit range)
  lean-bisect test.lean abc1234..def5678

  # Only compare exit codes, ignore output differences
  lean-bisect test.lean --ignore-messages

  # Clear downloaded CI artifacts to free disk space
  lean-bisect --clear-cache
```

🤖 Prepared with Claude Code

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-12-21 20:04:47 +00:00

308 lines
12 KiB
Lean4
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/-
Copyright Strata Contributors
SPDX-License-Identifier: Apache-2.0 OR MIT
-/
namespace Strata
namespace Python
/-
Parser and translator for some basic regular expression patterns supported by
Python's `re` library
Ref.: https://docs.python.org/3/library/re.html
Also see
https://github.com/python/cpython/blob/759a048d4bea522fda2fe929be0fba1650c62b0e/Lib/re/_parser.py
for a reference implementation.
-/
-------------------------------------------------------------------------------
inductive ParseError where
/--
`patternError` is raised when Python's `re.patternError` exception is
raised.
[Reference: Python's re exceptions](https://docs.python.org/3/library/re.html#exceptions):
"Exception raised when a string passed to one of the functions here is not a
valid regular expression (for example, it might contain unmatched
parentheses) or when some other error occurs during compilation or matching.
It is never an error if a string contains no match for a pattern."
-/
| patternError (message : String) (pattern : String) (pos : String.Pos.Raw)
/--
`unimplemented` is raised whenever we don't support some regex operations
(e.g., lookahead assertions).
-/
| unimplemented (message : String) (pattern : String) (pos : String.Pos.Raw)
deriving Repr
def ParseError.toString : ParseError String
| .patternError msg pat pos => s!"Pattern error at position {pos.byteIdx}: {msg} in pattern '{pat}'"
| .unimplemented msg pat pos => s!"Unimplemented at position {pos.byteIdx}: {msg} in pattern '{pat}'"
instance : ToString ParseError where
toString := ParseError.toString
-------------------------------------------------------------------------------
/--
Regular Expression Nodes
-/
inductive RegexAST where
/-- Single literal character: `a` -/
| char : Char RegexAST
/-- Character range: `[a-z]` -/
| range : Char Char RegexAST
/-- Alternation: `a|b` -/
| union : RegexAST RegexAST RegexAST
/-- Concatenation: `ab` -/
| concat : RegexAST RegexAST RegexAST
/-- Any character: `.` -/
| anychar : RegexAST
/-- Zero or more: `a*` -/
| star : RegexAST RegexAST
/-- One or more: `a+` -/
| plus : RegexAST RegexAST
/-- Zero or one: `a?` -/
| optional : RegexAST RegexAST
/-- Bounded repetition: `a{n,m}` -/
| loop : RegexAST Nat Nat RegexAST
/-- Start of string: `^` -/
| anchor_start : RegexAST
/-- End of string: `$` -/
| anchor_end : RegexAST
/-- Grouping: `(abc)` -/
| group : RegexAST RegexAST
/-- Empty string: `()` or `""` -/
| empty : RegexAST
/-- Complement: `[^a-z]` -/
| complement : RegexAST RegexAST
deriving Inhabited, Repr
-------------------------------------------------------------------------------
/-- Parse character class like [a-z], [0-9], etc. into union of ranges and
chars. Note that this parses `|` as a character. -/
def parseCharClass (s : String) (pos : String.Pos.Raw) : Except ParseError (RegexAST × String.Pos.Raw) := do
if pos.get? s != some '[' then throw (.patternError "Expected '[' at start of character class" s pos)
let mut i := pos.next s
-- Check for complement (negation) with leading ^
let isComplement := !i.atEnd s && i.get? s == some '^'
if isComplement then
i := i.next s
let mut result : Option RegexAST := none
-- Process each element in the character class.
while !i.atEnd s && i.get? s != some ']' do
-- Uncommenting this makes the code stop
--dbg_trace "Working" (pure ())
let some c1 := i.get? s | throw (.patternError "Invalid character in class" s i)
let i1 := i.next s
-- Check for range pattern: c1-c2.
if !i1.atEnd s && i1.get? s == some '-' then
let i2 := i1.next s
if !i2.atEnd s && i2.get? s != some ']' then
let some c2 := i2.get? s | throw (.patternError "Invalid character in range" s i2)
if c1 > c2 then
throw (.patternError s!"Invalid character range [{c1}-{c2}]: \
start character '{c1}' is greater than end character '{c2}'" s i)
let r := RegexAST.range c1 c2
-- Union with previous elements.
result := some (match result with | none => r | some prev => RegexAST.union prev r)
i := i2.next s
continue
-- Single character.
let r := RegexAST.char c1
result := some (match result with | none => r | some prev => RegexAST.union prev r)
i := i.next s
let some ast := result | throw (.patternError "Unterminated character set" s pos)
let finalAst := if isComplement then RegexAST.complement ast else ast
pure (finalAst, i.next s)
-------------------------------------------------------------------------------
/-- Parse numeric repeats like `{10}` or `{1,10}` into min and max bounds. -/
def parseBounds (s : String) (pos : String.Pos.Raw) : Except ParseError (Nat × Nat × String.Pos.Raw) := do
if pos.get? s != some '{' then throw (.patternError "Expected '{' at start of bounds" s pos)
let mut i := pos.next s
let mut numStr := ""
-- Parse first number.
while !i.atEnd s && (i.get? s).any Char.isDigit do
numStr := numStr.push ((i.get? s).get!)
i := i.next s
let some n := numStr.toNat? | throw (.patternError "Invalid minimum bound" s pos)
-- Check for comma (range) or closing brace (exact count).
match i.get? s with
| some '}' => pure (n, n, i.next s) -- {n} means exactly n times.
| some ',' =>
i := i.next s
-- Parse maximum bound
numStr := ""
while !i.atEnd s && (i.get? s).any Char.isDigit do
numStr := numStr.push ((i.get? s).get!)
i := i.next s
let some max := numStr.toNat? | throw (.patternError "Invalid maximum bound" s i)
if i.get? s != some '}' then throw (.patternError "Expected '}' at end of bounds" s i)
-- Validate bounds order
if max < n then
throw (.patternError s!"Invalid repeat bounds \{{n},{max}}: \
maximum {max} is less than minimum {n}" s pos)
pure (n, max, i.next s)
| _ => throw (.patternError "Invalid bounds syntax" s i)
-------------------------------------------------------------------------------
mutual
/--
Parse atom: single element (char, class, anchor, group) with optional
quantifier. Stops at the first `|`.
-/
partial def parseAtom (s : String) (pos : String.Pos.Raw) : Except ParseError (RegexAST × String.Pos.Raw) := do
if pos.atEnd s then throw (.patternError "Unexpected end of regex" s pos)
let some c := pos.get? s | throw (.patternError "Invalid position" s pos)
-- Detect invalid quantifier at start
if c == '*' || c == '+' || c == '{' || c == '?' then
throw (.patternError s!"Quantifier '{c}' at position {pos} has nothing to quantify" s pos)
-- Detect unbalanced closing parenthesis
if c == ')' then
throw (.patternError "Unbalanced parenthesis" s pos)
-- Parse base element (anchor, char class, group, anychar, escape, or single char).
let (base, nextPos) match c with
| '^' => pure (RegexAST.anchor_start, pos.next s)
| '$' => pure (RegexAST.anchor_end, pos.next s)
| '[' => parseCharClass s pos
| '(' => parseExplicitGroup s pos
| '.' => pure (RegexAST.anychar, pos.next s)
| '\\' =>
-- Handle escape sequence.
-- Note: Python uses a single backslash as an escape character, but Lean
-- strings need to escape that. After DDMification, we will see two
-- backslashes in Strata for every Python backslash.
let nextPos := pos.next s
if nextPos.atEnd s then throw (.patternError "Incomplete escape sequence at end of regex" s pos)
let some escapedChar := nextPos.get? s | throw (.patternError "Invalid escape position" s nextPos)
-- Check for special sequences (unsupported right now).
match escapedChar with
| 'A' | 'b' | 'B' | 'd' | 'D' | 's' | 'S' | 'w' | 'W' | 'z' | 'Z' =>
throw (.unimplemented s!"Special sequence \\{escapedChar} is not supported" s pos)
| 'a' | 'f' | 'n' | 'N' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' =>
throw (.unimplemented s!"Escape sequence \\{escapedChar} is not supported" s pos)
| c =>
if c.isDigit then
throw (.unimplemented s!"Backreference \\{c} is not supported" s pos)
else
pure (RegexAST.char escapedChar, nextPos.next s)
| _ => pure (RegexAST.char c, pos.next s)
-- Check for numeric repeat suffix on base element (but not on anchors)
match base with
| .anchor_start | .anchor_end => pure (base, nextPos)
| _ =>
if !nextPos.atEnd s then
match nextPos.get? s with
| some '{' =>
let (min, max, finalPos) parseBounds s nextPos
pure (RegexAST.loop base min max, finalPos)
| some '*' =>
let afterStar := nextPos.next s
if !afterStar.atEnd s then
match afterStar.get? s with
| some '?' => throw (.unimplemented "Non-greedy quantifier *? is not supported" s nextPos)
| some '+' => throw (.unimplemented "Possessive quantifier *+ is not supported" s nextPos)
| _ => pure (RegexAST.star base, afterStar)
else pure (RegexAST.star base, afterStar)
| some '+' =>
let afterPlus := nextPos.next s
if !afterPlus.atEnd s then
match afterPlus.get? s with
| some '?' => throw (.unimplemented "Non-greedy quantifier +? is not supported" s nextPos)
| some '+' => throw (.unimplemented "Possessive quantifier ++ is not supported" s nextPos)
| _ => pure (RegexAST.plus base, afterPlus)
else pure (RegexAST.plus base, afterPlus)
| some '?' =>
let afterQuestion := nextPos.next s
if !afterQuestion.atEnd s then
match afterQuestion.get? s with
| some '?' => throw (.unimplemented "Non-greedy quantifier ?? is not supported" s nextPos)
| some '+' => throw (.unimplemented "Possessive quantifier ?+ is not supported" s nextPos)
| _ => pure (RegexAST.optional base, afterQuestion)
else pure (RegexAST.optional base, afterQuestion)
| _ => pure (base, nextPos)
else
pure (base, nextPos)
/-- Parse explicit group with parentheses. -/
partial def parseExplicitGroup (s : String) (pos : String.Pos.Raw) : Except ParseError (RegexAST × String.Pos.Raw) := do
if pos.get? s != some '(' then throw (.patternError "Expected '(' at start of group" s pos)
let mut i := pos.next s
-- Check for extension notation (?...
if !i.atEnd s && i.get? s == some '?' then
let i1 := i.next s
if !i1.atEnd s then
match i1.get? s with
| some '=' => throw (.unimplemented "Positive lookahead (?=...) is not supported" s pos)
| some '!' => throw (.unimplemented "Negative lookahead (?!...) is not supported" s pos)
| _ => throw (.unimplemented "Extension notation (?...) is not supported" s pos)
let (inner, finalPos) parseGroup s i (some ')')
pure (.group inner, finalPos)
/-- Parse group: handles alternation and concatenation at current scope. -/
partial def parseGroup (s : String) (pos : String.Pos.Raw) (endChar : Option Char) :
Except ParseError (RegexAST × String.Pos.Raw) := do
let mut alternatives : List (List RegexAST) := [[]]
let mut i := pos
-- Parse until end of string or `endChar`.
while !i.atEnd s && (endChar.isNone || i.get? s != endChar) do
if i.get? s == some '|' then
-- Push a new scope to `alternatives`.
alternatives := [] :: alternatives
i := i.next s
else
let (ast, nextPos) parseAtom s i
alternatives := match alternatives with
| [] => [[ast]]
| head :: tail => (ast :: head) :: tail
i := nextPos
-- Check for expected end character.
if let some ec := endChar then
if i.get? s != some ec then
throw (.patternError s!"Expected '{ec}'" s i)
i := i.next s
-- Build result: concatenate each alternative, then union them.
let concatAlts := alternatives.reverse.filterMap fun alt =>
match alt.reverse with
| [] => -- Empty regex.
some (.empty)
| [single] => some single
| head :: tail => some (tail.foldl RegexAST.concat head)
match concatAlts with
| [] => pure (.empty, i)
| [single] => pure (single, i)
| head :: tail => pure (tail.foldl RegexAST.union head, i)
end
/-- info: Except.ok (Strata.Python.RegexAST.range 'A' 'z', { byteIdx := 5 }) -/
#guard_msgs in
#eval parseCharClass "[A-z]" 0
-- Test code: Print done
#print "Done!"