feat: leading whitespace on first token (#12662)

This PR adjusts the module parser to set the leading whitespace of the
first token to the whitespace up to that token. If there are no actual
tokens in the file, the leading whitespace is set on the final (empty)
EOI token. This ensures that we do not lose the initial whitespace (e.g.
comments) of a file in `Syntax`.

(Tests generated/adjusted by Claude)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marc Huisinga
2026-03-06 13:46:44 +01:00
committed by GitHub
parent 5f3ca3ac3d
commit 35944c367b
6 changed files with 144 additions and 17 deletions

View File

@@ -917,7 +917,11 @@ def lean (name : Option Ident := none) (error warning : flag false) («show» :
let scopes := ( get).scopes
let (cmds, cmdState, trees) withSaveInfoContext do
let mut cmdState : Command.State := { env, maxRecDepth := MonadRecDepth.getMaxRecDepth, scopes }
let mut pstate : Parser.ModuleParserState := {pos := pos, recovering := false}
let mut pstate : Parser.ModuleParserState := {
pos
recovering := false
hasLeading := false
}
let mut cmds := #[]
repeat
let scope := cmdState.scopes.head!

View File

@@ -26,7 +26,12 @@ end Module
structure ModuleParserState where
pos : String.Pos.Raw := 0
recovering : Bool := false
recovering : Bool := false
/--
Whether there is leading whitespace before `pos`.
Used to associate whitespace at the start of the file with the first token of the file.
-/
hasLeading : Bool := true
deriving Inhabited
private partial def mkErrorMessage (c : InputContext) (pos : String.Pos.Raw) (stk : SyntaxStack) (e : Parser.Error) : Message := Id.run do
@@ -60,6 +65,12 @@ where
if let .original (trailing := trailing) .. := stx.getTailInfo then pure (some trailing)
else none
private def setStartOfFileLeading (stx : Syntax) : Syntax × Bool := Id.run do
let some (.original leading pos trailing endPos) stx.getHeadInfo?
| return (stx, false)
let info := .original { leading with startPos := 0 } pos trailing endPos
return (stx.setHeadInfo info, true)
def parseHeader (inputCtx : InputContext) : IO (TSyntax ``Module.header × ModuleParserState × MessageLog) := do
let dummyEnv mkEmptyEnvironment
let p := andthenFn whitespace Module.header.fn
@@ -96,10 +107,12 @@ def parseHeader (inputCtx : InputContext) : IO (TSyntax ``Module.header × Modul
consider using separate `public import {mod}` and `import all {mod}` directives \
in order to import public data into the public scope and private data into the \
private scope."
pure (stx, {pos := s.pos, recovering := s.hasError}, messages)
let (stx, isLeadingSet) := setStartOfFileLeading stx
pure (stx, { pos := s.pos, recovering := s.hasError, hasLeading := ! isLeadingSet }, messages)
private def mkEOI (pos : String.Pos.Raw) : Syntax :=
let atom := mkAtom (SourceInfo.original "".toRawSubstring pos "".toRawSubstring pos) ""
private def mkEOI (inputCtx : InputContext) (pos : String.Pos.Raw) : Syntax :=
let emptyAt := inputCtx.substring pos pos
let atom := mkAtom (SourceInfo.original emptyAt pos emptyAt pos) ""
mkNode ``Command.eoi #[atom]
def isTerminalCommand (s : Syntax) : Bool :=
@@ -122,7 +135,7 @@ partial def parseCommand (inputCtx : InputContext) (pmctx : ParserModuleContext)
let mut stx := Syntax.missing -- will always be assigned below
repeat
if inputCtx.atEnd pos then
stx := mkEOI pos
stx := mkEOI inputCtx pos
break
let pos' := pos
let p := andthenFn whitespace topLevelCommandParserFn
@@ -155,7 +168,10 @@ partial def parseCommand (inputCtx : InputContext) (pmctx : ParserModuleContext)
else
stx := s.stxStack.back
break
return (stx, { pos, recovering }, messages)
if mps.hasLeading then
-- File header was empty - set start-of-file leading whitespace on first token in `stx`
(stx, _) := setStartOfFileLeading stx
return (stx, { pos, recovering, hasLeading := false }, messages)
-- only useful for testing since most Lean files cannot be parsed without elaboration
@@ -165,7 +181,7 @@ partial def testParseModuleAux (env : Environment) (inputCtx : InputContext) (s
| (stx, state, msgs) =>
if isTerminalCommand stx then
if !msgs.hasUnreported then
pure stxs
pure (stxs.push stx)
else do
msgs.forM fun msg => msg.toString >>= IO.println
throw (IO.userError "failed to parse file")
@@ -173,14 +189,14 @@ partial def testParseModuleAux (env : Environment) (inputCtx : InputContext) (s
parse state msgs (stxs.push stx)
parse s msgs stxs
def testParseModule (env : Environment) (fname contents : String) : IO (TSyntax ``Parser.Module.module) := do
def testParseModule (env : Environment) (fname contents : String) : IO Syntax := do
let inputCtx := mkInputContext contents fname
let (header, state, messages) parseHeader inputCtx
let cmds testParseModuleAux env inputCtx state messages #[]
let stx := mkNode `Lean.Parser.Module.module #[header, mkListNode cmds]
pure stx.raw.updateLeading
pure stx
def testParseFile (env : Environment) (fname : System.FilePath) : IO (TSyntax ``Parser.Module.module) := do
def testParseFile (env : Environment) (fname : System.FilePath) : IO Syntax := do
let contents IO.FS.readFile fname
testParseModule env fname.toString contents

View File

@@ -12,20 +12,27 @@ unsafe def main (args : List String) : IO Unit := do
| [f] => (false, f)
| _ => panic! "usage: file [-d]"
let env mkEmptyEnvironment
let stx Lean.Parser.testParseFile env fn
let stx Lean.Parser.testParseFile env fn
-- `testParseFile` matches the real pipeline (no `updateLeading`), but reformatting
-- needs leading info to correctly place inter-declaration comments, just like
-- `script/reformat.lean` which explicitly calls `updateLeading`.
let stx : Syntax := stx₀.updateLeading
let act : CoreM Format := do
withOptions (fun opts =>
opts
-- Name sanitization clears inline comments attached to identifiers.
|>.set `pp.sanitizeNames false
|>.set `trace.PrettyPrinter.format debug) do
tryFinally (PrettyPrinter.ppModule stx) printTraces
tryFinally (PrettyPrinter.ppModule stx) printTraces
let (f, _) act.toIO { fileName := "", fileMap := default } { env := env }
-- `testParseModule` includes the EOI token in the command list (matching the real
-- pipeline), so `ppModule` emits two trailing newlines via the `ppLine >> ppLine`
-- separator for EOI's empty command text.
IO.print f
let stx' Lean.Parser.testParseModule env fn (toString f)
if stx' != stx then
let stx := stx.raw.getArg 1
let stx' := stx'.raw.getArg 1
let stx := stx.getArg 1
let stx' := stx'.getArg 1
stx.getArgs.size.forM fun i _ => do
if stx.getArg i != stx'.getArg i then
throw $ IO.userError s!"reparsing failed:\n{stx.getArg i}\n{stx'.getArg i}"

View File

@@ -326,3 +326,5 @@ instance {α : Type u} [DecidableEq α] : BEq α :=
def dite {α : Sort u} (c : Prop) [h : Decidable c] (t : c → α) (e : Not c → α) : α :=
Decidable.casesOn (motive := fun _ => α) h e t

View File

@@ -33,10 +33,13 @@ let (debug, f) : Bool × String := match args with
| _ => panic! "usage: file [-d]";
let env mkEmptyEnvironment;
let stx Lean.Parser.testParseFile env args.head!;
let header := stx.raw.getArg 0;
-- `testParseFile` matches the real pipeline (no `updateLeading`), but reprinting
-- needs leading info to correctly place inter-declaration whitespace.
let stx := stx.updateLeading;
let header := stx.getArg 0;
let some s pure header.reprint | throw $ IO.userError "header reprint failed";
IO.print s;
let cmds := (stx.raw.getArg 1).getArgs;
let cmds := (stx.getArg 1).getArgs;
cmds.forM $ fun cmd => do
let cmd := unparen cmd;
let (cmd, _) (tryFinally (PrettyPrinter.parenthesizeCommand cmd) printTraces).toIO { options := Options.empty.set `trace.PrettyPrinter.parenthesize debug, fileName := "", fileMap := default } { env := env };

View File

@@ -0,0 +1,95 @@
-- Tests that file-start whitespace and comments are associated with the leading
-- of the first actual token in the file (in the module header or first command).
import Lean
open Lean Parser
/-- Parse a header string and return the leading Substring.Raw of the first token. -/
def getFirstTokenLeading (input : String) : IO (Option Substring.Raw) := do
let inputCtx := mkInputContext input "<test>"
let (header, _, _) parseHeader inputCtx
return match header.raw.getHeadInfo? with
| some (.original leading ..) => some leading
| _ => none
-- Test 1: file-start line comment before import
-- The leading of the `import` keyword should start at position 0 and contain the comment.
/--
info: leading startPos: 0
leading content: -- file-start comment
-/
#guard_msgs in
#eval do
let some leading getFirstTokenLeading "-- file-start comment\nimport Foo\n"
| IO.println "no leading found"
IO.println s!"leading startPos: {leading.startPos.byteIdx}"
IO.println s!"leading content: {Substring.Raw.toString leading}"
-- Test 2: file-start block comment before import
/--
info: leading startPos: 0
leading content: /- block comment -/
-/
#guard_msgs in
#eval do
let some leading getFirstTokenLeading "/- block comment -/\nimport Foo\n"
| IO.println "no leading found"
IO.println s!"leading startPos: {leading.startPos.byteIdx}"
IO.println s!"leading content: {Substring.Raw.toString leading}"
-- Test 3: no file-start whitespace (import is the very first thing)
-- The leading should start at position 0 with stopPos 0 (empty leading).
/--
info: leading startPos: 0
leading stopPos: 0 -/
#guard_msgs in
#eval do
let some leading getFirstTokenLeading "import Foo\n"
| IO.println "no leading found"
IO.println s!"leading startPos: {leading.startPos.byteIdx}"
IO.println s!"leading stopPos: {leading.stopPos.byteIdx}"
-- Test 4: file with only newlines at the start (no comments)
/--
info: leading startPos: 0
leading content:
-/
#guard_msgs in
#eval do
let some leading getFirstTokenLeading "\n\nimport Foo\n"
| IO.println "no leading found"
IO.println s!"leading startPos: {leading.startPos.byteIdx}"
IO.println s!"leading content: {Substring.Raw.toString leading}"
-- Test 5: entirely empty file (only a comment, no header or body)
-- The EOI token's leading should include the file-start comment.
/--
info: first token leading startPos: 0
first token leading content: -- just a comment
-/
#guard_msgs in
#eval do
let env mkEmptyEnvironment
let stx testParseModule env "<test>" "-- just a comment\n"
match stx.getHeadInfo? with
| some (.original leading ..) =>
IO.println s!"first token leading startPos: {leading.startPos.byteIdx}"
IO.println s!"first token leading content: {Substring.Raw.toString leading}"
| _ => IO.println "no original leading found"
-- Test 6: empty header (no imports) - file-start comment before a def
-- The leading of the first command token should include the file-start comment.
/--
info: first token leading startPos: 0
first token leading content: -- file-start comment
-/
#guard_msgs in
#eval do
let env mkEmptyEnvironment
let stx testParseModule env "<test>" "-- file-start comment\ndef foo := 1\n"
match stx.getHeadInfo? with
| some (.original leading ..) =>
IO.println s!"first token leading startPos: {leading.startPos.byteIdx}"
IO.println s!"first token leading content: {Substring.Raw.toString leading}"
| _ => IO.println "no original leading found"