chore: switch to new test/bench suite (#12590)

This PR sets up the new integrated test/bench suite. It then migrates
all benchmarks and some related tests to the new suite. There's also
some documentation and some linting.

For now, a lot of the old tests are left alone so this PR doesn't become
even larger than it already is. Eventually, all tests should be migrated
to the new suite though so there isn't a confusing mix of two systems.
This commit is contained in:
Garmelon
2026-02-25 14:51:53 +01:00
committed by GitHub
parent bd0c6a42c8
commit 08eb78a5b2
4585 changed files with 17733 additions and 1765 deletions

View File

@@ -4,29 +4,25 @@ To build Lean you should use `make -j$(nproc) -C build/release`.
## Running Tests
See `doc/dev/testing.md` for full documentation. Quick reference:
See `tests/README.md` for full documentation. Quick reference:
```bash
# Full test suite (use after builds to verify correctness)
make -j$(nproc) -C build/release test ARGS="-j$(nproc)"
CTEST_PARALLEL_LEVEL="$(nproc)" CTEST_OUTPUT_ON_FAILURE=1 \
make -C build/release -j "$(nproc)" test
# Specific test by name (supports regex via ctest -R)
make -j$(nproc) -C build/release test ARGS='-R grind_ematch --output-on-failure'
CTEST_PARALLEL_LEVEL="$(nproc)" CTEST_OUTPUT_ON_FAILURE=1 \
make -C build/release -j "$(nproc)" test ARGS='-R grind_ematch'
# Rerun only previously failed tests
make -j$(nproc) -C build/release test ARGS='--rerun-failed --output-on-failure'
CTEST_PARALLEL_LEVEL="$(nproc)" CTEST_OUTPUT_ON_FAILURE=1 \
make -C build/release -j "$(nproc)" test ARGS='--rerun-failed'
# Single test from tests/lean/run/ (quick check during development)
cd tests/lean/run && ./test_single.sh example_test.lean
# ctest directly (from stage1 build dir)
cd build/release/stage1 && ctest -j$(nproc) --output-on-failure --timeout 300
# Single test from tests/foo/bar/ (quick check during development)
cd tests/foo/bar && ./run_test example_test.lean
```
The full test suite includes `tests/lean/`, `tests/lean/run/`, `tests/lean/interactive/`,
`tests/compiler/`, `tests/pkg/`, Lake tests, and more. Using `make test` or `ctest` runs
all of them; `test_single.sh` in `tests/lean/run/` only covers that one directory.
## New features
When asked to implement new features:
@@ -34,8 +30,6 @@ When asked to implement new features:
* write comprehensive tests first (expecting that these will initially fail)
* and then iterate on the implementation until the tests pass.
All new tests should go in `tests/lean/run/`. These tests don't have expected output; we just check there are no errors. You should use `#guard_msgs` to check for specific messages.
## Success Criteria
*Never* report success on a task unless you have verified both a clean build without errors, and that the relevant tests pass.

View File

@@ -85,7 +85,7 @@ jobs:
- name: CI Merge Checkout
run: |
git fetch --depth=1 origin ${{ github.sha }}
git checkout FETCH_HEAD flake.nix flake.lock script/prepare-* tests/lean/run/importStructure.lean
git checkout FETCH_HEAD flake.nix flake.lock script/prepare-* tests/elab/importStructure.lean
if: github.event_name == 'pull_request'
# (needs to be after "Checkout" so files don't get overridden)
- name: Setup emsdk
@@ -235,7 +235,7 @@ jobs:
# prefix `if` above with `always` so it's run even if tests failed
if: always() && steps.test.conclusion != 'skipped'
- name: Check Test Binary
run: ${{ matrix.binary-check }} tests/compiler/534.lean.out
run: ${{ matrix.binary-check }} tests/compile/534.lean.out
if: (!matrix.cross) && steps.test.conclusion != 'skipped'
- name: Build Stage 2
run: |
@@ -246,13 +246,7 @@ jobs:
make -C build -j$NPROC check-stage3
if: matrix.check-stage3
- name: Test Speedcenter Benchmarks
run: |
# Necessary for some timing metrics but does not work on Namespace runners
# and we just want to test that the benchmarks run at all here
#echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid
export BUILD=$PWD/build PATH=$PWD/build/stage1/bin:$PATH
cd tests/bench
nix shell .#temci -c temci exec --config speedcenter.yaml --included_blocks fast --runs 1
run: nix shell github:Kha/lakeprof -c make -C build -j$NPROC bench
if: matrix.test-speedcenter
- name: Check rebootstrap
run: |

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.11)
cmake_minimum_required(VERSION 3.21)
option(USE_MIMALLOC "use mimalloc" ON)
@@ -147,6 +147,7 @@ ExternalProject_Add(
INSTALL_COMMAND ""
DEPENDS stage2
EXCLUDE_FROM_ALL ON
STEP_TARGETS configure
)
# targets forwarded to appropriate stages
@@ -157,6 +158,25 @@ add_custom_target(update-stage0-commit COMMAND $(MAKE) -C stage1 update-stage0-c
add_custom_target(test COMMAND $(MAKE) -C stage1 test DEPENDS stage1)
add_custom_target(
bench
COMMAND $(MAKE) -C stage2
COMMAND $(MAKE) -C stage2 -j1 bench
DEPENDS stage2
)
add_custom_target(
bench-part1
COMMAND $(MAKE) -C stage2
COMMAND $(MAKE) -C stage2 -j1 bench-part1
DEPENDS stage2
)
add_custom_target(
bench-part2
COMMAND $(MAKE) -C stage2
COMMAND $(MAKE) -C stage2 -j1 bench-part2
DEPENDS stage2
)
add_custom_target(clean-stdlib COMMAND $(MAKE) -C stage1 clean-stdlib DEPENDS stage1)
install(CODE "execute_process(COMMAND make -C stage1 install)")

View File

@@ -1,5 +1,9 @@
# Test Suite
**Warning:** This document is partially outdated.
It describes the old test suite, which is currently in the process of being replaced.
The new test suite's documentation can be found at [`tests/README.md`](../../tests/README.md).
After [building Lean](../make/index.md) you can run all the tests using
```
cd build/release

View File

@@ -83,7 +83,7 @@ def main (args : List String) : IO Unit := do
lastRSS? := some rss
let avgRSSDelta := totalRSSDelta / (n - 2)
IO.println s!"avg-reelab-rss-delta: {avgRSSDelta}"
IO.println s!"measurement: avg-reelab-rss-delta {avgRSSDelta*1024} b"
let _ ← Ipc.collectDiagnostics requestNo uri versionNo
(← Ipc.stdin).writeLspMessage (Message.notification "exit" none)

View File

@@ -82,7 +82,7 @@ def main (args : List String) : IO Unit := do
lastRSS? := some rss
let avgRSSDelta := totalRSSDelta / (n - 2)
IO.println s!"avg-reelab-rss-delta: {avgRSSDelta}"
IO.println s!"measurement: avg-reelab-rss-delta {avgRSSDelta*1024} b"
let _ ← Ipc.collectDiagnostics requestNo uri versionNo
Ipc.shutdown requestNo

View File

@@ -9,5 +9,5 @@ find -regex '.*/CMakeLists\.txt\(\.in\)?\|.*\.cmake\(\.in\)?' \
! -path "./stage0/*" \
-exec \
uvx gersemi --in-place --line-length 120 --indent 2 \
--definitions src/cmake/Modules/ src/CMakeLists.txt \
--definitions src/cmake/Modules/ src/CMakeLists.txt tests/CMakeLists.txt \
-- {} +

View File

@@ -1,6 +1,4 @@
cmake_minimum_required(VERSION 3.10)
cmake_policy(SET CMP0054 NEW)
cmake_policy(SET CMP0110 NEW)
cmake_minimum_required(VERSION 3.21)
if(NOT CMAKE_GENERATOR MATCHES "Unix Makefiles")
message(FATAL_ERROR "The only supported CMake generator at the moment is 'Unix Makefiles'")
endif()

View File

@@ -38,7 +38,7 @@ def coercionsBannedInCore : Array Name := #[``optionCoe, ``instCoeSubarrayArray]
def coeLinter : Linter where
run := fun _ => do
let mainModule ← getMainModule
let isCoreModule := mainModule = `lean.run.linterCoe (mainModule.getRoot [`Init, `Std])
let isCoreModule := mainModule = `elab.linterCoe (mainModule.getRoot [`Init, `Std])
let shouldWarnOnDeprecated := getLinterValue linter.deprecatedCoercions (← getLinterOptions)
let trees ← Elab.getInfoTrees
for tree in trees do

16
tests/.gitignore vendored
View File

@@ -1 +1,15 @@
*.olean
# Generated by cmake
/env_test.sh
/env_bench.sh
# Created by test suite
*.out.produced
*.exit.produced
# Created by bench suite
*.measurements.jsonl
measurements.jsonl
# Created by compile tests
*.lean.c
*.lean.out

View File

@@ -1,27 +1,182 @@
#################
## Environment ##
#################
# MSYS2 bash usually handles Windows paths relatively well, but not when putting them in the PATH
string(REGEX REPLACE "^([a-zA-Z]):" "/\\1" LEAN_BIN "${CMAKE_BINARY_DIR}/bin")
# Environment variables
set(TEST_VARS "${LEAN_TEST_VARS}")
string(APPEND TEST_VARS " PATH=${LEAN_BIN}:$PATH")
# Test scripts can use these to find other parts of the repo, e.g. "$TEST_DIR/measure.py"
string(APPEND TEST_VARS " STAGE='${STAGE}'") # Using this should not normally be necessary
string(APPEND TEST_VARS " SRC_DIR='${CMAKE_SOURCE_DIR}'")
string(APPEND TEST_VARS " TEST_DIR='${CMAKE_CURRENT_SOURCE_DIR}'")
string(APPEND TEST_VARS " BUILD_DIR='${CMAKE_BINARY_DIR}'")
string(APPEND TEST_VARS " SCRIPT_DIR='${CMAKE_SOURCE_DIR}/../script'")
# Use the current stage's lean binary instead of whatever lake thinks we want
string(APPEND TEST_VARS " PATH='${LEAN_BIN}':\"$PATH\"")
string(APPEND TEST_VARS " LEANC_OPTS='${LEANC_OPTS}'")
# LEANC_OPTS in CXX is necessary for macOS c++ to find its headers
string(APPEND TEST_VARS " CXX='${CMAKE_CXX_COMPILER} ${LEANC_OPTS}'")
add_test(lean_help1 "${CMAKE_BINARY_DIR}/bin/lean" --help)
add_test(lean_help2 "${CMAKE_BINARY_DIR}/bin/lean" -h)
add_test(lean_version1 "${CMAKE_BINARY_DIR}/bin/lean" --version)
add_test(lean_version2 "${CMAKE_BINARY_DIR}/bin/lean" --v)
add_test(lean_ghash1 "${CMAKE_BINARY_DIR}/bin/lean" -g)
add_test(lean_ghash2 "${CMAKE_BINARY_DIR}/bin/lean" --githash)
add_test(lean_unknown_option bash "${LEAN_SOURCE_DIR}/cmake/check_failure.sh" "${CMAKE_BINARY_DIR}/bin/lean" "-z")
add_test(
lean_unknown_file1
bash
"${LEAN_SOURCE_DIR}/cmake/check_failure.sh"
"${CMAKE_BINARY_DIR}/bin/lean"
"boofoo.lean"
)
string(APPEND TEST_VARS " TEST_BENCH=")
configure_file(env.sh.in "${CMAKE_CURRENT_SOURCE_DIR}/env_test.sh")
block()
string(APPEND TEST_VARS " TEST_BENCH=1")
configure_file(env.sh.in "${CMAKE_CURRENT_SOURCE_DIR}/env_bench.sh")
endblock()
######################
## Helper functions ##
######################
# Look up the run_test / run_bench scripts in DIR_ABS and report which exist.
# On success, sets RUN_TEST, RUN_BENCH, RUN_TEST_EXISTS and RUN_BENCH_EXISTS
# in the caller's scope; fails fatally if neither script is present.
function(check_test_bench_scripts DIR DIR_ABS)
  set(RUN_TEST "${DIR_ABS}/run_test")
  set(RUN_BENCH "${DIR_ABS}/run_bench")
  set(RUN_TEST_EXISTS FALSE)
  set(RUN_BENCH_EXISTS FALSE)
  if(EXISTS "${RUN_TEST}")
    set(RUN_TEST_EXISTS TRUE)
  endif()
  if(EXISTS "${RUN_BENCH}")
    set(RUN_BENCH_EXISTS TRUE)
  endif()
  if(NOT RUN_TEST_EXISTS AND NOT RUN_BENCH_EXISTS)
    message(FATAL_ERROR "${DIR}: Found neither a run_test nor a run_bench file")
    # Unreachable: FATAL_ERROR stops processing; kept as a defensive guard.
    return()
  endif()
  # Replace with return(PROPAGATE) if we ever update to cmake 3.25+
  set(RUN_TEST "${RUN_TEST}" PARENT_SCOPE)
  set(RUN_BENCH "${RUN_BENCH}" PARENT_SCOPE)
  set(RUN_TEST_EXISTS "${RUN_TEST_EXISTS}" PARENT_SCOPE)
  set(RUN_BENCH_EXISTS "${RUN_BENCH_EXISTS}" PARENT_SCOPE)
endfunction()
# Verify that the BENCH argument and the presence of a run_bench script agree:
# a pile/directory with a run_bench script must declare BENCH, and vice versa.
function(check_bench_argument DIR ARGS_BENCH RUN_BENCH_EXISTS)
  if(RUN_BENCH_EXISTS)
    if(NOT ARGS_BENCH)
      message(FATAL_ERROR "${DIR}: run_bench file found, BENCH argument must be specified")
    endif()
  elseif(ARGS_BENCH)
    message(FATAL_ERROR "${DIR}: BENCH argument specified but no run_bench file found")
  endif()
endfunction()
# Register a custom command that merges the measurement files given in ARGN
# into a single OUTPUT file via combine.py. With no inputs, warn and create an
# empty OUTPUT so targets depending on it still build.
function(add_combined_measurements OUTPUT)
  if(NOT ARGN)
    message(AUTHOR_WARNING "No input measurements provided for ${OUTPUT}")
    # Touch an empty file so dependents of OUTPUT don't fail.
    add_custom_command(OUTPUT "${OUTPUT}" COMMAND "${CMAKE_COMMAND}" -E touch "${OUTPUT}")
    return()
  endif()
  add_custom_command(
    OUTPUT "${OUTPUT}"
    DEPENDS "${ARGN}"
    # ${ARGN} is deliberately unquoted so the list expands to one path per argument.
    COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/combine.py" -o "${OUTPUT}" -- ${ARGN}
  )
endfunction()
# A test pile is a directory containing many test files, each of which
# represents a separate test (or benchmark). The directory may also contain
# additional files or subdirectories required by the individual test files.
#
# If a run_test script is present, each test file will be added as a test. Tests
# can be disabled on a per-file basis by creating a `<file>.no_test` file.
#
# If a run_bench script is present, each test file will be added as a benchmark.
# Benchmarks can be disabled on a per-file basis by creating a `<file>.no_bench`
# file. CMake expects the bench script to produce a `<file>.measurements.jsonl`
# file next to the test file. The individual measurements will be combined into
# a single `measurements.jsonl` file in the pile directory, whose path will be
# added to the list specified by the BENCH argument.
function(add_test_pile DIR GLOB)
  # BENCH (one-value keyword) names the caller's list variable that receives
  # the pile's combined measurements file.
  cmake_parse_arguments(ARGS "" BENCH "" ${ARGN})
  set(DIR_ABS "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
  # Defines RUN_TEST/RUN_BENCH and RUN_TEST_EXISTS/RUN_BENCH_EXISTS in this scope.
  check_test_bench_scripts("${DIR}" "${DIR_ABS}")
  check_bench_argument("${DIR}" "${ARGS_BENCH}" "${RUN_BENCH_EXISTS}")
  # The test files' individual measurement files that will later be combined
  # into a single measurements.jsonl file
  set(MEASUREMENTS_FILES "")
  # Iterate over all files matching the glob
  file(GLOB TEST_FILES "${DIR_ABS}/${GLOB}")
  foreach(FILE_ABS IN LISTS TEST_FILES)
    # Path relative to source directory
    cmake_path(RELATIVE_PATH FILE_ABS BASE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE FILE)
    # Path relative to pile directory
    cmake_path(RELATIVE_PATH FILE_ABS BASE_DIRECTORY "${DIR_ABS}" OUTPUT_VARIABLE FILE_NAME)
    if(RUN_TEST_EXISTS AND NOT EXISTS "${FILE_ABS}.no_test")
      add_test(NAME "${FILE}" WORKING_DIRECTORY "${DIR_ABS}" COMMAND "${RUN_TEST}" "${FILE_NAME}")
    endif()
    if(RUN_BENCH_EXISTS AND NOT EXISTS "${FILE_ABS}.no_bench")
      set(MEASUREMENTS_FILE "${FILE_ABS}.measurements.jsonl")
      list(APPEND MEASUREMENTS_FILES "${MEASUREMENTS_FILE}")
      # Remove stale output first so a failed bench run can't leave old data behind.
      add_custom_command(
        OUTPUT "${MEASUREMENTS_FILE}"
        WORKING_DIRECTORY "${DIR_ABS}"
        COMMAND "${CMAKE_COMMAND}" -E remove -f "${MEASUREMENTS_FILE}"
        COMMAND "${RUN_BENCH}" "${FILE_NAME}"
      )
    endif()
  endforeach()
  # Combine measurements
  if(RUN_BENCH_EXISTS)
    set(MEASUREMENTS_FILE "${DIR_ABS}/measurements.jsonl")
    # Append our combined file to the caller's BENCH list and propagate it up.
    list(APPEND "${ARGS_BENCH}" "${MEASUREMENTS_FILE}")
    set("${ARGS_BENCH}" "${${ARGS_BENCH}}" PARENT_SCOPE)
    add_combined_measurements("${MEASUREMENTS_FILE}" "${MEASUREMENTS_FILES}")
  endif()
endfunction()
# A test directory is a directory containing a single test (or benchmark),
# alongside any additional files or subdirectories required by that test.
function(add_test_dir DIR)
  # BENCH (one-value keyword) names the caller's list variable that receives
  # the directory's measurements file.
  cmake_parse_arguments(ARGS "" BENCH "" ${ARGN})
  set(DIR_ABS "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
  # Defines RUN_TEST/RUN_BENCH and RUN_TEST_EXISTS/RUN_BENCH_EXISTS in this scope.
  check_test_bench_scripts("${DIR}" "${DIR_ABS}")
  check_bench_argument("${DIR}" "${ARGS_BENCH}" "${RUN_BENCH_EXISTS}")
  # Add as test
  if(RUN_TEST_EXISTS)
    add_test(NAME "${DIR}" WORKING_DIRECTORY "${DIR_ABS}" COMMAND "${RUN_TEST}")
  endif()
  # Add as benchmark
  if(RUN_BENCH_EXISTS)
    set(MEASUREMENTS_FILE "${DIR_ABS}/measurements.jsonl")
    # Append our measurements file to the caller's BENCH list and propagate it up.
    list(APPEND "${ARGS_BENCH}" "${MEASUREMENTS_FILE}")
    set("${ARGS_BENCH}" "${${ARGS_BENCH}}" PARENT_SCOPE)
    # Remove stale output first so a failed bench run can't leave old data behind.
    add_custom_command(
      OUTPUT "${MEASUREMENTS_FILE}"
      WORKING_DIRECTORY "${DIR_ABS}"
      COMMAND "${CMAKE_COMMAND}" -E remove -f "${MEASUREMENTS_FILE}"
      COMMAND "${RUN_BENCH}"
    )
  endif()
endfunction()
# Benchmarks are split into two parts which should be roughly equal in total runtime.
# In radar, each part is run on a different runner.
set(PART1 "")
set(PART2 "")
##########################
## Tests and benchmarks ##
##########################
# LEAN TESTS
file(GLOB LEANTESTS "${LEAN_SOURCE_DIR}/../tests/lean/*.lean")
@@ -221,3 +376,31 @@ foreach(T ${LEANLAKETESTS})
)
endif()
endforeach(T)
add_test_pile(compile *.lean BENCH PART2)
add_test_pile(compile_bench *.lean BENCH PART2)
add_test_pile(elab *.lean)
add_test_pile(elab_bench *.lean BENCH PART2)
add_test_pile(elab_fail *.lean)
add_test_pile(misc *.sh)
add_test_pile(misc_bench *.sh BENCH PART2)
add_test_dir(bench/build BENCH PART1)
add_test_dir(bench/size BENCH PART1)
add_test_dir(lake_bench/inundation BENCH PART2)
#######################
## Benchmark targets ##
#######################
set(BENCH_MEASUREMENTS_PART1 "${CMAKE_CURRENT_SOURCE_DIR}/part1.measurements.jsonl")
set(BENCH_MEASUREMENTS_PART2 "${CMAKE_CURRENT_SOURCE_DIR}/part2.measurements.jsonl")
set(BENCH_MEASUREMENTS "${CMAKE_CURRENT_SOURCE_DIR}/measurements.jsonl")
add_combined_measurements("${BENCH_MEASUREMENTS_PART1}" "${PART1}")
add_combined_measurements("${BENCH_MEASUREMENTS_PART2}" "${PART2}")
add_combined_measurements("${BENCH_MEASUREMENTS}" "${BENCH_MEASUREMENTS_PART1}" "${BENCH_MEASUREMENTS_PART2}")
add_custom_target(bench-part1 DEPENDS lean "${BENCH_MEASUREMENTS_PART1}" COMMENT "Run benchmarks (part 1)")
add_custom_target(bench-part2 DEPENDS lean "${BENCH_MEASUREMENTS_PART2}" COMMENT "Run benchmarks (part 2)")
add_custom_target(bench DEPENDS lean "${BENCH_MEASUREMENTS}" COMMENT "Run all benchmarks")

251
tests/README.md Normal file
View File

@@ -0,0 +1,251 @@
# Test suite
This directory contains the lean test and benchmark suite.
It is currently in the process of being migrated to the framework described in this file.
Some tests still use the previous framework,
which is partially documented in [testing.md](../doc/dev/testing.md).
The test suite consists of two types of directories: Test directories and test piles.
A **test directory** is a directory containing a `run_test` and/or a `run_bench` script.
It represents a single test or benchmark, depending on which script is present.
The run scripts are executed once with their working directory set to the test directory.
A **test pile** is also a directory containing a `run_test` and/or a `run_bench` script.
Here however, each file of a directory-specific extension (usually `.lean`) represents a single test or benchmark.
The run scripts are executed once for each test file with their working directory set to the pile directory.
Often, additional supplementary files are placed next to the test files and interpreted by the run scripts.
## Directory structure
Benchmarks belonging to the old framework are not included in this description.
- `bench`:
A bunch of benchmarks and benchmarking related files,
most of which are not part of the test suite.
- `build`:
A benchmark that builds the lean stdlib and measures the per-file performance.
- `size`:
A benchmark that measures the sizes of a few different kinds of files.
- `compile`:
Tests that compile lean files and then execute the resulting binary, verifying the resulting output.
They also run the same lean file through the interpreter.
- `compile_bench`:
Benchmarks that compile lean files and measure the execution of the resulting binary,
as well as optionally run the same lean file through the interpreter.
- `elab`:
Tests that elaborate lean files without executing them, verifying the resulting output.
- `elab_fail`:
Like `elab`, but expecting an exit code of 1 instead of 0.
- `elab_bench`:
Like `elab`, but measuring the elaboration performance.
- `lake_bench`:
Benchmark directories that measure lake performance.
- `misc`:
A collection of miscellaneous small test scripts.
- `misc_bench`:
A collection of miscellaneous small benchmark scripts.
## How to run the test suite?
Run all tests using
```sh
CTEST_PARALLEL_LEVEL="$(nproc)" CTEST_OUTPUT_ON_FAILURE=1 \
make -C build/release -j "$(nproc)" test
```
Or rerun only the failed tests using
```sh
CTEST_PARALLEL_LEVEL="$(nproc)" CTEST_OUTPUT_ON_FAILURE=1 \
make -C build/release -j "$(nproc)" test ARGS="--rerun-failed"
```
Run an individual test by `cd`-ing into its directory and then using
```sh
./run_test # in a test directory
./run_test testfile # in a test pile
```
## How to run the bench suite?
Run the full benchmark suite using
```sh
make -C build/release -j "$(nproc)" bench # produces tests/measurements.jsonl
```
It is split into two roughly equal parts so it can be split among the benchmark runner machines.
Run each individual part using
```sh
make -C build/release -j "$(nproc)" bench-part1 # produces tests/part1.measurements.jsonl
make -C build/release -j "$(nproc)" bench-part2 # produces tests/part2.measurements.jsonl
```
Make sure not to specify `-j "$(nproc)"` when running the bench suite manually inside `build/release/stage<n>`.
Run an individual benchmark by `cd`-ing into its directory and then using
```sh
./run_bench # in a test directory
./run_bench testfile # in a test pile
```
## How to write a test or benchmark?
If your test fits one of the existing test piles:
1. Add your test file to the test pile.
2. Document the test via doc comment inside the test file.
3. Execute the test as documented above (or run the entire test suite).
4. Run [`fix_expected.py`](fix_expected.py) to create an `.out.expected` or `.out.ignored` file for the test.
5. Run [`lint.py`](lint.py).
If your test should be part of one of the existing test directories:
1. Modify the test directory to include your test.
2. Document the test via comment or `README.md`, following the test directory's conventions.
Otherwise, create a new test directory or pile:
1. Decide on a place to put the new directory.
2. Write a `run_test` and/or `run_bench` script.
3. Add the directory to the [`CMakeLists.txt`](CMakeLists.txt) file,
next to the other tests near the bottom.
4. Document the new directory in this readme file
by updating the directory structure section above.
5. Optionally update [`lint.py`](lint.py) if it makes sense.
## How to fix existing tests after your change breaks them?
If the tests break because the expected output differs from the actual output,
don't blindly copy the produced output into the expected output file.
Instead, execute [`fix_expected.py`](fix_expected.py) (you need to have `meld` installed).
This script allows you to review the changes one-by-one.
If the test output is very brittle, either modify the test so the output becomes less brittle,
or ignore the output by removing `.out.expected`,
re-running `fix_expected.py` and choosing to ignore the output.
Brittle output that should usually be ignored are detailed compiler debug traces
or inherently nondeterministic things like multithreading.
Some test directories or test piles strip or modify certain flaky or nondeterministic outputs
(e.g. benchmark timings, reference manual URLs).
## How to write a test or bench run script?
Test and bench scripts must be named `run_test` and `run_bench` respectively.
They must be executable and start with the shebang `#!/usr/bin/env bash`.
Immediately afterwards, they must source `env_test.sh` or `env_bench.sh` respectively
using a relative path.
The `env_*.sh` files set some build related environment variables,
plus a set of test suite related environment variables
documented at the top of [`CMakeLists.txt`](CMakeLists.txt).
The most notable ones are:
- `TEST_DIR`: Absolute path to the `tests` directory.
- `SCRIPT_DIR`: Absolute path to the `script` directory.
- `TEST_BENCH`: Set to `1` if we're currently executing a benchmark, unset otherwise.
Finally, the run script should source `"$TEST_DIR/util.sh"`,
which provides a few utility functions and also uses `set` to set sensible bash defaults.
See `util.sh` for the available utility functions.
The run scripts are always executed with their working directory set to their surrounding directory.
Inside a test pile, `run_test` and `run_bench` receive
a relative path to the file under test as their first (and only) argument.
Inside a test directory, they receive no arguments.
A test succeeds iff the `run_test` script exits with exit code 0.
A benchmark additionally must produce a measurements file:
Inside a test pile, `run_bench testfile` is expected to produce a `testfile.measurements.jsonl` file.
Inside a test directory, `run_bench` is expected to produce a `measurements.jsonl` file.
## The `elab*` test pile
These files are available to configure a test:
- `<file>.init.sh`:
This file is sourced at the start of the run script.
Configure the run script by setting bash variables here.
- `<file>.before.sh`:
This file is executed before the test/benchmark.
Create or set up temporary resources used by the test here.
Usually, it is better to create temporary files or directories inside the test itself,
so they're also available when opening the file in your editor.
- `<file>.after.sh`:
This file is executed after the test/benchmark.
Delete temporary resources used by the test here.
- `<file>.out.expected`:
The test fails if its stdout and stderr do not match this file's contents.
If this file isn't present, the test's output must be empty.
- `<file>.out.ignored`:
Ignore the test's output entirely; don't compare it to `<file>.out.expected`.
- `<file>.exit.expected`:
The test fails if its exit code doesn't match this file's contents.
If this file isn't present, the pile's default exit code is used instead.
These bash variables (set via `<file>.init.sh`) are used by the run script:
- `TEST_LEAN_ARGS`:
A bash array of additional arguments to the `lean` command.
## The `compile*` test pile
These files are available to configure a test:
- `<file>.(do|no)_(compile|interpret)`,
`<file>.(do|no)_(compile|interpret)_(test|bench)`:
Enable or disable the compiler or interpreter during testing or benchmarking.
The more specific files take precedence over the more generic files.
Instead of disabling the compiler during tests, consider reducing the problem size
by passing different command line parameters via `<file>.init.sh`.
- `<file>.init.sh`:
This file is sourced at the start of the run script.
Configure the run script by setting bash variables here.
- `<file>.before.sh`:
This file is executed before the test/benchmark.
Create or set up temporary resources used by the test here.
Usually, it is better to create temporary files or directories inside the test itself,
so they're also available when opening the file in your editor.
- `<file>.after.sh`:
This file is executed after the test/benchmark.
Delete temporary resources used by the test here.
- `<file>.out.expected`:
The test fails if its stdout and stderr do not match this file's contents.
If this file isn't present, the test's output must be empty.
- `<file>.out.ignored`:
Ignore the test's output entirely; don't compare it to `<file>.out.expected`.
- `<file>.exit.expected`:
The test fails if its exit code doesn't match this file's contents.
If this file isn't present, the test's exit code must be 0.
These bash variables (set via `<file>.init.sh`) are used by the run script:
- `TEST_LEAN_ARGS`:
A bash array of additional arguments to the `lean` command used to compile the lean file.
- `TEST_LEANC_ARGS`:
A bash array of additional arguments to the `leanc` command used to compile the c file.
- `TEST_LEANI_ARGS`:
A bash array of additional arguments to the `lean --run <file>` command used to interpret the lean file.
- `TEST_ARGS`:
A bash array of arguments to the compiled (or interpreted) program.
Check `TEST_BENCH` if you want to specify more intense parameters for benchmarks.

View File

@@ -1,24 +0,0 @@
# Lean 4 benchmark suite
This directory contains the new Lean 4 benchmark suite.
It is built around [radar](https://github.com/leanprover/radar)
and benchmark results can be viewed
on the [Lean FRO radar instance](https://radar.lean-lang.org/repos/lean4).
Benchmarks are organized into subdirectories.
Each benchmark directory must contain a script called `run` that executes the benchmark,
as well as any additional benchmark-specific required files.
Ideally, each benchmark directory also contains a `README.md` explaining the benchmark.
To execute the entire suite, run `tests/bench-radar/run` in the repo root.
To execute an individual benchmark, run `tests/bench-radar/<benchmark>/run` in the repo root.
All scripts output their measurements into the file `measurements.jsonl`.
Radar sums any duplicated measurements with matching metrics.
To post-process the `measurements.jsonl` file this way in-place,
run `tests/bench-radar/combine.py` in the repo root after executing the benchmark suite.
All scripts related to the new benchmark suite are contained in this directory.
The files at `tests/bench` belong to the old suite.
The `*.py` symlinks are only for convenience when editing the python scripts in VSCode,
so the python extensions (in particular pyrefly) treat them as python files.

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python3
import json
import subprocess
import sys
from pathlib import Path
def run(*args: str) -> None:
    """Execute a command, raising CalledProcessError if it exits non-zero."""
    subprocess.run([*args], check=True)
def run_stdout(*command: str, cwd: str | None = None) -> str:
result = subprocess.run(command, capture_output=True, encoding="utf-8", cwd=cwd)
if result.returncode != 0:
print(result.stdout, end="", file=sys.stdout)
print(result.stderr, end="", file=sys.stderr)
sys.exit(result.returncode)
return result.stdout
def main() -> None:
    # Publish a lakeprof HTML report for the current commit to speed.lean-lang.org.
    script_file = Path(__file__)
    template_file = script_file.parent / "lakeprof_report_template.html"
    # "@" is git shorthand for HEAD; the commit sha keys the upload location.
    sha = run_stdout("git", "rev-parse", "@").strip()
    base_url = f"https://speed.lean-lang.org/lean4-out/{sha}"
    # Generate the report from lakeprof data recorded under src/.
    report = run_stdout("lakeprof", "report", "-prc", cwd="src")
    with open(template_file) as f:
        template = f.read()
    # Fill in the template placeholders; base_url is JSON-encoded so it can be
    # embedded as a string literal in the template's JavaScript.
    template = template.replace("__BASE_URL__", json.dumps(base_url))
    template = template.replace("__LAKEPROF_REPORT__", report)
    with open("index.html", "w") as f:
        f.write(template)
    # Upload the report and the raw lakeprof artifacts via HTTP PUT (curl -T).
    run("curl", "-fT", "index.html", f"{base_url}/index.html")
    run("curl", "-fT", "src/lakeprof.log", f"{base_url}/lakeprof.log")
    run("curl", "-fT", "src/lakeprof.trace_event", f"{base_url}/lakeprof.trace_event")


if __name__ == "__main__":
    main()

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env bash
# Radar benchmark driver: builds the compiler stages, wraps the stage2 lean
# binary, and measures a stage3 build (plus lakeprof path analysis).
set -euxo pipefail

BENCH="tests/bench-radar"
STAGE2="build/release/stage2"
STAGE3="build/release/stage3"

# Build previous stages and warm up stage3
cmake --preset release
timeout -s KILL 1h time make -C build/release -j"$(nproc)" stage3

# Prepare a stage3 install tree with .olean files removed.
# NOTE(review): presumably consumed by a later size measurement — confirm.
pushd "$STAGE3"
mkdir install
make install DESTDIR=install
find lib -name "*.olean" -delete
popd

# Use stage2 binaries from now on
#
# Otherwise, tools like lakeprof use the global lean installation,
# which may not exist or be the right version.
export PATH="$PWD/$STAGE2/bin:$PATH"

# Substitute our own wrapper script
mv "$STAGE2/bin/lean" "$STAGE2/bin/lean.wrapped"
cp "$BENCH/build/lean_wrapper.py" "$STAGE2/bin/lean"

# Build stage3
"$BENCH/measure.py" -t build \
  -m cycles -m instructions -m maxrss -m task-clock -m wall-clock -- \
  lakeprof record -- \
  make -C build/release -j"$(nproc)" stage3

# Analyze lakeprof data
mv lakeprof.log src
pushd src
# jq extracts the final (longest) path's wall-clock time from lakeprof's JSON report.
lakeprof report -pj | jq '{metric: "build/lakeprof/longest build path//wall-clock", value: .[-1][2], unit: "s"}' -c >> ../measurements.jsonl
lakeprof report -rj | jq '{metric: "build/lakeprof/longest rebuild path//wall-clock", value: .[-1][2], unit: "s"}' -c >> ../measurements.jsonl
popd

# Upload lakeprof report
# Guarded to prevent accidental uploads (which wouldn't work anyways) during local runs.
if [ -f build_upload_lakeprof_report ]; then
  python3 "$BENCH/build/lakeprof_report_upload.py"
fi

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
from pathlib import Path

# The measurements file produced by the benchmark suite, in the current directory.
OUTFILE = Path() / "measurements.jsonl"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Combine duplicated measurements in {OUTFILE.name} the way radar does, by summing their values."
    )
    args = parser.parse_args()
    # Summed value per metric; dict insertion order preserves first appearance.
    values: dict[str, float] = {}
    # Unit per metric; the last seen unit wins.
    # NOTE(review): assumes duplicated metrics share a unit — confirm.
    units: dict[str, str | None] = {}
    with open(OUTFILE, "r") as f:
        for line in f:
            data = json.loads(line)
            metric = data["metric"]
            values[metric] = values.get(metric, 0) + data["value"]
            units[metric] = data.get("unit")
    # Rewrite the file in place with one combined entry per metric.
    with open(OUTFILE, "w") as f:
        for metric, value in values.items():
            unit = units.get(metric)
            data = {"metric": metric, "value": value}
            if unit is not None:
                data["unit"] = unit
            f.write(f"{json.dumps(data)}\n")

View File

@@ -1,166 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
import os
import resource
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
@dataclass
class PerfMetric:
    """A metric read from a `perf stat` counter."""

    # perf event name passed via `-e`
    event: str
    # fallback multiplier for the raw counter value (used unless perf
    # reports a unit listed in PERF_UNITS)
    factor: float = 1
    # unit emitted into the measurements output, if any
    unit: str | None = None


@dataclass
class RusageMetric:
    """A metric read from resource.getrusage of the child processes."""

    # attribute name on the rusage result, e.g. "ru_maxrss"
    name: str
    # multiplier applied to the raw rusage value
    factor: float = 1
    # unit emitted into the measurements output, if any
    unit: str | None = None


# Metrics measured via `perf stat`.
PERF_METRICS = {
    "task-clock": PerfMetric("task-clock", factor=1e-9, unit="s"),
    "wall-clock": PerfMetric("duration_time", factor=1e-9, unit="s"),
    "instructions": PerfMetric("instructions"),
    "cycles": PerfMetric("cycles"),
}
# Scale factors for units perf may report alongside a counter value.
PERF_UNITS = {
    "msec": 1e-3,
    "ns": 1e-9,
}
# Metrics measured via getrusage.
# NOTE(review): ru_maxrss is KiB on Linux; factor 1000 treats it as kB — confirm intended.
RUSAGE_METRICS = {
    "maxrss": RusageMetric("ru_maxrss", factor=1000, unit="B"), # KiB on linux
}
ALL_METRICS = {**PERF_METRICS, **RUSAGE_METRICS}
def measure_perf(cmd: list[str], events: list[str]) -> dict[str, tuple[float, str]]:
    """Run `cmd` under `perf stat` and return {event name: (value, unit)}.

    Exits this process with the child's status if the command fails.
    """
    with tempfile.NamedTemporaryFile() as tmp:
        # perf writes its JSON report to tmp's path (-o); the command's own
        # stdout/stderr pass through untouched.
        cmd = [
            *["perf", "stat", "-j", "-o", tmp.name],
            *[arg for event in events for arg in ["-e", event]],
            *["--", *cmd],
        ]
        # Execute command
        env = os.environ.copy()
        env["LC_ALL"] = "C"  # or else perf may output syntactically invalid json
        result = subprocess.run(cmd, env=env)
        if result.returncode != 0:
            sys.exit(result.returncode)
        # Collect results. Reading through the still-open handle picks up what
        # perf wrote to the path (handle offset is still 0; assumes perf
        # truncates/overwrites the same inode -- TODO confirm).
        perf = {}
        for line in tmp:
            data = json.loads(line)
            # Non-counter lines (headers, metadata) lack these keys; skip them.
            if "event" in data and "counter-value" in data:
                perf[data["event"]] = float(data["counter-value"]), data["unit"]
        return perf
@dataclass
class Result:
category: str
value: float
unit: str | None
def fmt(self, topic: str) -> str:
metric = f"{topic}//{self.category}"
if self.unit is None:
return json.dumps({"metric": metric, "value": self.value})
return json.dumps({"metric": metric, "value": self.value, "unit": self.unit})
def measure(cmd: list[str], metrics: list[str]) -> list[Result]:
    """Run `cmd` once and collect the requested metrics via perf and rusage.

    Raises on unknown metric names; exits with the child's status if the
    command fails. Returns one Result per matching metric.
    """
    # Check args
    unknown_metrics = []
    for metric in metrics:
        if metric not in RUSAGE_METRICS and metric not in PERF_METRICS:
            unknown_metrics.append(metric)
    if unknown_metrics:
        raise Exception(f"unknown metrics: {', '.join(unknown_metrics)}")

    # Prepare perf events
    events: list[str] = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            events.append(info.event)

    # Measure. RUSAGE_CHILDREN reflects terminated children of this process,
    # which includes the command perf just ran.
    perf = measure_perf(cmd, events)
    rusage = resource.getrusage(resource.RUSAGE_CHILDREN)

    # Extract results
    results = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            if info.event in perf:
                value, unit = perf[info.event]
            else:
                # Without the corresponding permissions,
                # we only get access to the userspace versions of the counters.
                value, unit = perf[f"{info.event}:u"]
            # Scale by the unit perf reported if we recognize it, otherwise by
            # the metric's own fallback factor.
            value *= PERF_UNITS.get(unit, info.factor)
            results.append(Result(metric, value, info.unit))
        if info := RUSAGE_METRICS.get(metric):
            value = getattr(rusage, info.name) * info.factor
            results.append(Result(metric, value, info.unit))
    return results
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Measure resource usage of a command using perf and rusage."
    )
    parser.add_argument(
        "-t",
        "--topic",
        action="append",
        default=[],
        help="topic prefix for the metrics",
    )
    parser.add_argument(
        "-m",
        "--metric",
        action="append",
        default=[],
        help=f"metrics to measure. Can be specified multiple times. Available metrics: {', '.join(sorted(ALL_METRICS))}",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=Path() / "measurements.jsonl",
    )
    parser.add_argument(
        "cmd",
        nargs="*",
        help="command to measure the resource usage of",
    )
    args = parser.parse_args()
    topics: list[str] = args.topic
    metrics: list[str] = args.metric
    output: Path = args.output
    cmd: list[str] = args.cmd

    results = measure(cmd, metrics)
    # Each measurement is emitted once per topic prefix, appended in JSON
    # Lines format so repeated invocations accumulate into one file.
    with open(output, "a") as f:
        for result in results:
            for topic in topics:
                f.write(f"{result.fmt(topic)}\n")

View File

@@ -1,108 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
import subprocess
import sys
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
REPO = Path()
OUTFILE = REPO / "measurements.jsonl"
OUTFILE_TMP = REPO / "measurements_repeated_tmp.jsonl"
@dataclass
class Measurement:
metric: str
value: float
unit: str | None
@classmethod
def from_json_str(cls, s: str) -> "Measurement":
data = json.loads(s.strip())
return cls(data["metric"], data["value"], data.get("unit"))
def to_json_str(self) -> str:
if self.unit is None:
return json.dumps({"metric": self.metric, "value": self.value})
return json.dumps(
{"metric": self.metric, "value": self.value, "unit": self.unit}
)
@contextmanager
def temporarily_move_outfile():
    """Park any existing OUTFILE at OUTFILE_TMP for the duration of the block.

    Inside the block the command under test writes a fresh OUTFILE. Callers
    must read that file before the block exits: the finally-rename restores
    the original OUTFILE on top of it, discarding the per-run file.
    """
    if OUTFILE_TMP.exists():
        # A leftover tmp file suggests a previous run died mid-flight; refuse
        # to proceed rather than silently overwrite its data.
        raise Exception(f"{OUTFILE_TMP} already exists")
    OUTFILE.touch()  # ensure the rename below succeeds even on a first run
    OUTFILE.rename(OUTFILE_TMP)
    try:
        yield
    finally:
        OUTFILE_TMP.rename(OUTFILE)
def read_measurements_from_outfile() -> list[Measurement]:
    """Parse every JSON Lines record in OUTFILE into a Measurement."""
    with open(OUTFILE, "r") as f:
        return [Measurement.from_json_str(line) for line in f]


def write_measurements_to_outfile(measurements: list[Measurement]) -> None:
    """Append the given measurements to OUTFILE, one JSON record per line."""
    serialized = "".join(f"{m.to_json_str()}\n" for m in measurements)
    with open(OUTFILE, "a") as f:
        f.write(serialized)
def run_once(cmd: list[str]) -> list[Measurement]:
    """Run `cmd` once and return the measurements it wrote to OUTFILE.

    The pre-existing OUTFILE is parked aside so the command starts from an
    empty file; it is restored afterwards (see temporarily_move_outfile).
    Exits this process with the child's status if the command fails.
    """
    with temporarily_move_outfile():
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            sys.exit(proc.returncode)
        # Must be read inside the with-block, before the original OUTFILE is
        # restored over the per-run file.
        return read_measurements_from_outfile()
def repeatedly(cmd: list[str], iterations: int) -> list[Measurement]:
    """Run `cmd` `iterations` times and average the measurements per metric.

    The most recently seen Measurement object for each metric is kept (so its
    unit wins); values are summed across runs and divided by `iterations`,
    even for metrics that did not appear in every run.
    """
    totals: dict[str, Measurement] = {}
    for _ in range(iterations):
        for current in run_once(cmd):
            previous = totals.get(current.metric)
            if previous is not None:
                current.value += previous.value
            totals[current.metric] = current
    averaged = list(totals.values())
    for measurement in averaged:
        measurement.value /= iterations
    return averaged
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Repeatedly run a command, averaging the resulting measurements in {OUTFILE.name}.",
    )
    parser.add_argument(
        "-n",
        "--iterations",
        type=int,
        default=5,
        help="number of iterations",
    )
    parser.add_argument(
        "cmd",
        nargs="*",
        help="command to repeatedly run",
    )
    args = parser.parse_args()
    iterations: int = args.iterations
    cmd: list[str] = args.cmd

    # Average over all runs, then append the result to the restored OUTFILE.
    measurements = repeatedly(cmd, iterations)
    write_measurements_to_outfile(measurements)

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Run the radar benchmark suite, one benchmark per sub-directory.
set -euo pipefail

echo "Running benchmark: build"
tests/bench-radar/build/run

# The size benchmark expects to run after the build benchmark (it measures
# the files that build produced).
echo "Running benchmark: size"
tests/bench-radar/size/run

View File

@@ -1,42 +0,0 @@
# The `size` benchmark
This benchmark measures the number and size of a few kinds of files.
It expects to be executed after the `build` benchmark.
The following general metrics are collected:
- `size/libleanshared.so//bytes`
- `size/libleanshared.so//dynamic symbols`
- `size/libLake_shared.so//dynamic symbols`
The following metrics are collected from the entire build process:
- `size/all/.c//files`
- `size/all/.c//lines`
- `size/all/.cpp//files`
- `size/all/.cpp//lines`
- `size/all/.lean//files`
- `size/all/.lean//lines`
- `size/all/.ilean//files`
- `size/all/.ilean//bytes`
- `size/all/.olean//files`
- `size/all/.olean//bytes`
- `size/all/.olean.server//files`
- `size/all/.olean.server//bytes`
- `size/all/.olean.private//files`
- `size/all/.olean.private//bytes`
- `size/all/.ir//files`
- `size/all/.ir//bytes`
The following metrics are collected only for the `Init` library:
- `size/init/.olean//files`
- `size/init/.olean//bytes`
- `size/init/.olean.server//files`
- `size/init/.olean.server//bytes`
- `size/init/.olean.private//files`
- `size/init/.olean.private//bytes`
The following metric measures the size of all files produced by a `make install`.
- `size/install//bytes`

View File

@@ -1,95 +0,0 @@
#!/usr/bin/env python3
import json
import subprocess
from pathlib import Path
from typing import Iterable
OUTFILE = Path() / "measurements.jsonl"
SRC = Path("src")
STAGE3 = Path("build/release/stage3")
STAGE3_TEMP = STAGE3 / "lib" / "temp"
STAGE3_LEAN = STAGE3 / "lib" / "lean"
def output_result(metric: str, value: float, unit: str | None = None) -> None:
    """Append one measurement record to OUTFILE in JSON Lines format."""
    record: dict = {"metric": metric, "value": value}
    if unit is not None:
        record["unit"] = unit
    with open(OUTFILE, "a") as f:
        f.write(f"{json.dumps(record)}\n")
def measure_bytes(topic: str, paths: Iterable[Path]) -> None:
    """Report the file count and combined byte size of `paths`."""
    sizes = [p.stat().st_size for p in paths]
    output_result(f"{topic}//files", len(sizes))
    output_result(f"{topic}//bytes", sum(sizes), "B")


def measure_lines(topic: str, paths: Iterable[Path]) -> None:
    """Report the file count and combined line count of `paths`."""
    line_counts = []
    for p in paths:
        with open(p) as f:
            line_counts.append(sum(1 for _ in f))
    output_result(f"{topic}//files", len(line_counts))
    output_result(f"{topic}//lines", sum(line_counts))
def measure_bytes_for_file(topic: str, path: Path) -> int:
    """Report and return the size of a single file in bytes."""
    size = path.stat().st_size
    output_result(f"{topic}//bytes", size, "B")
    return size


def measure_bytes_for_dir(topic: str, path: Path) -> int:
    """Report and return the combined byte size of all files under `path`."""
    # Use a distinct loop variable: the original shadowed the `path` parameter
    # inside the loop, which is error-prone if `path` is used after the loop.
    total = 0
    for entry in path.rglob("*"):
        if entry.is_file():
            total += entry.stat().st_size
    output_result(f"{topic}//bytes", total, "B")
    return total
def measure_symbols_for_file(topic: str, path: Path) -> int:
    """Report and return the number of exported (dynamic) symbols in `path`.

    Counts the lines `nm` prints for symbols that are both external and
    defined in the binary itself; raises CalledProcessError if nm fails.
    """
    result = subprocess.run(
        ["nm", "--extern-only", "--defined-only", path],
        capture_output=True,
        encoding="utf-8",
        check=True,
    )
    count = len(result.stdout.splitlines())
    output_result(f"{topic}//dynamic symbols", count)
    return count
if __name__ == "__main__":
    # Shared libraries: byte size and exported-symbol counts.
    measure_bytes_for_file("size/libleanshared.so", STAGE3_LEAN / "libleanshared.so")
    measure_symbols_for_file("size/libleanshared.so", STAGE3_LEAN / "libleanshared.so")
    measure_symbols_for_file(
        "size/libLake_shared.so", STAGE3_LEAN / "libLake_shared.so"
    )
    # Everything placed under the stage3 install tree.
    measure_bytes_for_dir("size/install", STAGE3 / "install")

    # Stdlib
    measure_lines("size/all/.c", STAGE3_TEMP.glob("**/*.c"))
    measure_bytes("size/all/.ir", STAGE3_LEAN.glob("**/*.ir"))
    measure_lines("size/all/.cpp", SRC.glob("**/*.cpp"))
    measure_lines("size/all/.lean", SRC.glob("**/*.lean"))
    measure_bytes("size/all/.ilean", STAGE3_LEAN.glob("**/*.ilean"))
    measure_bytes("size/all/.olean", STAGE3_LEAN.glob("**/*.olean"))
    measure_bytes("size/all/.olean.server", STAGE3_LEAN.glob("**/*.olean.server"))
    measure_bytes("size/all/.olean.private", STAGE3_LEAN.glob("**/*.olean.private"))
    # Init only (subset of the size/all/.olean* numbers above)
    measure_bytes("size/init/.olean", STAGE3_LEAN.glob("Init/**/*.olean"))
    measure_bytes("size/init/.olean.server", STAGE3_LEAN.glob("Init/**/*.olean.server"))
    measure_bytes(
        "size/init/.olean.private", STAGE3_LEAN.glob("Init/**/*.olean.private")
    )

View File

@@ -1 +0,0 @@
run

View File

@@ -1,4 +1,3 @@
/build
*.out
*.lean.c
*.lean.linked.bc

View File

@@ -1 +0,0 @@
14

View File

@@ -1,7 +1,8 @@
# The `build` benchmark
This benchmark executes a complete build of the stage3 stdlib
and collects global and per-module metrics.
This benchmark executes a complete build of the stage3 stdlib from stage2 and
collects global and per-module metrics. This is different from most other
benchmarks, which benchmark the stage the bench suite is being executed in.
The following metrics are collected by a wrapper around the entire build process:

View File

@@ -0,0 +1 @@
../../lean_wrapper.py

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# Render an HTML lakeprof report for the current commit and upload it (plus
# the raw lakeprof artifacts) to speed.lean-lang.org.
import json
import subprocess
import sys
from pathlib import Path

# Determine paths relative to the current file.
script_file = Path(__file__)
src_dir = script_file.parent.parent.parent.parent / "src"
template_file = script_file.parent / "lakeprof_report_template.html"
def run_stdout(*command: str, cwd: Path | None = None) -> str:
result = subprocess.run(command, capture_output=True, encoding="utf-8", cwd=cwd)
if result.returncode != 0:
print(result.stdout, end="", file=sys.stdout)
print(result.stderr, end="", file=sys.stderr)
sys.exit(result.returncode)
return result.stdout
# The commit being benchmarked; the upload location is keyed on it.
sha = run_stdout("git", "rev-parse", "@", cwd=src_dir).strip()
base_url = f"https://speed.lean-lang.org/lean4-out/{sha}"

# Splice the lakeprof report and base URL into the HTML template.
report = run_stdout("lakeprof", "report", "-prc", cwd=src_dir)
template = template_file.read_text()
template = template.replace("__BASE_URL__", json.dumps(base_url))
template = template.replace("__LAKEPROF_REPORT__", report)
(src_dir / "index.html").write_text(template)


def upload(file: Path) -> None:
    # curl -f: fail on HTTP errors; -T: upload (PUT) the file to the URL.
    subprocess.run(["curl", "-fT", file, f"{base_url}/{file.name}"], check=True)


upload(src_dir / "index.html")
upload(src_dir / "lakeprof.log")
upload(src_dir / "lakeprof.trace_event")

View File

@@ -2,24 +2,27 @@
import argparse
import json
import os
import re
import subprocess
import sys
from collections import Counter
from pathlib import Path
NAME = "build"
REPO = Path("..")
BENCH = REPO / "tests" / "bench-radar"
STAGE2 = REPO / "build" / "release" / "stage2"
OUT = REPO / "measurements.jsonl"
# Global paths
TEST_DIR = Path(os.environ["TEST_DIR"])
WRAPPER_OUT = Path(os.environ["WRAPPER_OUT"])
WRAPPER_PREFIX = Path(os.environ["WRAPPER_PREFIX"])
# Other config
BENCHMARK = "build"
def save_result(metric: str, value: float, unit: str | None = None) -> None:
def save_measurement(metric: str, value: float, unit: str | None = None) -> None:
data = {"metric": metric, "value": value}
if unit is not None:
data["unit"] = unit
with open(OUT, "a") as f:
with open(WRAPPER_OUT, "a") as f:
f.write(f"{json.dumps(data)}\n")
@@ -46,7 +49,7 @@ def get_module(setup: Path) -> str:
def count_lines(module: str, path: Path) -> None:
with open(path) as f:
lines = sum(1 for _ in f)
save_result(f"{NAME}/module/{module}//lines", lines)
save_measurement(f"{BENCHMARK}/module/{module}//lines", lines)
def count_bytes(module: str, path: Path, suffix: str) -> None:
@@ -54,18 +57,18 @@ def count_bytes(module: str, path: Path, suffix: str) -> None:
bytes = path.with_suffix(suffix).stat().st_size
except FileNotFoundError:
return
save_result(f"{NAME}/module/{module}//bytes {suffix}", bytes, "B")
save_measurement(f"{BENCHMARK}/module/{module}//bytes {suffix}", bytes, "B")
def run_lean(module: str) -> None:
stdout, stderr = run_capture(
f"{BENCH}/measure.py",
*("-t", f"{NAME}/module/{module}"),
*("-o", f"{OUT}"),
f"{TEST_DIR}/measure.py",
*("-t", f"{BENCHMARK}/module/{module}"),
*("-o", f"{WRAPPER_OUT}", "-a"),
*("-m", "instructions"),
*("-m", "cycles"),
"--",
f"{STAGE2}/bin/lean.wrapped",
"lean",
*("--profile", "-Dprofiler.threshold=9999999"),
"--stat",
*sys.argv[1:],
@@ -79,7 +82,7 @@ def run_lean(module: str) -> None:
seconds = float(match.group(2))
if match.group(3) == "ms":
seconds = seconds / 1000
save_result(f"{NAME}/profile/{name}//wall-clock", seconds, "s")
save_measurement(f"{BENCHMARK}/profile/{name}//wall-clock", seconds, "s")
# Output of `lean --stat`
stat = Counter[str]()
@@ -91,12 +94,20 @@ def run_lean(module: str) -> None:
for name, count in stat.items():
if count > 0:
if name.endswith("bytes"):
save_result(f"{NAME}/stat/{name}//bytes", count, "B")
save_measurement(f"{BENCHMARK}/stat/{name}//bytes", count, "B")
else:
save_result(f"{NAME}/stat/{name}//amount", count)
save_measurement(f"{BENCHMARK}/stat/{name}//amount", count)
def main() -> None:
if sys.argv[1:] == ["--print-prefix"]:
print(WRAPPER_PREFIX)
return
if sys.argv[1:] == ["--githash"]:
run("lean", "--githash")
return
parser = argparse.ArgumentParser()
parser.add_argument("lean", type=Path)
parser.add_argument("--setup", type=Path)

72
tests/bench/build/run_bench Executable file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/env bash
# Benchmark a full build of the next stage's stdlib using the current stage.
source ../../env_bench.sh
source "$TEST_DIR/util.sh"

# This benchmark builds stage N+1 from within stage N's bench environment.
STAGE_THIS="stage$STAGE"
STAGE_NEXT="stage$((STAGE + 1))"
BUILD_ROOT="$(realpath "$BUILD_DIR/..")"
BUILD_THIS="$(realpath "$BUILD_ROOT/$STAGE_THIS")"
BUILD_NEXT="$(realpath "$BUILD_ROOT/$STAGE_NEXT")"
OUT="$(realpath measurements.jsonl)"
# NOTE(review): BUILD_THIS is defined but not referenced below -- confirm
# whether util.sh or a sourced file still needs it.

echo
echo ">"
echo "> Configuring $STAGE_NEXT..."
echo ">"

# Building a stage mostly affects files in that stage's build directory.
# However, the bench suite runs inside the source directory for developer UX
# reasons, so some stage-specific bench suite files are generated in the source
# directory (namely the env_*.sh files).
#
# To avoid messing up the rest of the bench suite, we restore those files to
# STAGE_THIS's versions immediately after we configure STAGE_NEXT. Yes, this is
# a big hack, but it allows running the build benchmark as part of the bench
# suite instead of completely separately.
#
# Configuring STAGE_NEXT also builds all stages up to and including STAGE_THIS.
make -C "$BUILD_ROOT" -j"$(nproc)" "$STAGE_NEXT-configure"
make -C "$BUILD_ROOT" -j"$(nproc)" "$STAGE_THIS-configure"

echo
echo ">"
echo "> Warming up $STAGE_NEXT..."
echo ">"

# Do one full build, then delete the .oleans so the measured build below has
# real work to do; discard any measurements the warmup produced.
make -C "$BUILD_NEXT" -j"$(nproc)"
find "$BUILD_NEXT/lib" -name "*.olean" -delete
rm -f measurements.jsonl

echo
echo ">"
echo "> Building $STAGE_NEXT..."
echo ">"

# LAKE_OVERRIDE_LEAN/LEAN route lean invocations through the fake_root
# wrapper; measure.py records whole-build metrics to $OUT; lakeprof records
# build timing data for the analysis step below.
LAKE_OVERRIDE_LEAN=true LEAN="$(realpath fake_root/bin/lean)" \
    WRAPPER_PREFIX="$(realpath fake_root)" WRAPPER_OUT="$OUT" \
    lakeprof record -- \
    "$TEST_DIR/measure.py" -t build -d -a -- \
    make -C "$BUILD_NEXT" -j"$(nproc)"

echo
echo ">"
echo "> Analyzing lakeprof data..."
echo ">"

# Lakeprof must be executed in the src dir because it obtains some metadata by
# calling lake in its current working directory.
mv lakeprof.log "$SRC_DIR"
pushd "$SRC_DIR"
# Extract the longest (re)build path's wall-clock seconds from the JSON
# report (last row, third field -- TODO confirm against lakeprof's format).
lakeprof report -pj | jq '{metric: "build/lakeprof/longest build path//wall-clock", value: .[-1][2], unit: "s"}' -c >> "$OUT"
lakeprof report -rj | jq '{metric: "build/lakeprof/longest rebuild path//wall-clock", value: .[-1][2], unit: "s"}' -c >> "$OUT"
popd

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Thin wrapper that runs the lakeprof report upload inside the bench env.
source ../../env_bench.sh
source "$TEST_DIR/util.sh"

# This should run in the same environment as run_bench, otherwise `lakeprof`
# will use the `lake` from the global system `elan` install and not the one from
# the current commit.
#
# Once an elan with support for relative toolchains has been widely released and
# been adopted by this repo, this wrapper script should no longer be necessary
# and the upload script can be called directly.
./lakeprof_report_upload.py

View File

@@ -1 +0,0 @@
15

View File

@@ -1 +0,0 @@
9

View File

@@ -1,2 +0,0 @@
/build
/test

View File

@@ -1 +0,0 @@
ex-50-50-1.leq

View File

@@ -1 +0,0 @@
5000

View File

@@ -1 +0,0 @@
80

View File

@@ -1 +0,0 @@
100000

View File

@@ -1 +0,0 @@
100000 10

View File

@@ -0,0 +1,4 @@
# Size measurements
This benchmark measures the number and size of a few kinds of files
produced by the current stage's build.

107
tests/bench/size/measure_sizes.py Executable file
View File

@@ -0,0 +1,107 @@
import argparse
import json
import subprocess
from pathlib import Path

# CLI: measure_sizes.py <src> <build> <install> <output>
# (see run_bench, which passes "$SRC_DIR" "$BUILD_DIR" install measurements.jsonl)
parser = argparse.ArgumentParser()
parser.add_argument("src", type=Path)      # source tree, for .cpp/.lean line counts
parser.add_argument("build", type=Path)    # current stage's build directory
parser.add_argument("install", type=Path)  # DESTDIR of a `make install`
parser.add_argument("output", type=Path)   # measurements.jsonl to append to
args = parser.parse_args()
src: Path = args.src
build: Path = args.build
install: Path = args.install
output: Path = args.output

build_temp = build / "lib" / "temp"
build_lean = build / "lib" / "lean"
def output_measurement(
    topic: str,
    category: str,
    value: float,
    unit: str | None = None,
) -> None:
    """Append a `{topic}//{category}` record to the output JSON Lines file."""
    record: dict = {"metric": f"{topic}//{category}", "value": value}
    if unit is not None:
        record["unit"] = unit
    with open(output, "a") as f:
        f.write(f"{json.dumps(record)}\n")
def measure_bytes_for_file(topic: str, path: Path, count: bool = True) -> None:
    """Report the byte size of one file (plus a file count of 1 if `count`)."""
    # Renamed from `bytes`, which shadowed the builtin type.
    size = path.stat().st_size
    output_measurement(topic, "bytes", size, "B")
    if count:
        output_measurement(topic, "files", 1)


def measure_bytes(topic: str, *paths: Path, count: bool = True) -> None:
    """Report byte sizes for every regular file among `paths`; others are skipped."""
    for candidate in paths:
        if candidate.is_file():
            measure_bytes_for_file(topic, candidate, count=count)
def measure_lines_for_file(topic: str, path: Path, count: bool = True) -> None:
    """Report the line count of one text file (plus a file count of 1 if `count`)."""
    with path.open() as handle:
        line_total = sum(1 for _ in handle)
    output_measurement(topic, "lines", line_total)
    if count:
        output_measurement(topic, "files", 1)


def measure_lines(topic: str, *paths: Path, count: bool = True) -> None:
    """Report line counts for every regular file among `paths`; others are skipped."""
    for candidate in paths:
        if candidate.is_file():
            measure_lines_for_file(topic, candidate, count=count)
def measure_symbols_for_file(topic: str, path: Path, count: bool = True) -> None:
    """Report the number of exported (dynamic) symbols defined in a binary.

    Counts the lines `nm` prints for symbols that are both external and
    defined in the file itself; raises CalledProcessError if nm fails.
    """
    result = subprocess.run(
        ["nm", "--extern-only", "--defined-only", path],
        capture_output=True,
        encoding="utf-8",
        check=True,
    )
    symbols = len(result.stdout.splitlines())
    output_measurement(topic, "dynamic symbols", symbols)
    if count:
        output_measurement(topic, "files", 1)


def measure_symbols(topic: str, *paths: Path, count: bool = True) -> None:
    # Skip paths that do not exist or are not regular files.
    for path in paths:
        if path.is_file():
            measure_symbols_for_file(topic, path, count=count)
# Make sure not to measure things that depend on other tests or benchmarks (like
# the tests/compile binary size) since you can't rely on the order the tests or
# benchmarks are executed in.

# Misc
# count=False: these are single well-known files, so a "files" metric is noise.
measure_bytes("size/libleanshared.so", build_lean / "libleanshared.so", count=False)
measure_symbols("size/libleanshared.so", build_lean / "libleanshared.so", count=False)
measure_symbols("size/libLake_shared.so", build_lean / "libLake_shared.so", count=False)
measure_bytes("size/install", *install.rglob("*"))

# Stdlib
measure_lines("size/all/.c", *build_temp.rglob("*.c"))
measure_bytes("size/all/.ir", *build_lean.rglob("*.ir"))
measure_lines("size/all/.cpp", *src.rglob("*.cpp"))
measure_lines("size/all/.lean", *src.rglob("*.lean"))
measure_bytes("size/all/.ilean", *build_lean.rglob("*.ilean"))
measure_bytes("size/all/.olean", *build_lean.rglob("*.olean"))
measure_bytes("size/all/.olean.server", *build_lean.rglob("*.olean.server"))
measure_bytes("size/all/.olean.private", *build_lean.rglob("*.olean.private"))
# Init only (subset of the size/all/.olean* numbers above)
measure_bytes("size/Init/.olean", *build_lean.glob("Init/**/*.olean"))
measure_bytes("size/Init/.olean.server", *build_lean.glob("Init/**/*.olean.server"))
measure_bytes("size/Init/.olean.private", *build_lean.glob("Init/**/*.olean.private"))

7
tests/bench/size/run_bench Executable file
View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
source ../../env_bench.sh
source "$TEST_DIR/util.sh"

# Stage an install into a scratch DESTDIR, measure the resulting file sizes,
# then clean up the scratch directory.
make -C "$BUILD_DIR" install DESTDIR="$(realpath install)"
# Use python3 explicitly for consistency with the repo's other scripts;
# a bare `python` is not guaranteed to exist on all systems.
python3 measure_sizes.py "$SRC_DIR" "$BUILD_DIR" install measurements.jsonl
rm -rf install

View File

@@ -1,703 +0,0 @@
- attributes:
description: Init.Prelude async
tags: [other]
time: &time
# runner: time
# Alternative config: use `perf stat` to also collect extended properties.
runner: perf_stat
perf_stat:
properties:
[
"wall-clock",
"task-clock",
"instructions",
"branches",
"branch-misses",
]
rusage_properties: ["maxrss"]
run_config:
<<: *time
cwd: ../../src
cmd: lean Init/Prelude.lean
- attributes:
description: Init.Data.List.Sublist async
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean Init/Data/List/Sublist.lean
- attributes:
description: Std.Data.Internal.List.Associative
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean Std/Data/Internal/List/Associative.lean
- attributes:
description: Std.Data.DHashMap.Internal.RawLemmas
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean Std/Data/DHashMap/Internal/RawLemmas.lean
- attributes:
description: Init.Data.BitVec.Lemmas
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean Init/Data/BitVec/Lemmas.lean
- attributes:
description: Init.Data.List.Sublist re-elab -j4
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean --run ../script/benchReelabRss.lean lean Init/Data/List/Sublist.lean 10 -j4
max_runs: 2
parse_output: true
- attributes:
description: Init.Data.BitVec.Lemmas re-elab
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean --run ../script/benchReelabRss.lean lean Init/Data/BitVec/Lemmas.lean 3 -j4
max_runs: 2
parse_output: true
- attributes:
description: Init.Data.List.Sublist re-elab -j4 (watchdog rss)
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean --run ../script/benchReelabWatchdogRss.lean lean Init/Data/List/Sublist.lean 10 -j4
max_runs: 2
parse_output: true
# This benchmark uncovered the promise cycle in `realizeConst` (#11328)
- attributes:
description: Init.Data.List.Basic re-elab
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean --run ../script/benchReelabRss.lean lean Init/Data/List/Basic.lean 10 -j4
max_runs: 2
parse_output: true
- attributes:
description: import Lean
tags: [other]
run_config:
<<: *time
cwd: ../../src
cmd: lean Lean.lean
- attributes:
description: tests/compiler
tags: [other]
run_config:
cwd: ../compiler/
cmd: |
set -eu
for f in *.lean; do ../bench/compile.sh $f > /dev/null; done
printf 'sum binary sizes: '
for f in *.lean; do printf '%s\0' "$f.out"; done | wc -c --files0-from=- | tail -1 | cut -d' ' -f 1
max_runs: 1
runner: output
- attributes:
description: tests/bench/ interpreted
tags: [other]
run_config:
<<: *time
cmd: |
bash -c '
set -euxo pipefail
ulimit -s unlimited
for f in *.args; do
lean --run ${f%.args} $(cat $f)
done
'
max_runs: 2
- attributes:
description: binarytrees
tags: [other]
run_config:
<<: *time
cmd: ./binarytrees.lean.out 21
build_config:
cmd: ./compile.sh binarytrees.lean
- attributes:
description: binarytrees.st
tags: [other]
run_config:
<<: *time
cmd: ./binarytrees.st.lean.out 21
build_config:
cmd: ./compile.sh binarytrees.st.lean
- attributes:
description: const_fold
tags: [other]
run_config:
<<: *time
cmd: bash -c "ulimit -s unlimited && ./const_fold.lean.out 23"
build_config:
cmd: ./compile.sh const_fold.lean
- attributes:
description: deriv
tags: [other]
run_config:
<<: *time
cmd: ./deriv.lean.out 10
build_config:
cmd: ./compile.sh deriv.lean
- attributes:
description: lake build clean
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
cd inundation
lake -flakefile-clean.lean clean
lake -flakefile-clean.lean build
"
max_runs: 2
build_config:
cmd: |
bash -c "
set -ex
ulimit -s unlimited
cd inundation
cp lakefile.lean lakefile-clean.lean
lake -flakefile-clean.lean -Ktest=Clean run mkBuild
lake -flakefile-clean.lean build
"
- attributes:
description: lake build no-op
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation -flakefile-nop.lean build
"
build_config:
cmd: |
bash -c "
set -ex
ulimit -s unlimited
cd inundation
cp lakefile.lean lakefile-nop.lean
lake -flakefile-nop.lean -Ktest=Nop run mkBuild
lake -flakefile-nop.lean build
"
- attributes:
description: lake config elab
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation -flakefile-rc.lean -R run nop
"
build_config:
cmd: cp inundation/lakefile.lean inundation/lakefile-rc.lean
- attributes:
description: lake config import
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation run nop
"
build_config:
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation run nop
"
- attributes:
description: lake config tree
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation/test/tree run nop
"
build_config:
cmd: |
lake -dinundation run mkTree
lake -dinundation/test/tree update
- attributes:
description: lake env
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake -dinundation env true
"
build_config:
cmd: lake -dinundation env true
- attributes:
description: lake startup
tags: [other]
run_config:
<<: *time
cmd: |
bash -c "
set -ex
ulimit -s unlimited
lake self-check
"
- attributes:
description: language server startup
tags: [other]
build_config:
cmd: ./compile.sh server_startup.lean
run_config:
<<: *time
cmd: ./server_startup.lean.out
- attributes:
description: language server startup with ileans
tags: [other]
build_config:
cmd: ./compile.sh watchdogRss.lean
run_config:
<<: *time
cmd: ./watchdogRss.lean.out
- attributes:
description: ilean roundtrip
tags: [other]
run_config:
<<: *time
cmd: ./ilean_roundtrip.lean.out 200000
parse_output: true
build_config:
cmd: ./compile.sh ilean_roundtrip.lean
- attributes:
description: identifier auto-completion
tags: [other]
run_config:
<<: *time
cmd: lean -Dlinter.all=false --run identifier_completion_runner.lean
parse_output: true
- attributes:
description: liasolver
tags: [other]
run_config:
<<: *time
cmd: ./liasolver.lean.out ex-50-50-1.leq
build_config:
cmd: ./compile.sh liasolver.lean
- attributes:
description: parser
tags: [other]
run_config:
<<: *time
cmd: ./parser.lean.out ../../src/Init/Prelude.lean 50
build_config:
cmd: ./compile.sh parser.lean
- attributes:
description: qsort
tags: [other]
run_config:
<<: *time
cmd: ./qsort.lean.out 400
build_config:
cmd: ./compile.sh qsort.lean
- attributes:
description: rbmap
tags: [other]
run_config:
<<: *time
cmd: ./rbmap.lean.out 2000000
build_config:
cmd: ./compile.sh rbmap.lean
- attributes:
description: rbmap_1
tags: [other]
run_config:
<<: *time
cmd: ./rbmap_checkpoint.lean.out 2000000 1
build_config:
cmd: ./compile.sh rbmap_checkpoint.lean
- attributes:
description: rbmap_10
tags: [other]
run_config:
<<: *time
cmd: ./rbmap_checkpoint.lean.out 2000000 10
build_config:
cmd: ./compile.sh rbmap_checkpoint.lean
- attributes:
description: rbmap_fbip
tags: [other]
run_config:
<<: *time
cmd: ./rbmap_fbip.lean.out 2000000
build_config:
cmd: ./compile.sh rbmap_fbip.lean
- attributes:
description: rbmap_library
tags: [other]
run_config:
<<: *time
cmd: ./rbmap_library.lean.out 2000000
build_config:
cmd: ./compile.sh rbmap_library.lean
- attributes:
description: reduceMatch
tags: [other]
run_config:
<<: *time
cmd: lean reduceMatch.lean
- attributes:
description: simp_arith1
tags: [other]
run_config:
<<: *time
cmd: lean simp_arith1.lean
- attributes:
description: simp_bubblesort_256
tags: [other]
run_config:
<<: *time
cmd: lean simp_bubblesort_256.lean
- attributes:
description: simp_local
tags: [other]
run_config:
<<: *time
cmd: lean simp_local.lean
- attributes:
description: simp_subexpr
tags: [other]
run_config:
<<: *time
cmd: lean simp_subexpr.lean
- attributes:
description: simp_congr
tags: [other]
run_config:
<<: *time
cmd: lean --tstack=16384 simp_congr.lean
- attributes:
description: mut_rec_wf
tags: [other]
run_config:
<<: *time
cmd: lean mut_rec_wf.lean
- attributes:
description: big_match
tags: [other]
run_config:
<<: *time
cmd: lean big_match.lean
- attributes:
description: big_match_partial
tags: [other]
run_config:
<<: *time
cmd: lean big_match_partial.lean
- attributes:
description: big_match_nat
tags: [other]
run_config:
<<: *time
cmd: lean big_match_nat.lean
- attributes:
description: big_match_nat_split
tags: [other]
run_config:
<<: *time
cmd: lean big_match_nat_split.lean
- attributes:
description: big_beq
tags: [other]
run_config:
<<: *time
cmd: lean big_beq.lean
- attributes:
description: big_beq_rec
tags: [other]
run_config:
<<: *time
cmd: lean big_beq_rec.lean
- attributes:
description: big_deceq
tags: [other]
run_config:
<<: *time
cmd: lean big_deceq.lean
- attributes:
description: big_deceq_rec
tags: [other]
run_config:
<<: *time
cmd: lean big_deceq_rec.lean
- attributes:
description: nat_repr
tags: [other]
run_config:
<<: *time
cmd: ./nat_repr.lean.out 5000
build_config:
cmd: ./compile.sh nat_repr.lean
- attributes:
description: big_struct
tags: [other]
run_config:
<<: *time
cmd: lean big_struct.lean
- attributes:
description: big_struct_dep1
tags: [other]
run_config:
<<: *time
cmd: lean big_struct_dep1.lean
- attributes:
description: big_struct_dep
tags: [other]
run_config:
<<: *time
cmd: lean big_struct_dep.lean
- attributes:
description: unionfind
tags: [other]
run_config:
<<: *time
cmd: ./unionfind.lean.out 3000000
build_config:
cmd: ./compile.sh unionfind.lean
- attributes:
description: workspaceSymbols
tags: [other]
run_config:
<<: *time
cmd: lean workspaceSymbols.lean
max_runs: 2
- attributes:
description: charactersIn
tags: [other]
run_config:
<<: *time
cmd: lean charactersIn.lean
max_runs: 2
- attributes:
description: bv_decide_realworld
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_realworld.lean
- attributes:
description: bv_decide_mul
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_mul.lean
- attributes:
description: bv_decide_mod
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_mod.lean
max_runs: 2
- attributes:
description: bv_decide_inequality.lean
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_inequality.lean
discarded_runs: 1
max_runs: 2
- attributes:
description: bv_decide_large_aig.lean
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_large_aig.lean
- attributes:
description: bv_decide_rewriter.lean
tags: [other]
run_config:
<<: *time
cmd: lean bv_decide_rewriter.lean
- attributes:
description: big_do
tags: [other]
run_config:
<<: *time
cmd: lean big_do.lean
- attributes:
description: big_omega.lean
tags: [other]
run_config:
<<: *time
cmd: lean big_omega.lean
- attributes:
description: big_omega.lean MT
tags: [other]
run_config:
<<: *time
cmd: lean big_omega.lean -Dinternal.cmdlineSnapshots=false
- attributes:
description: omega_stress.lean async
tags: [other]
run_config:
<<: *time
cmd: lean omega_stress.lean
- attributes:
description: channel.lean
tags: [other]
run_config:
<<: *time
cmd: ./channel.lean.out
parse_output: true
build_config:
cmd: ./compile.sh channel.lean
- attributes:
description: riscv-ast.lean
tags: [other]
run_config:
<<: *time
cmd: lean riscv-ast.lean
max_runs: 2
- attributes:
description: iterators (compiled)
tags: [other]
run_config:
<<: *time
cmd: ./iterators.lean.out
build_config:
cmd: ./compile.sh iterators.lean
- attributes:
description: iterators (interpreted)
tags: [other]
run_config:
<<: *time
cmd: lean --run iterators.lean
- attributes:
description: iterators (elab)
tags: [other]
run_config:
<<: *time
cmd: lean iterators.lean
- attributes:
description: sigma iterator
tags: [other]
run_config:
<<: *time
cmd: ./sigmaIterator.lean.out
build_config:
cmd: ./compile.sh sigmaIterator.lean
- attributes:
description: workspaceSymbols with new ranges
tags: [other]
run_config:
<<: *time
cmd: ./workspaceSymbolsNewRanges.lean.out
build_config:
cmd: ./compile.sh workspaceSymbolsNewRanges.lean
- attributes:
description: hashmap.lean
tags: [other]
run_config:
<<: *time
cmd: ./hashmap.lean.out 11 10000
parse_output: true
build_config:
cmd: ./compile.sh hashmap.lean
- attributes:
description: treemap.lean
tags: [other]
run_config:
<<: *time
cmd: ./treemap.lean.out 11 10000
parse_output: true
build_config:
cmd: ./compile.sh treemap.lean
- attributes:
description: phashmap.lean
tags: [other]
run_config:
<<: *time
cmd: ./phashmap.lean.out 11 10000
parse_output: true
build_config:
cmd: ./compile.sh phashmap.lean
- attributes:
description: grind_bitvec2.lean
tags: [other]
run_config:
<<: *time
cmd: lean ../lean/run/grind_bitvec2.lean
- attributes:
description: grind_list2.lean
tags: [other]
run_config:
<<: *time
cmd: lean ../lean/run/grind_list2.lean
- attributes:
description: grind_ring_5.lean
tags: [other]
run_config:
<<: *time
cmd: lean ../lean/run/grind_ring_5.lean
- attributes:
description: leanchecker --fresh Init
tags: [other]
run_config:
<<: *time
cmd: leanchecker --fresh Init
max_runs: 1
- attributes:
description: cbv tactic (leroy compiler verification course)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/leroy.lean
- attributes:
description: cbv tactic (prime filter)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/divisors.lean
- attributes:
description: cbv tactic (removing duplicates from the list)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/dedup.lean
- attributes:
description: cbv tactic (evaluating Decidable.decide)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/decide.lean
- attributes:
description: cbv tactic (evaluating List.mergeSort)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/merge_sort.lean
- attributes:
description: cbv tactic (System F normalization)
tags: [other]
run_config:
<<: *time
cmd: lean ./cbv/system_f.lean

View File

@@ -1 +0,0 @@
70000

80
tests/combine.py Executable file
View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import argparse
import json
import sys
from pathlib import Path
from typing import Any
def add_measurement(
values: dict[str, float],
units: dict[str, str | None],
data: dict[str, Any],
) -> None:
metric = data["metric"]
values[metric] = values.get(metric, 0) + data["value"]
units[metric] = data.get("unit")
def format_measurement(
values: dict[str, float],
units: dict[str, str | None],
name: str,
) -> dict[str, Any]:
value = values[name]
unit = units.get(name)
data: dict[str, Any] = {"metric": name, "value": value}
if unit is not None:
data["unit"] = unit
return data
def main() -> None:
    """CLI entry point: merge one or more JSON Lines measurement files."""
    parser = argparse.ArgumentParser(
        description="Combine measurement files in the JSON Lines format, summing duplicated measurements like radar does.",
    )
    parser.add_argument(
        "input",
        nargs="*",
        default=[],
        help="input files to read measurements from. If none are specified, measurements are read from stdin.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file to write measurements to. If not specified, the result is printed to stdout.",
    )
    args = parser.parse_args()
    inputs: list[Path] = args.input
    output: Path | None = args.output

    values: dict[str, float] = {}
    units: dict[str, str | None] = {}

    # Read measurements
    if inputs:
        for input in inputs:
            with open(input, "r") as f:
                for line in f:
                    add_measurement(values, units, json.loads(line))
    else:
        for line in sys.stdin:
            add_measurement(values, units, json.loads(line))

    # Write measurements, sorted by metric name for stable output
    if output:
        with open(output, "w") as f:
            for metric in sorted(values):
                f.write(f"{json.dumps(format_measurement(values, units, metric))}\n")
    else:
        for metric in sorted(values):
            print(json.dumps(format_measurement(values, units, metric)))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
134

View File

@@ -0,0 +1 @@
134

Some files were not shown because too many files have changed in this diff Show More