llama-quantize : cleanup --help output (#19317)

* cleanup `llama-quantize --help` output (some much needed TLC)
* remove future argument (oops, spoiler)
* cleanup of cleanup
ci : remove server job from webui and move slow test (#19424)
2026-05-17 14:34:06 +00:00 · 2026-02-08 09:22:38 +02:00 · 2026-02-08 01:20:00 +01:00 · 2026-02-07 23:50:47 +01:00
4 changed files with 53 additions and 150 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -295,6 +295,7 @@ jobs:
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)

      - name: Build (no OpenMP)
@@ -307,6 +308,7 @@ jobs:
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF
+
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)

      - name: Test
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -8,10 +8,6 @@ on:
        description: 'Commit SHA1 to build'
        required: false
        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
  push:
    branches:
      - master
@@ -101,119 +97,3 @@ jobs:
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/server/webui
-
-  server-build:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      - name: Setup Node.js for WebUI
-        uses: actions/setup-node@v6
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Install WebUI dependencies
-        run: npm ci
-        working-directory: tools/server/webui
-
-      - name: Build WebUI
-        run: npm run build
-        working-directory: tools/server/webui
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
-        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -81,18 +81,14 @@ jobs:
            -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
            -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
            -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
-          cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip-install: -r tools/server/tests/requirements.txt

      - name: Tests
        id: server_integration_tests
@@ -102,6 +98,14 @@ jobs:
          export ${{ matrix.extra_args }}
          pytest -v -x -m "not slow"

+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd tools/server/tests
+          export ${{ matrix.extra_args }}
+          SLOW_TESTS=1 pytest -v -x
+
  server-windows:
    runs-on: windows-2022

@@ -124,11 +128,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
+          pip-install: -r tools/server/tests/requirements.txt

      - name: Tests
        id: server_integration_tests
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
+    printf("       [--prune-layers] [--keep-split] [--override-kv]\n");
    printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
-    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
-    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
-    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
-    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
-    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
-    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
-    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
-    printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
-    printf("      Advanced option to remove all tensors from the given layers\n");
-    printf("  --keep-split: will generate quantized model in the same shards as input\n");
+    printf("  --allow-requantize\n");
+    printf("                                      allow requantizing tensors that have already been quantized\n");
+    printf("                                      WARNING: this can severely reduce quality compared to quantizing\n");
+    printf("                                               from 16bit or 32bit!\n");
+    printf("  --leave-output-tensor\n");
+    printf("                                      leave output.weight un(re)quantized\n");
+    printf("                                      increases model size but may also increase quality, especially when requantizing\n");
+    printf("  --pure\n");
+    printf("                                      disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrix file_name\n");
+    printf("                                      use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name\n");
+    printf("                                      use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name\n");
+    printf("                                      do not use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type\n");
+    printf("                                      use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type\n");
+    printf("                                      use this ggml_type for the token embeddings tensor\n");
+    printf("  --tensor-type tensor_name=ggml_type\n");
+    printf("                                      quantize this tensor to this ggml_type\n");
+    printf("                                      this is an advanced option to selectively quantize tensors. may be specified multiple times.\n");
+    printf("                                      example: --tensor-type attn_q=q8_0\n");
+    printf("  --tensor-type-file tensor_types.txt\n");
+    printf("                                      list of tensors to quantize to a specific ggml_type\n");
+    printf("                                      this is an advanced option to selectively quantize a long list of tensors.\n");
+    printf("                                      the file should use the same format as above, separated by spaces or newlines.\n");
+    printf("  --prune-layers L0,L1,L2...\n");
+    printf("                                      comma-separated list of layer numbers to prune from the model\n");
+    printf("                                      WARNING: this is an advanced option, use with care.\n");
+    printf("  --keep-split\n");
+    printf("                                      generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
-    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
-    printf("\nAllowed quantization types:\n");
+    printf("                                      override model metadata by key in the quantized model. may be specified multiple times.\n");
+    printf("                                      WARNING: this is an advanced option, use with care.\n\n");
+    printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
+    printf("-----------------------------------------------------------------------------\n");
+    printf(" allowed quantization types\n");
+    printf("-----------------------------------------------------------------------------\n\n");
    for (const auto & it : QUANT_OPTIONS) {
        if (it.name != "COPY") {
            printf("  %2d  or  ", it.ftype);
Author	SHA1	Message	Date
ddh0	5999b50eb0	llama-quantize : cleanup `--help` output (#19317 ) * cleanup `llama-quantize --help` output some much needed TLC * remove future argument oops, spoiler * cleanup of cleanup	2026-02-08 09:22:38 +02:00
Sigbjørn Skjæret	9a5f57795c	ci : remove server job from webui and move slow test (#19424 ) * remove server job from webui and move slow test * use pip-install option	2026-02-08 01:20:00 +01:00
Georgi Gerganov	96441c955e	ci : use -j param correctly when building with sanitizers (#19411 ) * ci : use less jobs when building with sanitizers * cont : fix nproc * cont : fix the fix * cont : simplify	2026-02-07 23:50:47 +01:00