From 9b342d0a9f2f4892daec065491583ec2be129685 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Mar 2026 21:50:43 +0200 Subject: [PATCH] benches : add Nemotron 3 Nano on DGX Spark (#20652) [no ci] --- benches/nemotron/nemotron-dgx-spark.md | 51 ++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/benches/nemotron/nemotron-dgx-spark.md b/benches/nemotron/nemotron-dgx-spark.md index 2bce30a30e..420664194f 100644 --- a/benches/nemotron/nemotron-dgx-spark.md +++ b/benches/nemotron/nemotron-dgx-spark.md @@ -24,9 +24,9 @@ Fri Mar 6 11:39:45 2026 +-----------------------------------------+------------------------+----------------------+ ``` -## ggml-org/nemotron-3-super-120b-GGUF +## ggml-org/Nemotron-3-Super-120B-GGUF -Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF +Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF - `llama-batched-bench` @@ -53,7 +53,6 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 | | 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 | - - `llama-bench` | model | size | params | backend | n_ubatch | fa | test | t/s | @@ -70,3 +69,49 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 | build: 04a65daab (8268) + +## ggml-org/Nemotron-3-Nano-4B-GGUF + +Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF + +- `llama-batched-bench` + +main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.152 | 3371.61 | 0.597 | 53.64 | 0.748 | 726.90 | +| 512 | 32 | 2 | 1088 | 0.319 | 3208.68 | 0.857 | 74.66 | 1.176 | 924.89 | +| 512 | 32 | 4 | 2176 | 0.720 | 2843.56 | 1.323 | 96.78 | 2.043 | 1065.18 | +| 512 | 32 | 8 | 4352 | 1.428 | 2867.96 | 2.311 | 110.76 | 3.739 | 1163.82 | +| 512 | 32 | 16 | 8704 | 2.857 | 2866.94 | 4.203 | 121.82 | 7.060 | 1232.82 | +| 512 | 32 | 32 | 17408 | 5.709 | 2869.76 | 7.964 | 128.58 | 13.673 | 1273.14 | +| 4096 | 32 | 1 | 4128 | 1.458 | 2809.76 | 0.605 | 52.92 | 2.062 | 2001.52 | +| 4096 | 32 | 2 | 8256 | 2.905 | 2819.95 | 0.875 | 73.12 | 3.780 | 2183.95 | +| 4096 | 32 | 4 | 16512 | 5.790 | 2829.74 | 1.361 | 94.07 | 7.151 | 2309.17 | +| 4096 | 32 | 8 | 33024 | 11.598 | 2825.32 | 2.378 | 107.65 | 13.976 | 2362.89 | +| 4096 | 32 | 16 | 66048 | 23.208 | 2823.88 | 4.348 | 117.76 | 27.556 | 2396.89 | +| 4096 | 32 | 32 | 132096 | 46.515 | 2817.85 | 8.279 | 123.69 | 54.794 | 2410.79 | +| 8192 | 32 | 1 | 8224 | 2.950 | 2776.95 | 0.617 | 51.89 | 3.567 | 2305.75 | +| 8192 | 32 | 2 | 16448 | 5.921 | 2767.32 | 0.896 | 71.45 | 6.816 | 2413.05 | +| 8192 | 32 | 4 | 32896 | 11.842 | 2767.21 | 1.401 | 91.34 | 13.243 | 2484.03 | +| 8192 | 32 | 8 | 65792 | 23.726 | 2762.17 | 2.461 | 104.03 | 26.187 | 2512.38 | +| 8192 | 32 | 16 | 131584 | 47.777 | 2743.43 | 4.577 | 111.86 | 52.354 | 2513.36 | +| 8192 | 32 | 32 | 263168 | 96.691 | 2711.16 | 8.772 | 116.73 | 105.463 | 2495.36 | + +- `llama-bench` + +| model | size | params | backend | n_ubatch | fa | test | t/s | +| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 | 2761.90 ± 19.31 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 | 52.85 ± 0.12 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 2687.07 ± 21.84 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d4096 | 52.32 ± 0.23 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 2564.52 ± 57.69 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d8192 | 51.27 ± 0.34 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 2334.02 ± 37.83 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d16384 | 49.71 ± 0.14 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 2041.46 ± 40.45 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d32768 | 46.71 ± 0.13 | + +build: 1bbec6a75 (8382)