ggml-webgpu: Add the support of MUL_MAT_ID (#21147 )

* Add mul_mat_id support to WebGPU * Apply suggestion from @reeselevine --------- Co-authored-by: Reese Levine <reeselevine1@gmail.com>
ggml: add Q1_0 1-bit quantization support (CPU) (#21273 )
2026-05-03 23:54:19 +00:00 · 2026-04-06 13:08:46 -07:00 · 2026-04-06 20:55:21 +02:00
28 changed files with 1398 additions and 625 deletions
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -68,7 +68,7 @@ Legend:
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
+|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -428,7 +428,8 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_8_8 = 38,
        GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
        GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
-        GGML_TYPE_COUNT   = 41,
+        GGML_TYPE_Q1_0    = 41,
+        GGML_TYPE_COUNT   = 42,
    };

    // precision
@@ -465,6 +466,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
        GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
        GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q1_0    = 27, // except 1d tensors
    };

    // available tensor operations:
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -93,6 +93,10 @@ typedef sycl::half2 ggml_half2;
 // QR = QK / number of values before dequantization
 // QI = number of 32 bit integers before dequantization

+#define QI1_0 (QK1_0 / 32)
+#define QR1_0 1
+
+
 #define QI4_0 (QK4_0 / (4 * QR4_0))
 #define QR4_0 2

@@ -170,6 +174,13 @@ typedef sycl::half2 ggml_half2;
 #define GGML_EXTENSION __extension__
 #endif // _MSC_VER

+#define QK1_0 128
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t qs[QK1_0 / 8]; // bits / quants
+} block_q1_0;
+static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0 / 8, "wrong q1_0 block size/padding");
+
 #define QK4_0 32
 typedef struct {
    ggml_half d;           // delta
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -16,6 +16,7 @@
 #define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -82,6 +83,7 @@
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -112,6 +114,7 @@
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
@@ -160,6 +163,7 @@
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -200,6 +204,7 @@
 #elif defined(__riscv)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -240,6 +245,7 @@
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -303,6 +309,7 @@
 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
+#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@@ -137,6 +137,109 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in

 //===================================== Dot products =================================

+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK1_0;  // 128
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    float sumf = 0.0f;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv = vdupq_n_f32(0.0f);
+
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        // Process 4 Q8_0 blocks (each has 32 elements)
+        for (int k = 0; k < 4; k++) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+
+            // Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
+            // Bits are at offset k*4 bytes in x[i].qs
+            const uint8_t * bits = &x[i].qs[k * 4];
+
+            // Load 32 int8 values from y
+            const int8x16_t y0 = vld1q_s8(yb->qs);
+            const int8x16_t y1 = vld1q_s8(yb->qs + 16);
+
+            // Byte 0-1: bits for y0[0..15]
+            const uint64_t expand0 = table_b2b_0[bits[0]];
+            const uint64_t expand1 = table_b2b_0[bits[1]];
+            // Byte 2-3: bits for y1[0..15]
+            const uint64_t expand2 = table_b2b_0[bits[2]];
+            const uint64_t expand3 = table_b2b_0[bits[3]];
+
+            // Build the sign vectors by reinterpreting the table values
+            uint8x8_t e0 = vcreate_u8(expand0);
+            uint8x8_t e1 = vcreate_u8(expand1);
+            uint8x8_t e2 = vcreate_u8(expand2);
+            uint8x8_t e3 = vcreate_u8(expand3);
+
+            // Shift right by 4 to get 0 or 1
+            int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
+            int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
+            int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
+            int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
+
+            // Convert 0/1 to -1/+1: sign = 2*val - 1
+            int8x8_t one = vdup_n_s8(1);
+            s0 = vsub_s8(vadd_s8(s0, s0), one);  // 2*s0 - 1
+            s1 = vsub_s8(vadd_s8(s1, s1), one);
+            s2 = vsub_s8(vadd_s8(s2, s2), one);
+            s3 = vsub_s8(vadd_s8(s3, s3), one);
+
+            // Combine into 16-element vectors
+            int8x16_t signs0 = vcombine_s8(s0, s1);
+            int8x16_t signs1 = vcombine_s8(s2, s3);
+
+            // Multiply signs with y values and accumulate
+            // dot(signs, y) where signs are +1/-1
+            int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
+            int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
+
+            // Scale by d1 and accumulate
+            sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
+        }
+    }
+
+    sumf = vaddvq_f32(sumv);
+#else
+    // Scalar fallback
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+
+        // Process 4 Q8_0 blocks
+        for (int k = 0; k < 4; k++) {
+            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
+
+            int sumi = 0;
+            for (int j = 0; j < QK8_0; j++) {
+                const int bit_index = k * QK8_0 + j;
+                const int byte_index = bit_index / 8;
+                const int bit_offset = bit_index % 8;
+
+                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
+                sumi += xi * y[i*4 + k].qs[j];
+            }
+            sumf += d0 * d1 * sumi;
+        }
+    }
+#endif
+
+    *s = sumf;
+}
+
+
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
--- a/ggml/src/ggml-cpu/arch/loongarch/quants.c
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -2156,4 +2156,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
-
--- a/ggml/src/ggml-cpu/arch/powerpc/quants.c
+++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c
@@ -2302,4 +2302,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
-
--- a/ggml/src/ggml-cpu/arch/s390/quants.c
+++ b/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -1463,4 +1463,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
-
--- a/ggml/src/ggml-cpu/arch/wasm/quants.c
+++ b/ggml/src/ggml-cpu/arch/wasm/quants.c
@@ -1218,4 +1218,3 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
-
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -217,6 +217,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_F16,
        .nrows                    = 1,
    },
+    [GGML_TYPE_Q1_0] = {
+        .from_float               = quantize_row_q1_0,
+        .vec_dot                  = ggml_vec_dot_q1_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
    [GGML_TYPE_Q4_0] = {
        .from_float               = quantize_row_q4_0,
        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -4829,6 +4829,7 @@ void ggml_compute_forward_get_rows(
    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
@@ -5554,6 +5555,7 @@ void ggml_compute_forward_clamp(
                ggml_compute_forward_clamp_f16(params, dst);
            } break;
        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q1_0:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -22,6 +22,10 @@

 #define UNUSED GGML_UNUSED

+void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q1_0_ref(x, y, k);
+}
+
 void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q4_0_ref(x, y, k);
 }
@@ -116,6 +120,51 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI

 //===================================== Dot products =================================

+void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+
+        float sumi = 0.0f;
+
+        for (int k = 0; k < 4; k++) {
+            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
+
+            int sumi_block = 0;
+
+            for (int j = 0; j < QK8_0; j++) {
+                const int bit_index = k * QK8_0 + j;
+                const int byte_index = bit_index / 8;
+                const int bit_offset = bit_index % 8;
+
+                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
+                sumi_block += xi * y[i*4 + k].qs[j];
+            }
+
+            sumi += d1 * sumi_block;
+        }
+
+        sumf += d0 * sumi;
+    }
+
+    *s = sumf;
+}
+
+
 void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
--- a/ggml/src/ggml-cpu/quants.h
+++ b/ggml/src/ggml-cpu/quants.h
@@ -12,6 +12,7 @@ extern "C" {
 #endif

 // Quantization
+void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -36,6 +37,7 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

 // Dot product
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -68,6 +70,7 @@ void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const
 void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -32,6 +32,41 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
 }

+// reference implementation for deterministic creation of model files
+void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k) {
+    static const int qk = QK1_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float sum_abs = 0.0f;
+        for (int j = 0; j < qk; j++) {
+            sum_abs += fabsf(x[i*qk + j]);
+        }
+        const float d = sum_abs / qk;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        // Clear all bits first
+        for (int j = 0; j < qk / 8; ++j) {
+            y[i].qs[j] = 0;
+        }
+
+        // Just store sign of each weight directly (no normalization)
+        for (int j = 0; j < qk; ++j) {
+            const int bit_index = j;
+            const int byte_index = bit_index / 8;
+            const int bit_offset = bit_index % 8;
+
+            if (x[i*qk + j] >= 0.0f) {
+                y[i].qs[byte_index] |= (1 << bit_offset);
+            }
+        }
+    }
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
    static const int qk = QK4_0;
@@ -339,6 +374,26 @@ void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RE
    }
 }

+void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    static const int qk = QK1_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float neg_d = -d;
+
+        for (int j = 0; j < qk; ++j) {
+            const int byte_index = j / 8;
+            const int bit_offset = j % 8;
+            const uint8_t bit = (x[i].qs[byte_index] >> bit_offset) & 1;
+            y[i*qk + j] = bit ? d : neg_d;
+        }
+    }
+}
+
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    static const int qk = QK4_0;

@@ -1978,6 +2033,22 @@ static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * G
    }
 }

+size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q1_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q1_0, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q1_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q1_0_ref(src, (block_q1_0*)qrow, n_per_row);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+
 size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    if (!quant_weights) {
        quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -5286,6 +5357,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
                    }
                }
            } break;
+        case GGML_TYPE_Q1_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q1_0, data, nb);
+            } break;
        case GGML_TYPE_Q4_0:
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -14,6 +14,7 @@ extern "C" {
 // NOTE: these functions are defined as GGML_API because they used by the CPU backend

 // Quantization
+GGML_API void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
@@ -41,6 +42,7 @@ GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_
 GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);

 // Dequantization
+GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -90,6 +92,7 @@ GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTR
 GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -658,6 +658,26 @@ struct ggml_webgpu_mul_mat_shader_decisions {
    uint32_t mul_mat_wg_size;
 };

+/** MUL_MAT_ID **/
+
+struct ggml_webgpu_mul_mat_id_pipeline_key {
+    ggml_type src0_type;
+    ggml_type src1_type;
+
+    bool operator==(const ggml_webgpu_mul_mat_id_pipeline_key & other) const {
+        return src0_type == other.src0_type && src1_type == other.src1_type;
+    }
+};
+
+struct ggml_webgpu_mul_mat_id_pipeline_key_hash {
+    size_t operator()(const ggml_webgpu_mul_mat_id_pipeline_key & key) const {
+        size_t seed = 0;
+        ggml_webgpu_hash_combine(seed, key.src0_type);
+        ggml_webgpu_hash_combine(seed, key.src1_type);
+        return seed;
+    }
+};
+
 /** Cpy **/

 struct ggml_webgpu_cpy_pipeline_key {
@@ -797,7 +817,10 @@ class ggml_webgpu_shader_lib {
    std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
        mul_mat_vec_pipelines;     // fast mat-vec (n==1)
    std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
-        mul_mat_fast_pipelines;    // fast mat-mat (reg-tile or subgroup)
+                                             mul_mat_fast_pipelines;       // fast mat-mat (reg-tile or subgroup)
+    std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines;  // key is fixed
+    std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
+        mul_mat_id_pipelines;                                              // src0_type/src1_type

    std::unordered_map<ggml_webgpu_set_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_set_rows_pipeline_key_hash>
        set_rows_pipelines;
@@ -1598,6 +1621,115 @@ class ggml_webgpu_shader_lib {
        return mul_mat_legacy_pipelines[key];
    }

+    webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        auto it = mul_mat_id_gather_pipelines.find(1);
+        if (it != mul_mat_id_gather_pipelines.end()) {
+            return it->second;
+        }
+        std::vector<std::string> defines;
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
+
+        auto processed     = preprocessor.preprocess(wgsl_mul_mat_id_gather, defines);
+        auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+        decisions->wg_size = context.max_wg_size;
+
+        webgpu_pipeline pipeline       = ggml_webgpu_create_pipeline(device, processed, "mul_mat_id_gather");
+        pipeline.context               = decisions;
+        mul_mat_id_gather_pipelines[1] = pipeline;
+        return pipeline;
+    }
+
+    webgpu_pipeline get_mul_mat_id_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        ggml_webgpu_mul_mat_id_pipeline_key key = {
+            .src0_type = context.src0->type,
+            .src1_type = context.src1->type,
+        };
+
+        auto it = mul_mat_id_pipelines.find(key);
+        if (it != mul_mat_id_pipelines.end()) {
+            return it->second;
+        }
+
+        std::vector<std::string> defines;
+        std::string              variant = "mul_mat_id";
+        defines.push_back("MUL_MAT_ID");
+
+        // src1 type
+        switch (context.src1->type) {
+            case GGML_TYPE_F32:
+                defines.push_back("SRC1_INNER_TYPE=f32");
+                break;
+            case GGML_TYPE_F16:
+                defines.push_back("SRC1_INNER_TYPE=f16");
+                break;
+            default:
+                GGML_ABORT("Unsupported src1 type for mul_mat fast shader");
+        }
+
+        // src0 type
+        const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
+        const char *                    src0_name   = src0_traits->type_name;
+
+        switch (context.src0->type) {
+            case GGML_TYPE_F32:
+                defines.push_back("SRC0_INNER_TYPE=f32");
+                defines.push_back("FLOAT");
+                defines.push_back("INIT_SRC0_SHMEM_FLOAT");
+                defines.push_back("INIT_SRC1_SHMEM_FLOAT");
+                variant += "_f32";
+                break;
+            case GGML_TYPE_F16:
+                defines.push_back("SRC0_INNER_TYPE=f16");
+                defines.push_back("FLOAT");
+                defines.push_back("INIT_SRC0_SHMEM_FLOAT");
+                defines.push_back("INIT_SRC1_SHMEM_FLOAT");
+                variant += "_f16";
+                break;
+            default:
+                {
+                    std::string type_upper = src0_name;
+                    std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
+
+                    defines.push_back("BYTE_HELPERS");
+                    defines.push_back("INIT_SRC0_SHMEM_" + type_upper);
+                    defines.push_back("INIT_SRC1_SHMEM_FLOAT");
+                    defines.push_back("U32_DEQUANT_HELPERS");
+                    defines.push_back("SRC0_INNER_TYPE=u32");
+
+                    variant += std::string("_") + src0_name;
+                    break;
+                }
+        }
+
+        defines.push_back("SCALAR");
+
+        // Tiles
+        defines.push_back("TILE_M=" + std::to_string(WEBGPU_MUL_MAT_TILE_M) + "u");
+        defines.push_back("TILE_N=" + std::to_string(WEBGPU_MUL_MAT_TILE_N) + "u");
+        defines.push_back("TILE_K=" + std::to_string(WEBGPU_MUL_MAT_TILE_K) + "u");
+
+        defines.push_back("WORKGROUP_SIZE_M=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_M) + "u");
+        defines.push_back("WORKGROUP_SIZE_N=" + std::to_string(WEBGPU_MUL_MAT_WG_SIZE_N) + "u");
+
+        // variant suffix for src1 type
+        variant += std::string("_") + (context.src1->type == GGML_TYPE_F32 ? "f32" : "f16");
+
+        auto processed = preprocessor.preprocess(wgsl_mul_mat_id, defines);
+
+        auto decisions       = std::make_shared<ggml_webgpu_mul_mat_shader_decisions>();
+        decisions->tile_k    = WEBGPU_MUL_MAT_TILE_K;
+        decisions->tile_m    = WEBGPU_MUL_MAT_TILE_M;
+        decisions->tile_n    = WEBGPU_MUL_MAT_TILE_N;
+        decisions->wg_size_m = WEBGPU_MUL_MAT_WG_SIZE_M;
+        decisions->wg_size_n = WEBGPU_MUL_MAT_WG_SIZE_N;
+        decisions->wg_size   = WEBGPU_MUL_MAT_WG_SIZE_M * WEBGPU_MUL_MAT_WG_SIZE_N;
+
+        webgpu_pipeline pipeline  = ggml_webgpu_create_pipeline(device, processed, variant);
+        pipeline.context          = decisions;
+        mul_mat_id_pipelines[key] = pipeline;
+        return mul_mat_id_pipelines[key];
+    }
+
    webgpu_pipeline get_unary_pipeline(const ggml_webgpu_shader_lib_context & context) {
        const bool                     is_unary = context.dst->op == GGML_OP_UNARY;
        const int                      op       = is_unary ? (int) ggml_get_unary_op(context.dst) : context.dst->op;
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1376,6 +1376,163 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context &       ctx,
    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y);
 }

+static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context &       ctx,
+                                                wgpu::CommandEncoder & encoder,
+                                                ggml_tensor *          src0,
+                                                ggml_tensor *          src1,
+                                                ggml_tensor *          src2,
+                                                ggml_tensor *          dst) {
+    ggml_webgpu_shader_lib_context shader_lib_ctx = {
+        .src0        = src0,
+        .src1        = src1,
+        .src2        = src2,
+        .dst         = dst,
+        .max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup,
+    };
+
+    // Get or create pipeline
+    webgpu_pipeline gather_pipeline, main_pipeline;
+
+    std::vector<webgpu_pipeline>                   pipelines;
+    std::vector<std::vector<uint32_t>>             params_list;
+    std::vector<std::vector<wgpu::BindGroupEntry>> entries_list;
+    std::vector<std::pair<uint32_t, uint32_t>>     workgroups_list;
+
+    gather_pipeline = ctx->shader_lib->get_mul_mat_id_gather_pipeline(shader_lib_ctx);
+    main_pipeline   = ctx->shader_lib->get_mul_mat_id_pipeline(shader_lib_ctx);
+
+    const uint32_t param_n_expert      = (uint32_t) src0->ne[2];
+    const uint32_t param_n_expert_used = (uint32_t) dst->ne[1];
+    const uint32_t param_n_tokens      = (uint32_t) dst->ne[2];
+
+    // params for mul_mat_id_gather.wgsl
+    std::vector<uint32_t> gather_params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)),
+        param_n_expert,
+        param_n_expert_used,
+        param_n_tokens,
+        (uint32_t) (src2->nb[1] / ggml_type_size(src2->type)),
+    };
+
+    const size_t dst_offset          = ggml_webgpu_tensor_offset(dst);
+    const size_t gathered_buf_nbytes = src0->ne[2] * src1->ne[2] * sizeof(uint32_t);
+
+    const size_t gathered_expert_used_align_offset = ROUNDUP_POW2(
+        dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
+    const size_t gathered_tokens_align_offset =
+        ROUNDUP_POW2(gathered_expert_used_align_offset + gathered_buf_nbytes,
+                     ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
+    const size_t gathered_count_ids_align_offset =
+        ROUNDUP_POW2(gathered_tokens_align_offset + gathered_buf_nbytes,
+                     ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
+
+    const size_t gathered_binding_size = ROUNDUP_POW2(gathered_buf_nbytes, WEBGPU_STORAGE_BUF_BINDING_MULT);
+    const size_t gathered_count_ids_binding_size =
+        ROUNDUP_POW2(src0->ne[2] * sizeof(uint32_t), WEBGPU_STORAGE_BUF_BINDING_MULT);
+
+    // bind group entries for mul_mat_id_gather.wgsl
+    std::vector<wgpu::BindGroupEntry> gather_entries = {
+        { .binding = 0,
+         .buffer  = ggml_webgpu_tensor_buf(src2),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src2),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src2) },
+        { .binding = 1,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_expert_used_align_offset,
+         .size    = gathered_binding_size },
+        { .binding = 2,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_tokens_align_offset,
+         .size    = gathered_binding_size },
+        { .binding = 3,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_count_ids_align_offset,
+         .size    = gathered_count_ids_binding_size },
+    };
+
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+
+    const uint32_t gather_total_wg = param_n_expert;
+    const uint32_t gather_wg_x     = std::min(gather_total_wg, max_wg_per_dim);
+    const uint32_t gather_wg_y     = CEIL_DIV(gather_total_wg, gather_wg_x);
+
+    pipelines.push_back(gather_pipeline);
+    params_list.push_back(std::move(gather_params));
+    entries_list.push_back(std::move(gather_entries));
+    workgroups_list.push_back({ gather_wg_x, gather_wg_y });
+
+    // params for mul_mat_id.wgsl
+    std::vector<uint32_t> main_params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        (uint32_t) src0->ne[0],
+        (uint32_t) src0->ne[1],
+        param_n_expert,
+        param_n_expert_used,
+        param_n_tokens,
+        (uint32_t) src1->ne[1],
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
+        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
+    };
+
+    // bind group entries for mul_mat_id.wgsl
+    std::vector<wgpu::BindGroupEntry> main_entries = {
+        { .binding = 0,
+         .buffer  = ggml_webgpu_tensor_buf(src0),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src0),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src0) },
+        { .binding = 1,
+         .buffer  = ggml_webgpu_tensor_buf(src1),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) },
+        { .binding = 2,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, dst) },
+        { .binding = 3,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_expert_used_align_offset,
+         .size    = gathered_binding_size },
+        { .binding = 4,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_tokens_align_offset,
+         .size    = gathered_binding_size },
+        { .binding = 5,
+         .buffer  = ggml_webgpu_tensor_buf(dst),
+         .offset  = gathered_count_ids_align_offset,
+         .size    = gathered_count_ids_binding_size },
+    };
+
+    // Calculate workgroup dimensions
+    uint32_t wg_x = 1;
+    uint32_t wg_y = 1;
+
+    auto * main_decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(main_pipeline.context.get());
+
+    uint32_t wg_m;
+
+    uint32_t tile_m_s           = main_decisions->tile_m * main_decisions->wg_size_m;
+    uint32_t tile_n_s           = main_decisions->tile_n * main_decisions->wg_size_n;
+    wg_m                        = CEIL_DIV(dst->ne[0], tile_m_s);
+    uint32_t total_gathered     = dst->ne[1] * dst->ne[2];
+    uint32_t max_active_experts = std::min((uint32_t) src0->ne[2], total_gathered);
+    uint32_t max_wg_n           = CEIL_DIV(total_gathered, tile_n_s) + max_active_experts;
+    uint32_t total_wg           = wg_m * max_wg_n;
+
+    compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
+
+    pipelines.push_back(main_pipeline);
+    params_list.push_back(std::move(main_params));
+    entries_list.push_back(std::move(main_entries));
+    workgroups_list.push_back({ wg_x, wg_y });
+
+    return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list,
+                                           entries_list, workgroups_list);
+}
+
 #ifndef __EMSCRIPTEN__
 static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context &       ctx,
                                                wgpu::CommandEncoder & encoder,
@@ -2638,6 +2795,8 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context
            return ggml_webgpu_get_rows(ctx, encoder, src0, src1, node);
        case GGML_OP_MUL_MAT:
            return ggml_webgpu_mul_mat(ctx, encoder, src0, src1, node);
+        case GGML_OP_MUL_MAT_ID:
+            return ggml_webgpu_mul_mat_id(ctx, encoder, src0, src1, src2, node);
        case GGML_OP_FLASH_ATTN_EXT:
 #ifndef __EMSCRIPTEN__
            return ggml_webgpu_flash_attn(ctx, encoder, src0, src1, src2, node->src[3], node->src[4], node);
@@ -3082,6 +3241,20 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                }
            }
            break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                const ggml_tensor * src0 = tensor->src[0];
+                const ggml_tensor * src1 = tensor->src[1];
+                if (src0 && src1) {
+                    const size_t gathered_size = sizeof(uint32_t) * tensor->src[0]->ne[2] * tensor->src[1]->ne[2];
+                    const size_t gathered_count_ids_size = sizeof(uint32_t) * tensor->src[0]->ne[2];
+                    res                                  = ROUNDUP_POW2(
+                        res + gathered_size * 2 + gathered_count_ids_size +
+                            ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment * 3,
+                        WEBGPU_STORAGE_BUF_BINDING_MULT);
+                }
+            }
+            break;
        default:
            break;
    }
@@ -3503,6 +3676,35 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                }
                break;
            }
+        case GGML_OP_MUL_MAT_ID:
+            switch (src1->type) {
+                case GGML_TYPE_F16:
+                    supports_op |= (src0->type == GGML_TYPE_F16);
+                    break;
+                case GGML_TYPE_F32:
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:
+                        case GGML_TYPE_F16:
+                        case GGML_TYPE_Q4_0:
+                        case GGML_TYPE_Q4_1:
+                        case GGML_TYPE_Q5_0:
+                        case GGML_TYPE_Q5_1:
+                        case GGML_TYPE_Q8_0:
+                        case GGML_TYPE_Q2_K:
+                        case GGML_TYPE_Q3_K:
+                        case GGML_TYPE_Q4_K:
+                        case GGML_TYPE_Q5_K:
+                        case GGML_TYPE_Q6_K:
+                            supports_op = true;
+                            break;
+                        default:
+                            break;
+                    }
+                    break;
+                default:
+                    break;
+            }
+            break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
 #ifndef __EMSCRIPTEN__
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -42,6 +42,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
 }
 #endif // INIT_SRC0_SHMEM_FLOAT

+#ifndef MUL_MAT_ID
 #ifdef INIT_SRC1_SHMEM_FLOAT
 fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u32) {
    for (var elem_idx = thread_id * VEC_SIZE; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * VEC_SIZE) {
@@ -58,6 +59,7 @@ fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u3
    }
 }
 #endif // INIT_SRC1_SHMEM_FLOAT
+#endif

 #ifdef INIT_SRC0_SHMEM_Q4_0
 const BLOCK_SIZE = 32u;
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl
@@ -0,0 +1,193 @@
+enable f16;
+
+#include "common_decls.tmpl"
+#include "mul_mat_decls.tmpl"
+
+#ifdef VEC
+fn store_val(acc: array<array<f16, TILE_M>, TILE_N>, tn: u32, tm: u32) -> vec4<f32> {
+    return vec4<f32>(f32(acc[tn][tm]), f32(acc[tn][tm + 1]), f32(acc[tn][tm + 2]), f32(acc[tn][tm + 3]));
+}
+#endif
+
+#ifdef SCALAR
+fn store_val(acc: array<array<f16, TILE_M>, TILE_N>, tn: u32, tm: u32) -> f32 {
+    return f32(acc[tn][tm]);
+}
+#endif
+
+struct MulMatIdParams {
+    offset_src0: u32,
+    offset_src1: u32,
+    offset_dst: u32,
+
+    k: u32,
+    m: u32,
+    n_expert: u32,
+    n_expert_used: u32,
+    n_tokens: u32,
+    b_ne1: u32,
+
+    stride_01: u32,
+    stride_11: u32,
+    stride_02: u32,
+    stride_12: u32,
+};
+
+@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>; // [cols, rows, n_expert]
+@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>; // [cols, b_ne1, n_tokens]
+@group(0) @binding(2) var<storage, read_write> dst: array<DST_TYPE>;   // [rows, n_expert_used, n_tokens]
+@group(0) @binding(3) var<storage, read_write> global_gathered_expert_used: array<u32>; // [n_expert][n_tokens]
+@group(0) @binding(4) var<storage, read_write> global_gathered_tokens: array<u32>; // [n_expert][n_tokens]
+@group(0) @binding(5) var<storage, read_write> gathered_count_ids: array<u32>; // [n_expert]
+
+@group(0) @binding(6) var<uniform> params: MulMatIdParams;
+
+fn get_local_n(thread_id: u32) -> u32 {
+    return thread_id / WORKGROUP_SIZE_M;
+}
+fn get_local_m(thread_id: u32) -> u32 {
+    return thread_id % WORKGROUP_SIZE_M;
+}
+
+const TOTAL_WORKGROUP_SIZE = WORKGROUP_SIZE_M * WORKGROUP_SIZE_N;
+const TILE_SRC0_SHMEM = TILE_K * WORKGROUP_SIZE_M * TILE_M;
+const TILE_SRC1_SHMEM = TILE_K * WORKGROUP_SIZE_N * TILE_N;
+
+var<workgroup> shmem: array<f16, TILE_SRC0_SHMEM + TILE_SRC1_SHMEM>;
+var<workgroup> gathered_expert_used: array<u32, TILE_N * WORKGROUP_SIZE_N>;
+var<workgroup> gathered_tokens: array<u32, TILE_N * WORKGROUP_SIZE_N>;
+
+#ifdef INIT_SRC1_SHMEM_FLOAT
+fn init_shmem_id_src1(thread_id: u32, offset_src1: u32, rest_token_n: u32, k_outer: u32) {
+    for (var elem_idx = thread_id * VEC_SIZE; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * VEC_SIZE) {
+        let tile_n = elem_idx / TILE_K;
+        let tile_k = elem_idx % TILE_K;
+        if (tile_n < rest_token_n) {
+            let global_src10 = k_outer + tile_k;
+            let expert_used_idx = gathered_expert_used[tile_n] % params.b_ne1;
+            let token_idx = gathered_tokens[tile_n];
+            let src1_idx = offset_src1 + token_idx * params.stride_12 + expert_used_idx * params.stride_11 + global_src10;
+            let src1_val = select(
+                SRC1_TYPE(0.0),
+                src1[src1_idx/VEC_SIZE],
+                global_src10 < params.k);
+            store_shmem(SHMEM_TYPE(src1_val), TILE_SRC0_SHMEM + elem_idx);
+        } else {
+            store_shmem(SHMEM_TYPE(0.0), TILE_SRC0_SHMEM + elem_idx);
+        }
+    }
+}
+#endif // INIT_SRC1_SHMEM_FLOAT
+
+@compute @workgroup_size(TOTAL_WORKGROUP_SIZE)
+fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
+        @builtin(local_invocation_id) local_id: vec3<u32>,
+        @builtin(num_workgroups) num_wg: vec3<u32>) {
+
+    let thread_id = local_id.x;
+    let local_m = get_local_m(thread_id);
+    let local_n = get_local_n(thread_id);
+
+    var expert_idx:u32 = 0xFFFFFFFFu;
+    var wg_in_batch:u32 = 0;
+    var wg_sum:u32 = 0;
+    let wg_m_count = (params.m + WORKGROUP_SIZE_M * TILE_M - 1u) / (WORKGROUP_SIZE_M * TILE_M);
+    let wg_linear = wg_id.y * num_wg.x + wg_id.x;
+
+    for (var i = 0u;i < params.n_expert;i += 1) {
+        let wg_n_count = (gathered_count_ids[i] + WORKGROUP_SIZE_N * TILE_N - 1u) / (WORKGROUP_SIZE_N * TILE_N);
+        let wg_per_matrix = wg_m_count * wg_n_count;
+        if (wg_sum <= wg_linear && wg_linear < wg_sum + wg_per_matrix) {
+            expert_idx = i;
+            wg_in_batch = wg_linear - wg_sum;
+            break;
+        }
+        wg_sum += wg_per_matrix;
+    }
+
+    let is_valid = expert_idx != 0xFFFFFFFFu;
+
+    var wg_m: u32 = 0;
+    var wg_n: u32 = 0;
+    var offset_wg_m: u32 = 0;
+    var offset_wg_n: u32 = 0;
+    var rest_token_n: u32 = 0;
+    var src0_batch_offset: u32 = 0;
+
+    wg_m = wg_in_batch % wg_m_count;
+    wg_n = wg_in_batch / wg_m_count;
+
+    offset_wg_m = wg_m * WORKGROUP_SIZE_M * TILE_M;
+    offset_wg_n = wg_n * WORKGROUP_SIZE_N * TILE_N;
+
+    if (is_valid) {
+        rest_token_n = gathered_count_ids[expert_idx] - offset_wg_n;
+        let global_gathered_base = expert_idx * params.n_tokens + offset_wg_n;
+        for (var i = thread_id; i < TILE_N * WORKGROUP_SIZE_N && offset_wg_n + i < gathered_count_ids[expert_idx]; i += TOTAL_WORKGROUP_SIZE) {
+            gathered_expert_used[i] = global_gathered_expert_used[global_gathered_base + i];
+            gathered_tokens[i] = global_gathered_tokens[global_gathered_base + i];
+        }
+        src0_batch_offset = params.offset_src0 + expert_idx * params.stride_02;
+    }
+
+    workgroupBarrier();
+
+    let output_row_base = offset_wg_m + local_m * TILE_M;
+    let output_col_base = offset_wg_n + local_n * TILE_N;
+
+    let dst2_stride = params.m * params.n_expert_used;
+    let dst1_stride = params.m;
+
+    var acc: array<array<f16, TILE_M>, TILE_N>;
+
+    for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {
+
+        if (is_valid) {
+            init_shmem_src0(thread_id, src0_batch_offset, offset_wg_m, k_outer);
+            init_shmem_id_src1(thread_id, params.offset_src1, rest_token_n, k_outer);
+        }
+
+        workgroupBarrier();
+
+        if (is_valid) {
+            let k_end = min(TILE_K, params.k - k_outer);
+
+            for (var k_inner = 0u; k_inner < k_end; k_inner++) {
+                var src0_tile: array<f16, TILE_M>;
+                for (var tm = 0u; tm < TILE_M; tm++) {
+                    let src0_m = local_m * TILE_M + tm;
+                    let src0_idx = k_inner + src0_m * TILE_K;
+                    src0_tile[tm] = shmem[src0_idx];
+                }
+                for (var tn = 0u; tn < TILE_N; tn++) {
+                    let src1_n = local_n * TILE_N + tn;
+                    let src1_idx = src1_n * TILE_K + k_inner;
+                    let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx];
+                    for (var tm = 0u; tm < TILE_M; tm++) {
+                        acc[tn][tm] += src0_tile[tm] * src1_val;
+                    }
+                }
+            }
+        }
+
+        workgroupBarrier();
+    }
+
+    if (is_valid) {
+        for (var tn = 0u; tn < TILE_N; tn++) {
+            let n_idx = output_col_base + tn;
+            if (n_idx < gathered_count_ids[expert_idx]) {
+                let dst1_idx = gathered_expert_used[n_idx - offset_wg_n];
+                let dst2_idx = gathered_tokens[n_idx - offset_wg_n];
+                let dst12_offset = params.offset_dst + dst2_idx * dst2_stride + dst1_idx * dst1_stride;
+                for (var tm = 0u; tm < TILE_M; tm += VEC_SIZE) {
+                    let global_row = output_row_base + tm;
+                    if (global_row < params.m) {
+                        let dst_idx = dst12_offset + global_row;
+                        dst[dst_idx/VEC_SIZE] = store_val(acc, tn, tm);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl
@@ -0,0 +1,55 @@
+enable f16;
+
+struct MulMatIdGatherParams {
+    offset_ids: u32,
+
+    n_expert: u32,
+    n_expert_used: u32,
+    n_tokens: u32,
+
+    stride_ids_1: u32,
+};
+
+@group(0) @binding(0) var<storage, read_write> ids: array<i32>;        // [n_expert_used, n_tokens]
+@group(0) @binding(1) var<storage, read_write> global_gathered_expert_used: array<u32>; // [n_expert][n_tokens]
+@group(0) @binding(2) var<storage, read_write> global_gathered_tokens: array<u32>; // [n_expert][n_tokens]
+@group(0) @binding(3) var<storage, read_write> gathered_count_ids: array<u32>; // [n_expert]
+
+@group(0) @binding(4) var<uniform> params: MulMatIdGatherParams;
+
+var<workgroup> count:atomic<u32>;
+
+@compute @workgroup_size(WG_SIZE)
+fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
+        @builtin(local_invocation_id) local_id: vec3<u32>,
+        @builtin(num_workgroups) num_wg: vec3<u32>) {
+
+    let thread_id = local_id.x;
+    let own_expert = wg_id.y * num_wg.x + wg_id.x; // the expert assigned to this workgroup
+
+    if (own_expert < params.n_expert) {
+        if (thread_id == 0u) {
+            atomicStore(&count, 0);
+        }
+
+        workgroupBarrier();
+
+        for (var i = thread_id;i < params.n_expert_used * params.n_tokens;i += WG_SIZE) {
+            let row = i / params.n_expert_used;
+            let col = i % params.n_expert_used;
+            let expert = u32(ids[params.offset_ids + row * params.stride_ids_1 + col]);
+            if (own_expert == expert) {
+                let pos = atomicAdd(&count, 1u);
+                let gathered_id = own_expert * params.n_tokens + pos;
+                global_gathered_expert_used[gathered_id] = col;
+                global_gathered_tokens[gathered_id] = row;
+            }
+        }
+
+        workgroupBarrier();
+
+        if (thread_id == 0u) {
+            gathered_count_ids[own_expert] = atomicLoad(&count);
+        }
+    }
+}
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -651,6 +651,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
    },
+    [GGML_TYPE_Q1_0] = {
+        .type_name                = "q1_0",
+        .blck_size                = QK1_0,
+        .type_size                = sizeof(block_q1_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q1_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q1_0_ref,
+    },
    [GGML_TYPE_Q4_0] = {
        .type_name                = "q4_0",
        .blck_size                = QK4_0,
@@ -1384,6 +1392,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_BF16:          wtype = GGML_TYPE_BF16;  break;
        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
+        case GGML_FTYPE_MOSTLY_Q1_0:          wtype = GGML_TYPE_Q1_0;  break;
        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
@@ -7652,6 +7661,7 @@ size_t ggml_quantize_chunk(
    size_t result = 0;

    switch (type) {
+        case GGML_TYPE_Q1_0:    result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3996,6 +3996,7 @@ class GGMLQuantizationType(IntEnum):
    TQ2_0   = 35
    MXFP4   = 39
    NVFP4   = 40
+    Q1_0    = 41


 class ExpertGatingFuncType(IntEnum):
@@ -4049,6 +4050,7 @@ class LlamaFileType(IntEnum):
    MOSTLY_TQ2_0         = 37  # except 1d tensors
    MOSTLY_MXFP4_MOE     = 38  # except 1d tensors
    MOSTLY_NVFP4         = 39  # except 1d tensors
+    MOSTLY_Q1_0          = 40  # except 1d tensors

    GUESSED              = 1024  # not specified in the model file

@@ -4161,6 +4163,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
    GGMLQuantizationType.MXFP4:   (32, 1 + 16),
    GGMLQuantizationType.NVFP4:   (64, 4 + 32),
+    GGMLQuantizationType.Q1_0:    (128, 2 + 16),
 }


--- a/include/llama.h
+++ b/include/llama.h
@@ -154,6 +154,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_NVFP4         = 39, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q1_0          = 40, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -36,6 +36,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_ALL_F32:         return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16:      return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16:     return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q1_0:     return "Q1_0";
        case LLAMA_FTYPE_MOSTLY_Q4_0:     return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1:     return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0:     return "Q5_0";
@@ -758,6 +759,7 @@ llama_model_loader::llama_model_loader(
            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
            case GGML_TYPE_NVFP4:   ftype = LLAMA_FTYPE_MOSTLY_NVFP4;   break;
+            case GGML_TYPE_Q1_0:    ftype = LLAMA_FTYPE_MOSTLY_Q1_0;    break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -799,6 +799,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_F16:  return GGML_TYPE_F16;
        case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
        case LLAMA_FTYPE_ALL_F32:     return GGML_TYPE_F32;
+        case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0;

        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;

--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -16,6 +16,7 @@

 constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BINARY = 0.025f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
@@ -24,6 +25,7 @@ constexpr float MAX_QUANTIZATION_TOTAL_ERROR_FP4 = 0.0030f;
 constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
 constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
 constexpr float MAX_DOT_PRODUCT_ERROR_FP4 = 0.03f;
+constexpr float MAX_DOT_PRODUCT_ERROR_BINARY = 0.40f;
 constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;

 static const char* RESULT_STR[] = {"ok", "FAILED"};
@@ -145,6 +147,7 @@ int main(int argc, char * argv[]) {
        if (qfns_cpu->from_float && qfns->to_float) {
            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
            const float max_quantization_error =
+                type == GGML_TYPE_Q1_0    ? MAX_QUANTIZATION_TOTAL_ERROR_BINARY :
                type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                type == GGML_TYPE_TQ2_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
@@ -170,6 +173,8 @@ int main(int argc, char * argv[]) {
            const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
                                          ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                                          : type == GGML_TYPE_Q1_0
+                                          ? MAX_DOT_PRODUCT_ERROR_BINARY
                                          : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
                                          ? MAX_DOT_PRODUCT_ERROR_TERNARY
                                          : type == GGML_TYPE_NVFP4
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -29,6 +29,7 @@ struct quant_option {
 };

 static const std::vector<quant_option> QUANT_OPTIONS = {
+    { "Q1_0",     LLAMA_FTYPE_MOSTLY_Q1_0,     " 1.125 bpw quantization",           },
    { "Q4_0",     LLAMA_FTYPE_MOSTLY_Q4_0,     " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
    { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,     " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
    { "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE",  },