From 472bddf733377018e9c8d0828358f91dc0602ffa Mon Sep 17 00:00:00 2001 From: mdecimus <11444311+mdecimus@users.noreply.github.com> Date: Mon, 15 Dec 2025 16:30:27 +0100 Subject: [PATCH] Spam filter and index configuration updates --- .github/workflows/ci.yml | 6 +-- Dockerfile.build | 8 +-- crates/common/src/config/jmap/settings.rs | 15 ++++-- crates/common/src/config/spamfilter.rs | 18 +++++-- crates/http/src/management/spam.rs | 16 +++++- crates/migration/src/blob.rs | 6 +++ crates/nlp/src/classifier/feature.rs | 19 ++++--- crates/nlp/src/classifier/sgd.rs | 4 +- crates/spam-filter/src/modules/classifier.rs | 53 ++++++++++++++------ 9 files changed, 100 insertions(+), 45 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8cfd1851..791d159c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -311,7 +311,7 @@ jobs: - name: Build run: | rustup target add ${{matrix.target}} - cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "sqlite postgres mysql rocks elastic s3 redis azure nats enterprise" + cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "sqlite postgres mysql rocks s3 redis azure nats enterprise" cargo build --release --target ${{matrix.target}} -p stalwart-cli mkdir -p artifacts mv ./target/${{matrix.target}}/release/stalwart.exe ./artifacts/stalwart.exe @@ -349,14 +349,14 @@ jobs: # Get latest FoundationDB installer curl --retry 5 -Lso foundationdb.pkg "$(gh api -X GET /repos/apple/foundationdb/releases --jq '.[] | select(.prerelease == false) | .assets[] | select(.name | test("${{startsWith(matrix.target, 'x86') && 'x86_64' || 'arm64'}}" + ".pkg$")) | .browser_download_url' | head -n1)" sudo installer -allowUntrusted -dumplog -pkg foundationdb.pkg -target / - cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "foundationdb elastic s3 redis nats enterprise" + cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "foundationdb s3 redis nats enterprise" mkdir -p artifacts mv ./target/${{matrix.target}}/release/stalwart ./artifacts/stalwart-foundationdb - name: Build run: | rustup target add ${{matrix.target}} - cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "sqlite postgres mysql rocks elastic s3 redis azure nats enterprise" + cargo build --release --target ${{matrix.target}} -p stalwart --no-default-features --features "sqlite postgres mysql rocks s3 redis azure nats enterprise" cargo build --release --target ${{matrix.target}} -p stalwart-cli mkdir -p artifacts mv ./target/${{matrix.target}}/release/stalwart ./artifacts/stalwart diff --git a/Dockerfile.build b/Dockerfile.build index 064cc895..dbbb9988 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -92,7 +92,7 @@ RUN \ --mount=type=cache,target=/usr/local/cargo/git \ source /env-cargo && \ if [ ! -z "${FDB_ARCH}" ]; then \ - RUSTFLAGS="-L /usr/lib" cargo chef cook --recipe-path recipe.json --zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "foundationdb elastic s3 redis nats enterprise"; \ + RUSTFLAGS="-L /usr/lib" cargo chef cook --recipe-path recipe.json --zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "foundationdb s3 redis nats enterprise"; \ fi RUN \ --mount=type=secret,id=ACTIONS_RESULTS_URL,env=ACTIONS_RESULTS_URL \ @@ -100,7 +100,7 @@ RUN \ --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/usr/local/cargo/git \ source /env-cargo && \ - cargo chef cook --recipe-path recipe.json --zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "sqlite postgres mysql rocks elastic s3 redis azure nats enterprise" && \ + cargo chef cook --recipe-path recipe.json --zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "sqlite postgres mysql rocks s3 redis azure nats enterprise" && \ cargo chef cook --recipe-path recipe.json --zigbuild --release --target ${TARGET} -p stalwart-cli # Copy the source code COPY . . @@ -114,7 +114,7 @@ RUN \ --mount=type=cache,target=/usr/local/cargo/git \ source /env-cargo && \ if [ ! -z "${FDB_ARCH}" ]; then \ - RUSTFLAGS="-L /usr/lib" cargo zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "foundationdb elastic s3 redis nats enterprise" && \ + RUSTFLAGS="-L /usr/lib" cargo zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "foundationdb s3 redis nats enterprise" && \ mv /app/target/${TARGET}/release/stalwart /app/artifact/stalwart-foundationdb; \ fi # Build generic version @@ -124,7 +124,7 @@ RUN \ --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/usr/local/cargo/git \ source /env-cargo && \ - cargo zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "sqlite postgres mysql rocks elastic s3 redis azure nats enterprise" && \ + cargo zigbuild --release --target ${TARGET} -p stalwart --no-default-features --features "sqlite postgres mysql rocks s3 redis azure nats enterprise" && \ cargo zigbuild --release --target ${TARGET} -p stalwart-cli && \ mv /app/target/${TARGET}/release/stalwart /app/artifact/stalwart && \ mv /app/target/${TARGET}/release/stalwart-cli /app/artifact/stalwart-cli diff --git a/crates/common/src/config/jmap/settings.rs b/crates/common/src/config/jmap/settings.rs index 0f8c2f3d..971ad86c 100644 --- a/crates/common/src/config/jmap/settings.rs +++ b/crates/common/src/config/jmap/settings.rs @@ -219,7 +219,7 @@ impl JmapConfig { let mut jmap = JmapConfig { default_language: Language::from_iso_639( config - .value("storage.full-text.default-language") + .value("storage.search-index.default-language") .unwrap_or("en"), ) .unwrap_or(Language::English), @@ -356,7 +356,9 @@ impl JmapConfig { calendar_parse_max_items: config .property("jmap.calendar.parse.max-items") .unwrap_or(10), - index_batch_size: config.property("jmap.index.batch-size").unwrap_or(100), + index_batch_size: config + .property("storage.search-index.batch-size") + .unwrap_or(100), index_fields: AHashMap::new(), default_folders, shared_folder, @@ -379,14 +381,17 @@ impl JmapConfig { }; if !config - .property_or_default::(&format!("jmap.index.{index_name}.enabled"), "true") + .property_or_default::( + &format!("storage.search-index.{index_name}.enabled"), + "true", + ) .unwrap_or(true) { continue; } - for (_, field) in - config.properties::(&format!("jmap.index.{index_name}.fields")) + for (_, field) in config + .properties::(&format!("storage.search-index.{index_name}.fields")) { fields.insert(field); } diff --git a/crates/common/src/config/spamfilter.rs b/crates/common/src/config/spamfilter.rs index d8b76d7b..8b64971a 100644 --- a/crates/common/src/config/spamfilter.rs +++ b/crates/common/src/config/spamfilter.rs @@ -91,6 +91,8 @@ pub struct ClassifierConfig { pub auto_learn_ham_score: f32, pub hold_samples_for: u64, pub train_frequency: Option, + pub log_scale: bool, + pub l2_normalize: bool, } #[derive(Debug, Clone, Default)] @@ -454,7 +456,7 @@ impl ClassifierConfig { let ccfh = match config.value("spam-filter.classifier.model") { Some("ftrl-fh") | None => false, Some("ftrl-ccfh") => true, - Some("disabled") => return None, + Some("disabled" | "disable") => return None, Some(other) => { config.new_build_error( "spam-filter.classifier.model", @@ -498,11 +500,11 @@ impl ClassifierConfig { .unwrap_or(Duration::from_secs(180 * 24 * 60 * 60)) .as_secs(), min_ham_samples: config - .property_or_default("spam-filter.classifier.samples.min-ham", "10") - .unwrap_or(10), + .property_or_default("spam-filter.classifier.samples.min-ham", "100") + .unwrap_or(100), min_spam_samples: config - .property_or_default("spam-filter.classifier.samples.min-spam", "10") - .unwrap_or(10), + .property_or_default("spam-filter.classifier.samples.min-spam", "100") + .unwrap_or(100), train_frequency: config .property_or_default::>( "spam-filter.classifier.training.frequency", @@ -510,6 +512,12 @@ impl ClassifierConfig { ) .unwrap_or(Some(Duration::from_secs(12 * 60 * 60))) .map(|d| d.as_secs()), + log_scale: config + .property_or_default("spam-filter.classifier.features.log-scale", "true") + .unwrap_or(true), + l2_normalize: config + .property_or_default("spam-filter.classifier.features.l2-normalize", "true") + .unwrap_or(true), } .into() } diff --git a/crates/http/src/management/spam.rs b/crates/http/src/management/spam.rs index 9f227151..c6f12279 100644 --- a/crates/http/src/management/spam.rs +++ b/crates/http/src/management/spam.rs @@ -4,7 +4,13 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use common::{Server, auth::AccessToken, config::spamfilter::SpamFilterAction, psl}; +use common::{ + Server, + auth::AccessToken, + config::spamfilter::SpamFilterAction, + manager::{SPAM_CLASSIFIER_KEY, SPAM_TRAINER_KEY}, + psl, +}; use directory::{ Permission, backend::internal::manage::{self, ManageDirectory}, @@ -87,7 +93,7 @@ impl ManageSpamHandler for Server { access_token: &AccessToken, ) -> trc::Result { match (path.get(1).copied(), path.get(2).copied(), req.method()) { - (Some("sample"), Some(class @ ("ham" | "spam")), &Method::POST) => { + (Some("upload"), Some(class @ ("ham" | "spam")), &Method::POST) => { // Validate the access token access_token.assert_has_permission(Permission::SpamFilterTrain)?; @@ -166,6 +172,12 @@ impl ManageSpamHandler for Server { false } } + Some("delete") => { + for key in [SPAM_CLASSIFIER_KEY, SPAM_TRAINER_KEY] { + self.blob_store().delete_blob(key).await?; + } + true + } Some("status") => self.inner.ipc.train_task_controller.is_running(), _ => { return Err(trc::ResourceEvent::NotFound.into_err()); diff --git a/crates/migration/src/blob.rs b/crates/migration/src/blob.rs index a0add790..9db014b0 100644 --- a/crates/migration/src/blob.rs +++ b/crates/migration/src/blob.rs @@ -216,6 +216,10 @@ pub(crate) async fn migrate_blobs_v014(server: &Server) -> trc::Result<()> { ); } OldType::Undelete { deleted_at, size } => { + // SPDX-SnippetBegin + // SPDX-FileCopyrightText: 2020 Stalwart Labs LLC + // SPDX-License-Identifier: LicenseRef-SEL + #[cfg(feature = "enterprise")] { batch @@ -244,6 +248,8 @@ pub(crate) async fn migrate_blobs_v014(server: &Server) -> trc::Result<()> { .caused_by(trc::location!())?, ); } + + // SPDX-SnippetEnd } OldType::Temp => { batch.set( diff --git a/crates/nlp/src/classifier/feature.rs b/crates/nlp/src/classifier/feature.rs index 88aeeb07..dfd9863a 100644 --- a/crates/nlp/src/classifier/feature.rs +++ b/crates/nlp/src/classifier/feature.rs @@ -124,6 +124,7 @@ pub trait FeatureBuilder { &self, features_in: &HashMap, account_id: Option, + l2_normalize: bool, ) -> Vec { let mut features_out = Vec::with_capacity(features_in.len()); let mut buf = Vec::with_capacity(2 + 4 + 63); @@ -141,14 +142,16 @@ pub trait FeatureBuilder { } // L2 normalization - let sum_of_squares = features_out - .iter() - .map(|f| f.weight() as f64 * f.weight() as f64) - .sum::(); - if sum_of_squares > 0.0 { - let norm = sum_of_squares.sqrt() as f32; - for feature in &mut features_out { - *feature.weight_mut() /= norm; + if l2_normalize { + let sum_of_squares = features_out + .iter() + .map(|f| f.weight() as f64 * f.weight() as f64) + .sum::(); + if sum_of_squares > 0.0 { + let norm = sum_of_squares.sqrt() as f32; + for feature in &mut features_out { + *feature.weight_mut() /= norm; + } } } diff --git a/crates/nlp/src/classifier/sgd.rs b/crates/nlp/src/classifier/sgd.rs index d387ff10..8947cb31 100644 --- a/crates/nlp/src/classifier/sgd.rs +++ b/crates/nlp/src/classifier/sgd.rs @@ -410,7 +410,7 @@ pub mod tests { } builder.scale(&mut sample); samples.push(Sample { - features: builder.build(&sample, 12345.into()), + features: builder.build(&sample, 12345.into(), true), class: if *class { 1.0 } else { 0.0 }, }); } @@ -431,7 +431,7 @@ pub mod tests { } builder.scale(&mut sample); samples.push(Sample { - features: builder.build(&sample, 12345.into()), + features: builder.build(&sample, 12345.into(), true), class: if *class { 1.0 } else { 0.0 }, }); } diff --git a/crates/spam-filter/src/modules/classifier.rs b/crates/spam-filter/src/modules/classifier.rs index 5ffa9041..db238e9f 100644 --- a/crates/spam-filter/src/modules/classifier.rs +++ b/crates/spam-filter/src/modules/classifier.rs @@ -367,16 +367,20 @@ impl SpamClassifier for Server { match &task { TrainTask::Fh { builder, .. } => { - builder.scale(&mut tokens); + if config.log_scale { + builder.scale(&mut tokens); + } fh_samples.push(Sample::new( - builder.build(&tokens, account_id), + builder.build(&tokens, account_id, config.l2_normalize), sample.is_spam, )); } TrainTask::Ccfh { builder, .. } => { - builder.scale(&mut tokens); + if config.log_scale { + builder.scale(&mut tokens); + } ccfh_samples.push(Sample::new( - builder.build(&tokens, account_id), + builder.build(&tokens, account_id, config.l2_normalize), sample.is_spam, )); } @@ -558,6 +562,9 @@ impl SpamClassifier for Server { async fn spam_classify(&self, ctx: &mut SpamFilterContext<'_>) -> trc::Result<()> { let classifier = self.inner.data.spam_classifier.load_full(); + let Some(config) = &self.core.spam.classifier else { + return Ok(()); + }; let started = Instant::now(); match classifier.as_ref() { @@ -566,7 +573,9 @@ impl SpamClassifier for Server { let mut has_prediction = false; let mut tokens = self.spam_build_tokens(ctx).await.0; let feature_builder = classifier.feature_builder(); - feature_builder.scale(&mut tokens); + if config.log_scale { + feature_builder.scale(&mut tokens); + } for rcpt in &ctx.input.env_rcpt_to { let prediction = if let Some(account_id) = self @@ -577,9 +586,11 @@ impl SpamClassifier for Server { { has_prediction = true; classifier - .predict_proba_sample( - &feature_builder.build(&tokens, account_id.into()), - ) + .predict_proba_sample(&feature_builder.build( + &tokens, + account_id.into(), + config.l2_normalize, + )) .into() } else { None @@ -591,8 +602,11 @@ impl SpamClassifier for Server { ctx.result.classifier_confidence = classifier_confidence; } else { // None of the recipients are local, default to global model prediction - let prediction = - classifier.predict_proba_sample(&feature_builder.build(&tokens, None)); + let prediction = classifier.predict_proba_sample(&feature_builder.build( + &tokens, + None, + config.l2_normalize, + )); ctx.result.classifier_confidence = vec![prediction.into(); ctx.input.env_rcpt_to.len()]; } @@ -602,7 +616,9 @@ impl SpamClassifier for Server { let mut has_prediction = false; let mut tokens = self.spam_build_tokens(ctx).await.0; let feature_builder = classifier.feature_builder(); - feature_builder.scale(&mut tokens); + if config.log_scale { + feature_builder.scale(&mut tokens); + } for rcpt in &ctx.input.env_rcpt_to { let prediction = if let Some(account_id) = self @@ -613,9 +629,11 @@ impl SpamClassifier for Server { { has_prediction = true; classifier - .predict_proba_sample( - &feature_builder.build(&tokens, account_id.into()), - ) + .predict_proba_sample(&feature_builder.build( + &tokens, + account_id.into(), + config.l2_normalize, + )) .into() } else { None @@ -627,8 +645,11 @@ impl SpamClassifier for Server { ctx.result.classifier_confidence = classifier_confidence; } else { // None of the recipients are local, default to global model prediction - let prediction = - classifier.predict_proba_sample(&feature_builder.build(&tokens, None)); + let prediction = classifier.predict_proba_sample(&feature_builder.build( + &tokens, + None, + config.l2_normalize, + )); ctx.result.classifier_confidence = vec![prediction.into(); ctx.input.env_rcpt_to.len()]; }