android: routine maintenance - Dec 2025 (#18338)

* Fix `msg` typo * Fix thread safety in destroy() to support aborting generation in lifecycle callbacks. * UI polish: stack new messages from the bottom; fix GGUF margin not in viewport * Bug fixes: rare race condition when the main thread updates the view and the default thread updates messages at the same time; user input not disabled during generation. * Bump dependencies' versions; deprecate outdated DSL usage.
server : handle closed connection for tasks (#18459)
2026-05-09 18:44:16 +00:00 · 2025-12-29 15:51:13 +02:00 · 2025-12-29 15:34:41 +02:00 · 2025-12-29 13:37:02 +01:00
9 changed files with 336 additions and 193 deletions
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -41,11 +41,8 @@ android {
        }
    }
    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_1_8
-        targetCompatibility = JavaVersion.VERSION_1_8
-    }
-    kotlinOptions {
-        jvmTarget = "1.8"
+        sourceCompatibility = JavaVersion.VERSION_17
+        targetCompatibility = JavaVersion.VERSION_17
    }
 }

--- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -6,6 +6,7 @@ import android.util.Log
 import android.widget.EditText
 import android.widget.TextView
 import android.widget.Toast
+import androidx.activity.addCallback
 import androidx.activity.enableEdgeToEdge
 import androidx.activity.result.contract.ActivityResultContracts
 import androidx.appcompat.app.AppCompatActivity
@@ -18,6 +19,7 @@ import com.arm.aichat.gguf.GgufMetadata
 import com.arm.aichat.gguf.GgufMetadataReader
 import com.google.android.material.floatingactionbutton.FloatingActionButton
 import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.Job
 import kotlinx.coroutines.flow.onCompletion
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.withContext
@@ -36,6 +38,7 @@ class MainActivity : AppCompatActivity() {

    // Arm AI Chat inference engine
    private lateinit var engine: InferenceEngine
+    private var generationJob: Job? = null

    // Conversation states
    private var isModelReady = false
@@ -47,11 +50,13 @@ class MainActivity : AppCompatActivity() {
        super.onCreate(savedInstanceState)
        enableEdgeToEdge()
        setContentView(R.layout.activity_main)
+        // View model boilerplate and state management is out of this basic sample's scope
+        onBackPressedDispatcher.addCallback { Log.w(TAG, "Ignore back press for simplicity") }

        // Find views
        ggufTv = findViewById(R.id.gguf)
        messagesRv = findViewById(R.id.messages)
-        messagesRv.layoutManager = LinearLayoutManager(this)
+        messagesRv.layoutManager = LinearLayoutManager(this).apply { stackFromEnd = true }
        messagesRv.adapter = messageAdapter
        userInputEt = findViewById(R.id.user_input)
        userActionFab = findViewById(R.id.fab)
@@ -157,33 +162,35 @@ class MainActivity : AppCompatActivity() {
     * Validate and send the user message into [InferenceEngine]
     */
    private fun handleUserInput() {
-        userInputEt.text.toString().also { userSsg ->
-            if (userSsg.isEmpty()) {
+        userInputEt.text.toString().also { userMsg ->
+            if (userMsg.isEmpty()) {
                Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show()
            } else {
                userInputEt.text = null
+                userInputEt.isEnabled = false
                userActionFab.isEnabled = false

                // Update message states
-                messages.add(Message(UUID.randomUUID().toString(), userSsg, true))
+                messages.add(Message(UUID.randomUUID().toString(), userMsg, true))
                lastAssistantMsg.clear()
                messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false))

-                lifecycleScope.launch(Dispatchers.Default) {
-                    engine.sendUserPrompt(userSsg)
+                generationJob = lifecycleScope.launch(Dispatchers.Default) {
+                    engine.sendUserPrompt(userMsg)
                        .onCompletion {
                            withContext(Dispatchers.Main) {
+                                userInputEt.isEnabled = true
                                userActionFab.isEnabled = true
                            }
                        }.collect { token ->
-                            val messageCount = messages.size
-                            check(messageCount > 0 && !messages[messageCount - 1].isUser)
-
-                            messages.removeAt(messageCount - 1).copy(
-                                content = lastAssistantMsg.append(token).toString()
-                            ).let { messages.add(it) }
-
                            withContext(Dispatchers.Main) {
+                                val messageCount = messages.size
+                                check(messageCount > 0 && !messages[messageCount - 1].isUser)
+
+                                messages.removeAt(messageCount - 1).copy(
+                                    content = lastAssistantMsg.append(token).toString()
+                                ).let { messages.add(it) }
+
                                messageAdapter.notifyItemChanged(messages.size - 1)
                            }
                        }
@@ -195,6 +202,7 @@ class MainActivity : AppCompatActivity() {
    /**
     * Run a benchmark with the model file
     */
+    @Deprecated("This benchmark doesn't accurately indicate GUI performance expected by app developers")
    private suspend fun runBenchmark(modelName: String, modelFile: File) =
        withContext(Dispatchers.Default) {
            Log.i(TAG, "Starts benchmarking $modelName")
@@ -223,6 +231,16 @@ class MainActivity : AppCompatActivity() {
            if (!it.exists()) { it.mkdir() }
        }

+    override fun onStop() {
+        generationJob?.cancel()
+        super.onStop()
+    }
+
+    override fun onDestroy() {
+        engine.destroy()
+        super.onDestroy()
+    }
+
    companion object {
        private val TAG = MainActivity::class.java.simpleName

--- a/examples/llama.android/app/src/main/res/layout/activity_main.xml
+++ b/examples/llama.android/app/src/main/res/layout/activity_main.xml
@@ -24,7 +24,7 @@
                android:id="@+id/gguf"
                android:layout_width="match_parent"
                android:layout_height="wrap_content"
-                android:layout_margin="16dp"
+                android:padding="16dp"
                android:text="Selected GGUF model's metadata will show here."
                style="@style/TextAppearance.MaterialComponents.Body2" />

@@ -33,8 +33,7 @@
        <com.google.android.material.divider.MaterialDivider
            android:layout_width="match_parent"
            android:layout_height="2dp"
-            android:layout_marginHorizontal="16dp"
-            android:layout_marginVertical="8dp" />
+            android:layout_marginHorizontal="16dp" />

        <androidx.recyclerview.widget.RecyclerView
            android:id="@+id/messages"
--- a/examples/llama.android/gradle/libs.versions.toml
+++ b/examples/llama.android/gradle/libs.versions.toml
@@ -1,15 +1,15 @@
 [versions]

 # Plugins
-agp = "8.13.0"
-kotlin = "2.2.20"
+agp = "8.13.2"
+kotlin = "2.3.0"

 # AndroidX
-activity = "1.11.0"
+activity = "1.12.2"
 appcompat = "1.7.1"
 core-ktx = "1.17.0"
 constraint-layout = "2.2.1"
-datastore-preferences = "1.1.7"
+datastore-preferences = "1.2.0"

 # Material
 material = "1.13.0"
--- a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
+++ b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
@@ -560,6 +560,6 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_unload(JNIEnv * /*unused*/, job

 extern "C"
 JNIEXPORT void JNICALL
-Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv *env, jobject /*unused*/) {
+Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv *, jobject /*unused*/) {
    llama_backend_free();
 }
--- a/examples/llama.android/lib/src/main/java/com/arm/aichat/InferenceEngine.kt
+++ b/examples/llama.android/lib/src/main/java/com/arm/aichat/InferenceEngine.kt
@@ -38,7 +38,7 @@ interface InferenceEngine {
    /**
     * Unloads the currently loaded model.
     */
-    suspend fun cleanUp()
+    fun cleanUp()

    /**
     * Cleans up resources when the engine is no longer needed.
--- a/examples/llama.android/lib/src/main/java/com/arm/aichat/internal/InferenceEngineImpl.kt
+++ b/examples/llama.android/lib/src/main/java/com/arm/aichat/internal/InferenceEngineImpl.kt
@@ -15,9 +15,11 @@ import kotlinx.coroutines.cancel
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.StateFlow
+import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.flow
 import kotlinx.coroutines.flow.flowOn
 import kotlinx.coroutines.launch
+import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.withContext
 import java.io.File
 import java.io.IOException
@@ -109,9 +111,11 @@ internal class InferenceEngineImpl private constructor(

    private val _state =
        MutableStateFlow<InferenceEngine.State>(InferenceEngine.State.Uninitialized)
-    override val state: StateFlow<InferenceEngine.State> = _state
+    override val state: StateFlow<InferenceEngine.State> = _state.asStateFlow()

    private var _readyForSystemPrompt = false
+    @Volatile
+    private var _cancelGeneration = false

    /**
     * Single-threaded coroutine dispatcher & scope for LLama asynchronous operations
@@ -169,6 +173,8 @@ internal class InferenceEngineImpl private constructor(
                }
                Log.i(TAG, "Model loaded!")
                _readyForSystemPrompt = true
+
+                _cancelGeneration = false
                _state.value = InferenceEngine.State.ModelReady
            } catch (e: Exception) {
                Log.e(TAG, (e.message ?: "Error loading model") + "\n" + pathToModel, e)
@@ -231,15 +237,19 @@ internal class InferenceEngineImpl private constructor(

            Log.i(TAG, "User prompt processed. Generating assistant prompt...")
            _state.value = InferenceEngine.State.Generating
-            while (true) {
+            while (!_cancelGeneration) {
                generateNextToken()?.let { utf8token ->
                    if (utf8token.isNotEmpty()) emit(utf8token)
                } ?: break
            }
-            Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
+            if (_cancelGeneration) {
+                Log.i(TAG, "Assistant generation aborted per requested.")
+            } else {
+                Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
+            }
            _state.value = InferenceEngine.State.ModelReady
        } catch (e: CancellationException) {
-            Log.i(TAG, "Generation cancelled by user.")
+            Log.i(TAG, "Assistant generation's flow collection cancelled.")
            _state.value = InferenceEngine.State.ModelReady
            throw e
        } catch (e: Exception) {
@@ -268,8 +278,9 @@ internal class InferenceEngineImpl private constructor(
    /**
     * Unloads the model and frees resources, or reset error states
     */
-    override suspend fun cleanUp() =
-        withContext(llamaDispatcher) {
+    override fun cleanUp() {
+        _cancelGeneration = true
+        runBlocking(llamaDispatcher) {
            when (val state = _state.value) {
                is InferenceEngine.State.ModelReady -> {
                    Log.i(TAG, "Unloading model and free resources...")
@@ -293,17 +304,21 @@ internal class InferenceEngineImpl private constructor(
                else -> throw IllegalStateException("Cannot unload model in ${state.javaClass.simpleName}")
            }
        }
+    }

    /**
     * Cancel all ongoing coroutines and free GGML backends
     */
    override fun destroy() {
-        _readyForSystemPrompt = false
-        llamaScope.cancel()
-        when(_state.value) {
-            is InferenceEngine.State.Uninitialized -> {}
-            is InferenceEngine.State.Initialized -> shutdown()
-            else -> { unload(); shutdown() }
+        _cancelGeneration = true
+        runBlocking(llamaDispatcher) {
+            _readyForSystemPrompt = false
+            when(_state.value) {
+                is InferenceEngine.State.Uninitialized -> {}
+                is InferenceEngine.State.Initialized -> shutdown()
+                else -> { unload(); shutdown() }
+            }
        }
+        llamaScope.cancel()
    }
 }
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -2,6 +2,7 @@

 import argparse
 import os
+import sys
 import numpy as np
 import importlib
 from pathlib import Path
@@ -9,169 +10,243 @@ from pathlib import Path
 from transformers import AutoTokenizer, AutoConfig, AutoModel
 import torch

-unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

-parser = argparse.ArgumentParser(description='Process model with specified path')
-parser.add_argument('--model-path', '-m', help='Path to the model')
-parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
-parser.add_argument('--use-sentence-transformers', action='store_true',
-                    help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
-args = parser.parse_args()
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Run original embedding model')
+    parser.add_argument(
+        '--model-path',
+        '-m',
+        help='Path to the model'
+    )
+    parser.add_argument(
+        '--prompts-file',
+        '-p',
+        help='Path to file containing prompts (one per line)'
+    )
+    parser.add_argument(
+        '--use-sentence-transformers',
+        action='store_true',
+        help=('Use SentenceTransformer to apply all numbered layers '
+              '(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
+    )
+    parser.add_argument(
+        '--device',
+        '-d',
+        help='Device to use (cpu, cuda, mps, auto)',
+        default='auto'
+    )
+    return parser.parse_args()

-def read_prompt_from_file(file_path):
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read().strip()
-    except FileNotFoundError:
-        print(f"Error: Prompts file '{file_path}' not found")
-        exit(1)
-    except Exception as e:
-        print(f"Error reading prompts file: {e}")
-        exit(1)

-model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
-if model_path is None:
-    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
-
-# Determine if we should use SentenceTransformer
-use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
-
-if use_sentence_transformers:
-    from sentence_transformers import SentenceTransformer
-    print("Using SentenceTransformer to apply all numbered layers")
-    model = SentenceTransformer(model_path)
-    tokenizer = model.tokenizer
-    config = model[0].auto_model.config  # type: ignore
-else:
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-
-    # This can be used to override the sliding window size for manual testing. This
-    # can be useful to verify the sliding window attention mask in the original model
-    # and compare it with the converted .gguf model.
-    if hasattr(config, 'sliding_window'):
-        original_sliding_window = config.sliding_window
-        #original_sliding_window = 6
-        print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
-
-    print(f"Using unreleased model: {unreleased_model_name}")
-    if unreleased_model_name:
-        model_name_lower = unreleased_model_name.lower()
-        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-        class_name = f"{unreleased_model_name}Model"
-        print(f"Importing unreleased model module: {unreleased_module_path}")
-
-        try:
-            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-            model = model_class.from_pretrained(model_path, config=config, trust_remote_code=True)
-        except (ImportError, AttributeError) as e:
-            print(f"Failed to import or load model: {e}")
-            exit(1)
+def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
+    if device == "cpu":
+        device_map = {"": "cpu"}
+        print("Forcing CPU usage")
+    elif device == "auto":
+        # On Mac, "auto" device_map can cause issues with accelerate
+        # So we detect the best device manually
+        if torch.cuda.is_available():
+            device_map = {"": "cuda"}
+            print("Using CUDA")
+        elif torch.backends.mps.is_available():
+            device_map = {"": "mps"}
+            print("Using MPS (Apple Metal)")
+        else:
+            device_map = {"": "cpu"}
+            print("Using CPU")
    else:
-        model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True)
-    print(f"Model class: {type(model)}")
-    print(f"Model file: {type(model).__module__}")
+        device_map = {"": device}

-# Verify the model is using the correct sliding window
-if not use_sentence_transformers:
-    if hasattr(model.config, 'sliding_window'):  # type: ignore
-        print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
-    else:
-        print("Model config does not have sliding_window attribute")
-
-model_name = os.path.basename(model_path)
-
-if args.prompts_file:
-    prompt_text = read_prompt_from_file(args.prompts_file)
-    texts = [prompt_text]
-else:
-    texts = ["Hello world today"]
-
-with torch.no_grad():
    if use_sentence_transformers:
-        embeddings = model.encode(texts, convert_to_numpy=True)
-        all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
-
-        encoded = tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
-        tokens = encoded['input_ids'][0]
-        token_strings = tokenizer.convert_ids_to_tokens(tokens)
-        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-            print(f"{token_id:6d} -> '{token_str}'")
-
-        print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
-        print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
+        from sentence_transformers import SentenceTransformer
+        print("Using SentenceTransformer to apply all numbered layers")
+        model = SentenceTransformer(model_path)
+        tokenizer = model.tokenizer
+        config = model[0].auto_model.config  # type: ignore
    else:
-        # Standard approach: use base model output only
-        encoded = tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

-        tokens = encoded['input_ids'][0]
-        token_strings = tokenizer.convert_ids_to_tokens(tokens)
-        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-            print(f"{token_id:6d} -> '{token_str}'")
+        # This can be used to override the sliding window size for manual testing. This
+        # can be useful to verify the sliding window attention mask in the original model
+        # and compare it with the converted .gguf model.
+        if hasattr(config, 'sliding_window'):
+            original_sliding_window = config.sliding_window
+            print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")

-        outputs = model(**encoded)
-        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
+        unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
+        print(f"Using unreleased model: {unreleased_model_name}")
+        if unreleased_model_name:
+            model_name_lower = unreleased_model_name.lower()
+            unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+            class_name = f"{unreleased_model_name}Model"
+            print(f"Importing unreleased model module: {unreleased_module_path}")

-        all_embeddings = hidden_states[0].float().cpu().numpy()  # Shape: [seq_len, hidden_size]
+            try:
+                model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+                model = model_class.from_pretrained(
+                    model_path,
+                    device_map=device_map,
+                    offload_folder="offload",
+                    trust_remote_code=True,
+                    config=config
+                )
+            except (ImportError, AttributeError) as e:
+                print(f"Failed to import or load model: {e}")
+                sys.exit(1)
+        else:
+            model = AutoModel.from_pretrained(
+                model_path,
+                device_map=device_map,
+                offload_folder="offload",
+                trust_remote_code=True,
+                config=config
+            )
+        print(f"Model class: {type(model)}")
+        print(f"Model file: {type(model).__module__}")

-        print(f"Hidden states shape: {hidden_states.shape}")
-        print(f"All embeddings shape: {all_embeddings.shape}")
-        print(f"Embedding dimension: {all_embeddings.shape[1]}")
+        # Verify the model is using the correct sliding window
+        if hasattr(model.config, 'sliding_window'):  # type: ignore
+            print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
+        else:
+            print("Model config does not have sliding_window attribute")

-    if len(all_embeddings.shape) == 1:
-        n_embd = all_embeddings.shape[0]  # type: ignore
-        n_embd_count = 1
-        all_embeddings = all_embeddings.reshape(1, -1)
+    return model, tokenizer, config
+
+
+def get_prompt(args):
+    if args.prompts_file:
+        try:
+            with open(args.prompts_file, 'r', encoding='utf-8') as f:
+                return f.read().strip()
+        except FileNotFoundError:
+            print(f"Error: Prompts file '{args.prompts_file}' not found")
+            sys.exit(1)
+        except Exception as e:
+            print(f"Error reading prompts file: {e}")
+            sys.exit(1)
    else:
-        n_embd = all_embeddings.shape[1]  # type: ignore
-        n_embd_count = all_embeddings.shape[0]  # type: ignore
+        return "Hello world today"

-    print()

-    for j in range(n_embd_count):
-        embedding = all_embeddings[j]
-        print(f"embedding {j}: ", end="")
+def main():
+    args = parse_arguments()

-        # Print first 3 values
-        for i in range(min(3, n_embd)):
-            print(f"{embedding[i]:9.6f} ", end="")
+    model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
+    if model_path is None:
+        print("Error: Model path must be specified either via --model-path argument "
+              "or EMBEDDING_MODEL_PATH environment variable")
+        sys.exit(1)

-        print(" ... ", end="")
+    # Determine if we should use SentenceTransformer
+    use_st = (
+        args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
+    )

-        # Print last 3 values
-        for i in range(n_embd - 3, n_embd):
-            print(f"{embedding[i]:9.6f} ", end="")
+    model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)

-        print()  # New line
+    # Get the device the model is on
+    if not use_st:
+        device = next(model.parameters()).device
+    else:
+        # For SentenceTransformer, get device from the underlying model
+        device = next(model[0].auto_model.parameters()).device  # type: ignore

-    print()
+    model_name = os.path.basename(model_path)

-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
+    prompt_text = get_prompt(args)
+    texts = [prompt_text]

-    flattened_embeddings = all_embeddings.flatten()
-    flattened_embeddings.astype(np.float32).tofile(bin_filename)
+    with torch.no_grad():
+        if use_st:
+            embeddings = model.encode(texts, convert_to_numpy=True)
+            all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
+
+            encoded = tokenizer(
+                texts,
+                padding=True,
+                truncation=True,
+                return_tensors="pt"
+            )
+            tokens = encoded['input_ids'][0]
+            token_strings = tokenizer.convert_ids_to_tokens(tokens)
+            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+                print(f"{token_id:6d} -> '{token_str}'")
+
+            print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
+            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
+        else:
+            # Standard approach: use base model output only
+            encoded = tokenizer(
+                texts,
+                padding=True,
+                truncation=True,
+                return_tensors="pt"
+            )
+
+            tokens = encoded['input_ids'][0]
+            token_strings = tokenizer.convert_ids_to_tokens(tokens)
+            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+                print(f"{token_id:6d} -> '{token_str}'")
+
+            # Move inputs to the same device as the model
+            encoded = {k: v.to(device) for k, v in encoded.items()}
+            outputs = model(**encoded)
+            hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
+
+            all_embeddings = hidden_states[0].float().cpu().numpy()  # Shape: [seq_len, hidden_size]
+
+            print(f"Hidden states shape: {hidden_states.shape}")
+            print(f"All embeddings shape: {all_embeddings.shape}")
+            print(f"Embedding dimension: {all_embeddings.shape[1]}")
+
+        if len(all_embeddings.shape) == 1:
+            n_embd = all_embeddings.shape[0]  # type: ignore
+            n_embd_count = 1
+            all_embeddings = all_embeddings.reshape(1, -1)
+        else:
+            n_embd = all_embeddings.shape[1]  # type: ignore
+            n_embd_count = all_embeddings.shape[0]  # type: ignore
+
+        print()

-    with open(txt_filename, "w") as f:
-        idx = 0
        for j in range(n_embd_count):
-            for value in all_embeddings[j]:
-                f.write(f"{idx}: {value:.6f}\n")
-                idx += 1
-    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
-    print("")
-    print(f"Saved bin embeddings to: {bin_filename}")
-    print(f"Saved txt embeddings to: {txt_filename}")
+            embedding = all_embeddings[j]
+            print(f"embedding {j}: ", end="")
+
+            # Print first 3 values
+            for i in range(min(3, n_embd)):
+                print(f"{embedding[i]:9.6f} ", end="")
+
+            print(" ... ", end="")
+
+            # Print last 3 values
+            for i in range(n_embd - 3, n_embd):
+                print(f"{embedding[i]:9.6f} ", end="")
+
+            print()  # New line
+
+        print()
+
+        data_dir = Path("data")
+        data_dir.mkdir(exist_ok=True)
+        bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
+        txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
+
+        flattened_embeddings = all_embeddings.flatten()
+        flattened_embeddings.astype(np.float32).tofile(bin_filename)
+
+        with open(txt_filename, "w") as f:
+            idx = 0
+            for j in range(n_embd_count):
+                for value in all_embeddings[j]:
+                    f.write(f"{idx}: {value:.6f}\n")
+                    idx += 1
+        print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
+        print("")
+        print(f"Saved bin embeddings to: {bin_filename}")
+        print(f"Saved txt embeddings to: {txt_filename}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2960,19 +2960,22 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        // in streaming mode, the first error must be treated as non-stream response
        // this is to match the OAI API behavior
        // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
-        server_task_result_ptr first_result = rd.next(req.should_stop);
+        auto first_result = rd.next(req.should_stop);
        if (first_result == nullptr) {
+            GGML_ASSERT(req.should_stop());
            return res; // connection is closed
-        } else if (first_result->is_error()) {
+        }
+
+        if (first_result->is_error()) {
            res->error(first_result->to_json());
            return res;
-        } else {
-            GGML_ASSERT(
-                dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr
-                || dynamic_cast<server_task_result_cmpl_final*>(first_result.get()) != nullptr
-            );
        }

+        GGML_ASSERT(
+            dynamic_cast<server_task_result_cmpl_partial*>(first_result.get()) != nullptr ||
+            dynamic_cast<server_task_result_cmpl_final*>  (first_result.get()) != nullptr
+        );
+
        // next responses are streamed
        // to be sent immediately
        json first_result_json = first_result->to_json();
@@ -3028,6 +3031,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                auto result = rd.next(req.should_stop);
                if (result == nullptr) {
                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
+                    GGML_ASSERT(req.should_stop());
                    return false; // should_stop condition met
                }

@@ -3111,6 +3115,11 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
+        if (!result) {
+            // connection was closed
+            GGML_ASSERT(req.should_stop());
+            return res;
+        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3211,6 +3220,11 @@ void server_routes::init_routes() {

        // get the result
        auto result = res->rd.next(req.should_stop);
+        if (!result) {
+            // connection was closed
+            GGML_ASSERT(req.should_stop());
+            return res;
+        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3717,7 +3731,12 @@ void server_routes::init_routes() {
        }

        // get the result
-        server_task_result_ptr result = rd.next(req.should_stop);
+        auto result = rd.next(req.should_stop);
+        if (!result) {
+            // connection was closed
+            GGML_ASSERT(req.should_stop());
+            return res;
+        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3746,7 +3765,12 @@ void server_routes::init_routes() {
        }

        // get the result
-        server_task_result_ptr result = rd.next(req.should_stop);
+        auto result = rd.next(req.should_stop);
+        if (!result) {
+            // connection was closed
+            GGML_ASSERT(req.should_stop());
+            return res;
+        }

        if (result->is_error()) {
            res->error(result->to_json());
@@ -3779,7 +3803,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
        rd.post_task(std::move(task));
    }

-    server_task_result_ptr result = rd.next(req.should_stop);
+    auto result = rd.next(req.should_stop);
+    if (!result) {
+        // connection was closed
+        GGML_ASSERT(req.should_stop());
+        return res;
+    }

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3810,7 +3839,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
        rd.post_task(std::move(task));
    }

-    server_task_result_ptr result = rd.next(req.should_stop);
+    auto result = rd.next(req.should_stop);
+    if (!result) {
+        // connection was closed
+        GGML_ASSERT(req.should_stop());
+        return res;
+    }

    if (result->is_error()) {
        res->error(result->to_json());
@@ -3832,7 +3866,12 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
        rd.post_task(std::move(task));
    }

-    server_task_result_ptr result = rd.next(req.should_stop);
+    auto result = rd.next(req.should_stop);
+    if (!result) {
+        // connection was closed
+        GGML_ASSERT(req.should_stop());
+        return res;
+    }

    if (result->is_error()) {
        res->error(result->to_json());