From f64d56bcd819cb3fa7c8dd633eb493d00877d753 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 10 May 2026 20:47:08 +0300
Subject: [PATCH] llama-server-simulator : replace Flask with stdlib
 http.server

- Use HTTPServer + BaseHTTPRequestHandler instead of Flask
- RequestHandler handles POST /v1/chat/completions
- Server runs in daemon thread with clean Ctrl+C shutdown
- Remove flask and unused asdict imports

Assisted-by: llama.cpp:local pi
---
 examples/llama-eval/llama-server-simulator.py | 64 ++++++++++++++-----
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py
index 210683953e..5554132dee 100755
--- a/examples/llama-eval/llama-server-simulator.py
+++ b/examples/llama-eval/llama-server-simulator.py
@@ -7,12 +7,13 @@ import re
 import time
 import sys
 import os
+import threading
+from http.server import HTTPServer, BaseHTTPRequestHandler
 from typing import Dict, List, Optional
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from pathlib import Path
 
 import datasets
-from flask import Flask, request, jsonify
 
 # Set cache directory for HuggingFace datasets
 cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
@@ -48,7 +49,7 @@ def debug_log(message: str):
     with open("/tmp/simulator-debug.log", "a") as f:
         f.write(message + "\n")
 
-app = Flask(__name__)
+simulator: Optional["Simulator"] = None  # shared instance used by RequestHandler; None until assigned
 
 @dataclass
 class EvalState:
@@ -216,21 +217,70 @@ class Simulator:
 
         return response
 
-@app.route('/v1/chat/completions', methods=['POST'])
-def chat_completions():
-    try:
-        request_data = request.get_json()
+class RequestHandler(BaseHTTPRequestHandler):
+    """HTTP handler exposing the simulator at POST /v1/chat/completions.
+
+    Responses are produced by the module-level ``simulator`` instance.
+    """
+
+    def do_POST(self):
+        """Validate the request and reply with the simulator's JSON response.
+
+        Replies 404 for any other path, 400 when the body is not a
+        non-empty JSON object, and 500 when the simulator is not set or
+        processing the request raises.
+        """
+        if self.path != "/v1/chat/completions":
+            self._send_json({"error": "Not found"}, 404)
+            return
 
-        if not request_data:
-            return jsonify({"error": "Invalid JSON"}), 400
+        try:
+            request_data = self._read_json_body()
 
-        response = simulator._process_request(request_data)
+            # Only a non-empty JSON object is accepted; arrays, strings
+            # and numbers are rejected before reaching the simulator.
+            if not isinstance(request_data, dict) or not request_data:
+                self._send_json({"error": "Invalid JSON"}, 400)
+                return
 
-        return jsonify(response)
+            if simulator is None:
+                self._send_json({"error": "Simulator not initialized"}, 500)
+                return
+
+            response = simulator._process_request(request_data)
+            self._send_json(response, 200)
+
+        except Exception as e:
+            # Boundary handler: report the failure to the client as a 500.
+            print(f"Error processing request: {e}")
+            self._send_json({"error": str(e)}, 500)
+
+    def _read_json_body(self):
+        """Return the parsed JSON request body, or None if unusable."""
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+            # Clamp negatives: read(-1) would block until the client
+            # closes the connection.
+            body = self.rfile.read(max(content_length, 0))
+            return json.loads(body) if body else None
+        except ValueError:
+            # Non-numeric Content-Length, undecodable bytes or malformed
+            # JSON; JSONDecodeError and UnicodeDecodeError are ValueErrors.
+            return None
+
+    def _send_json(self, data: dict, status: int = 200):
+        """Send *data* serialized as JSON with the given HTTP status."""
+        body = json.dumps(data).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        # Suppress default request logging
+        pass
 
-    except Exception as e:
-        print(f"Error processing request: {e}")
-        return jsonify({"error": str(e)}), 500
 
 def main():
     parser = argparse.ArgumentParser(
@@ -271,13 +297,21 @@ def main():
         dataset_split=args.dataset_split
     )
 
+    server = HTTPServer((args.host, args.port), RequestHandler)
+    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    server_thread.start()
+
     print("\n=== llama-server-simulator ===")
     print(f"Server running on http://{args.host}:{args.port}")
     print(f"Success rate: {args.success_rate}")
     print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
     print("\nPress Ctrl+C to stop\n")
 
-    app.run(host=args.host, port=args.port, debug=False)
+    try:
+        server_thread.join()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+        server.shutdown()
 
 if __name__ == "__main__":
     main()