From f64d56bcd819cb3fa7c8dd633eb493d00877d753 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 May 2026 20:47:08 +0300 Subject: [PATCH] llama-server-simulator : replace Flask with stdlib http.server - Use HTTPServer + BaseHTTPRequestHandler instead of Flask - RequestHandler handles POST /v1/chat/completions - Server runs in daemon thread with clean Ctrl+C shutdown - Remove flask and unused asdict imports Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-server-simulator.py | 64 ++++++++++++++----- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py index 210683953e..5554132dee 100755 --- a/examples/llama-eval/llama-server-simulator.py +++ b/examples/llama-eval/llama-server-simulator.py @@ -7,12 +7,13 @@ import re import time import sys import os +import threading +from http.server import HTTPServer, BaseHTTPRequestHandler from typing import Dict, List, Optional -from dataclasses import dataclass, asdict +from dataclasses import dataclass from pathlib import Path import datasets -from flask import Flask, request, jsonify # Set cache directory for HuggingFace datasets cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" @@ -48,7 +49,7 @@ def debug_log(message: str): with open("/tmp/simulator-debug.log", "a") as f: f.write(message + "\n") -app = Flask(__name__) +simulator: Optional["Simulator"] = None @dataclass class EvalState: @@ -216,21 +217,46 @@ class Simulator: return response -@app.route('/v1/chat/completions', methods=['POST']) -def chat_completions(): - try: - request_data = request.get_json() +class RequestHandler(BaseHTTPRequestHandler): + def do_POST(self): + if self.path != "/v1/chat/completions": + self._send_json({"error": "Not found"}, 404) + return - if not request_data: - return jsonify({"error": "Invalid JSON"}), 400 + try: + content_length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(content_length) + request_data = json.loads(body) if body else None - response = simulator._process_request(request_data) + if not request_data: + self._send_json({"error": "Invalid JSON"}, 400) + return - return jsonify(response) + if simulator is None: + self._send_json({"error": "Simulator not initialized"}, 500) + return + + response = simulator._process_request(request_data) + self._send_json(response, 200) + + except json.JSONDecodeError: + self._send_json({"error": "Invalid JSON"}, 400) + except Exception as e: + print(f"Error processing request: {e}") + self._send_json({"error": str(e)}, 500) + + def _send_json(self, data: dict, status: int = 200): + body = json.dumps(data).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + # Suppress default request logging + pass - except Exception as e: - print(f"Error processing request: {e}") - return jsonify({"error": str(e)}), 500 def main(): parser = argparse.ArgumentParser( @@ -271,13 +297,21 @@ def main(): dataset_split=args.dataset_split ) + server = HTTPServer((args.host, args.port), RequestHandler) + server_thread = threading.Thread(target=server.serve_forever, daemon=True) + server_thread.start() + print("\n=== llama-server-simulator ===") print(f"Server running on http://{args.host}:{args.port}") print(f"Success rate: {args.success_rate}") print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") print("\nPress Ctrl+C to stop\n") - app.run(host=args.host, port=args.port, debug=False) + try: + server_thread.join() + except KeyboardInterrupt: + print("\nShutting down...") + server.shutdown() if __name__ == "__main__": main()