From d26b1ffcc9ece54cc69ce7be966a4d09f475e1f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 May 2026 19:23:30 +0300 Subject: [PATCH] llama-eval : rename display, escaped, and count variables to use prefix convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _display suffix → display_ prefix (answer, tokens, tps, t_gen) - _escaped suffix → escaped_ prefix (response, prompt, reasoning) - _count suffix → n_ prefix (correct, incorrect, pending) Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-eval.py | 48 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index 8b47371cb9..9c9b337933 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -226,19 +226,19 @@ class EvalState: self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) - def print_progress(self, task_state: TaskState, total_tasks: int, correct_count: int = 0): - answer_display = task_state.answer if task_state.answer else "N/A" - tokens_display = str(task_state.tokens) if task_state.tokens is not None else "N/A" - tps_display = f"{task_state.tps_gen:.1f}" if task_state.tps_gen is not None else "N/A" - t_gen_display = f"{task_state.t_gen_ms/1000:.1f}" if task_state.t_gen_ms is not None else "N/A" - success_ratio = correct_count / self.processed if self.processed > 0 else 0.0 + def print_progress(self, task_state: TaskState, total_tasks: int, n_correct: int = 0): + display_answer = task_state.answer if task_state.answer else "N/A" + display_tokens = str(task_state.tokens) if task_state.tokens is not None else "N/A" + display_tps = f"{task_state.tps_gen:.1f}" if task_state.tps_gen is not None else "N/A" + display_t_gen = f"{task_state.t_gen_ms/1000:.1f}" if task_state.t_gen_ms is not None else "N/A" + success_ratio = n_correct / self.processed if self.processed > 0 else 0.0 first_line = task_state.question_text.split('\n')[0] truncated_question = first_line[:43] if len(first_line) > 43: truncated_question += "..." else: truncated_question = truncated_question.ljust(43) + "..." - print(f"{self.processed:3}/{total_tasks:3} {task_state.task_id:<20} {self.dataset_type.upper()} {truncated_question:<40} {task_state.expected:<10} {answer_display:<10} {tokens_display:<6} {tps_display:<6} {t_gen_display:<8} {'✓' if task_state.correct else '✗'} [{correct_count:3}/{self.processed:3}, {success_ratio:.3f}]") + print(f"{self.processed:3}/{total_tasks:3} {task_state.task_id:<20} {self.dataset_type.upper()} {truncated_question:<40} {task_state.expected:<10} {display_answer:<10} {display_tokens:<6} {display_tps:<6} {display_t_gen:<8} {'✓' if task_state.correct else '✗'} [{n_correct:3}/{self.processed:3}, {success_ratio:.3f}]") def print_summary(self): if self.total == 0: @@ -299,11 +299,11 @@ class EvalState: cases = all_cases completed = {tid: c for tid, c in cases.items() if c.get("status") == "ok"} - correct_count = sum(1 for c in completed.values() if c.get("correct", False)) - incorrect_count = len(completed) - correct_count - pending_count = len(tasks_to_save) - len(completed) - accuracy = correct_count / len(completed) * 100 if completed else 0.0 - ci_lower, ci_upper = wilson_interval(correct_count, len(completed)) if completed else (0.0, 1.0) + n_correct = sum(1 for c in completed.values() if c.get("correct", False)) + n_incorrect = len(completed) - n_correct + n_pending = len(tasks_to_save) - len(completed) + accuracy = n_correct / len(completed) * 100 if completed else 0.0 + ci_lower, ci_upper = wilson_interval(n_correct, len(completed)) if completed else (0.0, 1.0) sampling_parts = [] for k, v in self.sampling_config.items(): @@ -340,9 +340,9 @@ class EvalState: t_gen_str = f"{t_gen_ms/1000:.1f}" if t_gen_ms is not None else "" reasoning_content = case.get("reasoning_content", "") or "" - response_escaped = self._escape_html(response) - prompt_escaped = self._escape_html(prompt) - reasoning_escaped = self._escape_html(reasoning_content) + escaped_response = self._escape_html(response) + escaped_prompt = self._escape_html(prompt) + escaped_reasoning = self._escape_html(reasoning_content) grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) rows.append(f""" @@ -358,11 +358,11 @@ class EvalState:

Prompt

-
{prompt_escaped}
+
{escaped_prompt}

Reasoning ▶

- +

Response

-
{response_escaped}
+
{escaped_response}

Grader Log

{grader_log_str}
@@ -407,9 +407,9 @@ class EvalState: Dataset{self.dataset_type} Total Tasks{len(tasks_to_save)} Completed{len(completed)} - Correct{correct_count} - Incorrect{incorrect_count} - Pending{pending_count} + Correct{n_correct} + Incorrect{n_incorrect} + Pending{n_pending} Accuracy{accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%] Total Time{self.total_time:.1f}s Sampling{sampling_str} @@ -1049,7 +1049,7 @@ class Processor: print(f"Sampling: temp={eval_state.sampling_config.get('temperature', 'skip')}, top-k={eval_state.sampling_config.get('top_k', 'skip')}, top-p={eval_state.sampling_config.get('top_p', 'skip')}, min-p={eval_state.sampling_config.get('min_p', 'skip')}") print() - correct_count = 0 + n_correct = 0 with ThreadPoolExecutor(max_workers=self.threads) as executor: futures = { @@ -1062,12 +1062,12 @@ class Processor: task_state = future.result() eval_state.processed += 1 if task_state.correct: - correct_count += 1 + n_correct += 1 elapsed = time.time() - start_time eval_state.total_time += elapsed session_time += elapsed start_time = time.time() - eval_state.print_progress(task_state, total_tasks, correct_count) + eval_state.print_progress(task_state, total_tasks, n_correct) if verbose: print(f"\nCase {eval_state.processed}: {task_state.correct}")