RC: (add) python script files

This commit is contained in:
2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions

276
runner.py Normal file
View File

@@ -0,0 +1,276 @@
"""
benchmark_v4/runner.py
======================
Executes models via Ollama CLI and orchestrates the benchmark loop.
Handles: warmup, GPU polling, cooldown, multi-run variance.
"""
import subprocess
import time
import re
import statistics
from datetime import datetime
from config import (
COOLDOWN_SECONDS, GPU_POLL_EVERY,
TEST_WEIGHTS, CATEGORIES,
)
from prompts import ALL_TESTS
from validators import normalize_text
from judge import warmup_judge
from scoring import (
score_test, compute_weighted, compute_category_scores,
compute_compliance, compute_variance_stats
)
from storage import insert_run, insert_details, insert_variance
# ============================================
# GPU MONITORING
# ============================================
# Most recent successfully parsed GPU sample; -1 means "no reading yet".
_gpu_cache = {"temp": -1, "mem": -1, "util": -1, "clock": -1}
# Number of get_gpu() calls so far; drives the polling cadence.
_gpu_poll_count = 0


def get_gpu(force=False):
    """Return GPU stats, querying nvidia-smi only every GPU_POLL_EVERY calls.

    The very first call always queries so that early callers do not receive
    the -1 placeholders. Between polls the cached sample is returned.

    Args:
        force: When true, query nvidia-smi regardless of the polling cadence.

    Returns:
        dict with integer ``temp``, ``mem``, ``util`` and ``clock`` entries
        (nvidia-smi ``temperature.gpu``, ``memory.used``, ``utilization.gpu``
        and ``clocks.sm``, units stripped). If a query fails, the previous
        sample (initially all -1) is returned unchanged.
    """
    global _gpu_cache, _gpu_poll_count
    _gpu_poll_count += 1
    first_call = _gpu_poll_count == 1
    throttled = GPU_POLL_EVERY > 0 and _gpu_poll_count % GPU_POLL_EVERY != 0
    if throttled and not (force or first_call):
        return _gpu_cache
    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=temperature.gpu,memory.used,utilization.gpu,clocks.sm",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        # nvidia-smi prints one CSV row per GPU; sample the first device so
        # multi-GPU hosts do not break the four-way unpack below.
        first_row = result.stdout.strip().splitlines()[0]
        temp, mem, util, clock = (int(field) for field in first_row.split(","))
        _gpu_cache = {
            "temp": temp, "mem": mem,
            "util": util, "clock": clock
        }
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Best-effort monitoring: a missing binary, a timeout, empty output
        # or an unparsable field (e.g. "[N/A]") keeps the previous sample.
        pass
    return _gpu_cache
# ============================================
# PARSE OLLAMA VERBOSE
# ============================================
def parse_generation_speed(output):
    """Extract the generation (eval) rate in tokens/s from Ollama verbose output.

    The value on a line starting with ``eval rate:`` is preferred, which
    excludes the ``prompt eval rate:`` line. If no such line is found, the
    last ``<number> tokens/s`` occurrence anywhere in the text is used.

    Args:
        output: Combined text captured from an ``ollama run --verbose`` call.

    Returns:
        The rate as a float, or None when ``output`` is empty or contains no
        ``tokens/s`` figure.
    """
    if not output:
        return None
    labelled = re.findall(
        r'^[ \t]*eval rate:\s+(\d+(?:\.\d+)?)\s+tokens/s',
        output,
        flags=re.MULTILINE,
    )
    if labelled:
        return float(labelled[-1])
    # Fallback: the stats line may carry a prefix on the same line, so take
    # the last tokens/s figure wherever it appears.
    matches = re.findall(r'(\d+(?:\.\d+)?)\s+tokens/s', output)
    return float(matches[-1]) if matches else None
# ============================================
# RUN SINGLE MODEL + PROMPT
# ============================================
def run_model(model, prompt):
    """Run one prompt through a model via the Ollama CLI and collect metrics.

    Args:
        model: Model name passed to ``ollama run``.
        prompt: Prompt text passed as a single CLI argument.

    Returns:
        dict with:
            ``output``   - stdout and stderr joined by a newline (stderr is
                           appended last, so the verbose stats end the text),
            ``time``     - elapsed seconds for the CLI call, rounded to 2 dp,
            ``tok_s``    - generation rate parsed from the output, or None,
            ``gpu_temp``, ``gpu_mem``, ``gpu_util``, ``gpu_clock`` - the GPU
                           sample returned by get_gpu() after the call.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during a long generation.
    start = time.perf_counter()
    result = subprocess.run(
        ["ollama", "run", model, prompt, "--verbose"],
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8 and replace undecodable bytes instead of
        # relying on the locale codec, which may raise on non-UTF-8 locales.
        encoding="utf-8",
        errors="replace",
    )
    elapsed = round(time.perf_counter() - start, 2)
    gpu = get_gpu()
    # NOTE(review): "output" also contains the stderr stats, so downstream
    # scoring and length measurements see them too - confirm this is intended.
    output = result.stdout + "\n" + result.stderr
    return {
        "output": output,
        "time": elapsed,
        "tok_s": parse_generation_speed(output),
        "gpu_temp": gpu["temp"],
        "gpu_mem": gpu["mem"],
        "gpu_util": gpu["util"],
        "gpu_clock": gpu["clock"]
    }
# ============================================
# BENCHMARK A GROUP OF MODELS
# ============================================
def _build_variance_rows(run_date, model, num_runs, sem_by_test):
    """Build one variance row per test that has at least two semantic scores."""
    rows = []
    for test_name, scores_list in sem_by_test.items():
        if len(scores_list) < 2:
            # statistics.stdev needs at least two data points.
            continue
        failures = sum(1 for s in scores_list if s <= 2)
        rows.append({
            "run_date": run_date,
            "model": model,
            "test": test_name,
            "num_runs": num_runs,
            "mean": round(statistics.mean(scores_list), 2),
            "stdev": round(statistics.stdev(scores_list), 2),
            "min_score": min(scores_list),
            "max_score": max(scores_list),
            "failure_rate_pct": round(failures / len(scores_list) * 100, 1),
            "scores_raw": str(scores_list),
        })
    return rows


def run_benchmark(
    models,
    label,
    is_baseline,
    all_prompts,
    num_runs=1,
    no_cooldown=False,
    model_cooldown_seconds=30
):
    """
    Run the benchmark for a list of models and persist the results.

    For each model: warm it up, run every test in ALL_TESTS ``num_runs``
    times, score each answer, print a summary, then store the run row, the
    per-test detail rows and (for multi-run benchmarks) the variance rows.

    Args:
        models: Iterable of model names to benchmark.
        label: Group label; printed and stored in the ``type`` column.
        is_baseline: Truthy when this group is the baseline (stored as 1/0).
        all_prompts: Mapping of test name to prompt text. Tests whose prompt
            is missing or blank are skipped.
        num_runs: Number of passes over the test list per model.
        no_cooldown: When true, skip both the per-test cooldown and the
            cooldown between models.
        model_cooldown_seconds: Pause after each model when cooldowns are
            enabled.

    Returns:
        List of run ids as returned by insert_run, one per model.
    """
    run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    run_ids = []
    for model in models:
        # Accumulate across runs
        sem_by_test = {t: [] for t in ALL_TESTS}
        fmt_by_test = {t: [] for t in ALL_TESTS}
        tok_s_all = []
        temp_all = []
        detail_rows = []
        print(f"\n[{label}] Model: {model} ({num_runs} run{'s' if num_runs > 1 else ''})")
        # Warmup: the reply is discarded, so do not capture or decode it.
        subprocess.run(
            ["ollama", "run", model, "hello"],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        time.sleep(5)
        warmup_judge()
        for run_num in range(1, num_runs + 1):
            if num_runs > 1:
                print(f"\n  ── Run {run_num}/{num_runs} ──")
            for test_name in ALL_TESTS:
                prompt = all_prompts.get(test_name, "")
                if not prompt or not prompt.strip():
                    continue
                result = run_model(model, prompt)
                scores = score_test(test_name, prompt, result["output"])
                sem = scores["semantic_score"]
                fmt = scores["format_score"]
                sem_by_test[test_name].append(sem)
                fmt_by_test[test_name].append(fmt)
                if result["tok_s"]:
                    tok_s_all.append(result["tok_s"])
                # Non-positive temperatures are the "no reading" placeholder.
                if result["gpu_temp"] > 0:
                    temp_all.append(result["gpu_temp"])
                flag = "J" if scores["used_judge"] else "V"
                print(
                    f"  [{run_num}] {test_name:<22} [{flag}] "
                    f"sem={sem:>2}/10 fmt={fmt:>2}/10 "
                    f"comb={scores['combined_score']:>5.2f} "
                    f"{scores['notes'][:52]}"
                )
                detail_rows.append({
                    "run_date": run_date,
                    "run_num": run_num,
                    "model": model,
                    "type": label,
                    "is_baseline": 1 if is_baseline else 0,
                    "test": test_name,
                    "weight": TEST_WEIGHTS.get(test_name, 0),
                    "time_s": result["time"],
                    "tok_s": result["tok_s"],
                    "gpu_temp": result["gpu_temp"],
                    "gpu_mem": result["gpu_mem"],
                    "gpu_util": result["gpu_util"],
                    "gpu_clock": result["gpu_clock"],
                    "output_length": len(result["output"]),
                    "semantic_score": sem,
                    "format_score": fmt,
                    "combined_score": scores["combined_score"],
                    "used_judge": 1 if scores["used_judge"] else 0,
                    "notes": scores["notes"][:120],
                })
                if not no_cooldown:
                    time.sleep(COOLDOWN_SECONDS)
        # Aggregate
        avg_sem = {t: round(statistics.mean(v), 2) for t, v in sem_by_test.items() if v}
        avg_fmt = {t: round(statistics.mean(v), 2) for t, v in fmt_by_test.items() if v}
        w_total, w_avg = compute_weighted(avg_sem)
        cat_scores = compute_category_scores(avg_sem)
        compliance = compute_compliance(sem_by_test)
        var_stats = compute_variance_stats(sem_by_test)
        # Guard on the flattened scores: the dict always has one key per test,
        # so it is truthy even when no test actually ran.
        all_fmt = [s for v in fmt_by_test.values() for s in v]
        fmt_avg = round(statistics.mean(all_fmt), 2) if all_fmt else 0
        avg_tok = round(statistics.mean(tok_s_all), 1) if tok_s_all else 0
        avg_tmp = round(statistics.mean(temp_all), 1) if temp_all else 0
        print(f"\n  ─── {model} ───")
        print(f"  Weighted avg: {w_avg} (total={w_total})")
        print(f"  Format avg: {fmt_avg}/10")
        print(f"  Variance: mean={var_stats['mean']} σ={var_stats['stdev']} failures={var_stats['failure_rate']}%")
        print(f"  Compliance: JSON={compliance.get('json_valid')}% YAML={compliance.get('yaml_valid')}% "
              f"tool={compliance.get('tool_format')}% hall={compliance.get('hallucination_free')}%")
        print(f"  Categories: agent={cat_scores.get('agent_tool')} coding={cat_scores.get('coding')} "
              f"rag={cat_scores.get('rag_context')} struct={cat_scores.get('structured')} "
              f"hall={cat_scores.get('hallucination')} reason={cat_scores.get('reasoning')}")
        print(f"  tok/s={avg_tok} temp={avg_tmp}°C")
        # Save to DB
        run_row = {
            "run_date": run_date,
            "model": model,
            "type": label,
            "is_baseline": 1 if is_baseline else 0,
            "num_runs": num_runs,
            "weighted_total": w_total,
            "weighted_avg": w_avg,
            "avg_format": fmt_avg,
            "mean_all": var_stats["mean"],
            "stdev_all": var_stats["stdev"],
            "min_score": var_stats["min"],
            "max_score": var_stats["max"],
            "failure_rate_pct": var_stats["failure_rate"],
            "compliance_json": compliance.get("json_valid"),
            "compliance_yaml": compliance.get("yaml_valid"),
            "compliance_tool": compliance.get("tool_format"),
            "compliance_hall": compliance.get("hallucination_free"),
            "cat_agent_tool": cat_scores.get("agent_tool"),
            "cat_coding": cat_scores.get("coding"),
            "cat_rag_context": cat_scores.get("rag_context"),
            "cat_structured": cat_scores.get("structured"),
            "cat_hallucination": cat_scores.get("hallucination"),
            "cat_reasoning": cat_scores.get("reasoning"),
            "avg_tok_s": avg_tok,
            "avg_gpu_temp": avg_tmp,
            "tests_run": len(avg_sem) * num_runs,
        }
        run_id = insert_run(run_row)
        insert_details(run_id, detail_rows)
        # Variance rows (only if multiple runs)
        if num_runs > 1:
            var_rows = _build_variance_rows(run_date, model, num_runs, sem_by_test)
            if var_rows:
                insert_variance(var_rows)
        run_ids.append(run_id)
        # Honour no_cooldown here as well, matching the per-test cooldown.
        if not no_cooldown:
            print(f"\nCooldown after {model}...\n")
            time.sleep(model_cooldown_seconds)
    return run_ids