RC: (add) python script files
This commit is contained in:
276
runner.py
Normal file
276
runner.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
benchmark_v4/runner.py
|
||||
======================
|
||||
Executes models via Ollama CLI and orchestrates the benchmark loop.
|
||||
Handles: warmup, GPU polling, cooldown, multi-run variance.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
import re
|
||||
import statistics
|
||||
from datetime import datetime
|
||||
|
||||
from config import (
|
||||
COOLDOWN_SECONDS, GPU_POLL_EVERY,
|
||||
TEST_WEIGHTS, CATEGORIES,
|
||||
)
|
||||
from prompts import ALL_TESTS
|
||||
from validators import normalize_text
|
||||
from judge import warmup_judge
|
||||
from scoring import (
|
||||
score_test, compute_weighted, compute_category_scores,
|
||||
compute_compliance, compute_variance_stats
|
||||
)
|
||||
from storage import insert_run, insert_details, insert_variance
|
||||
|
||||
|
||||
# ============================================
|
||||
# GPU MONITORING
|
||||
# ============================================
|
||||
|
||||
# Most recent successfully parsed GPU sample; -1 means "no reading yet".
_gpu_cache = {"temp": -1, "mem": -1, "util": -1, "clock": -1}
# Number of get_gpu() calls so far; drives the polling cadence.
_gpu_poll_count = 0


def get_gpu(force=False):
    """
    Return GPU stats as a dict with the keys temp, mem, util and clock.

    nvidia-smi is queried on the first call, whenever ``force`` is true,
    whenever GPU_POLL_EVERY is not positive, and otherwise on every
    GPU_POLL_EVERY-th call. All other calls return the cached sample.

    Polling is best-effort: if nvidia-smi cannot be started, times out, or
    prints something that cannot be parsed, the previous sample (initially
    all -1) is returned unchanged.
    """
    global _gpu_cache, _gpu_poll_count

    _gpu_poll_count += 1
    # Always poll on the very first call so early callers do not receive the
    # -1 placeholders just because the counter has not reached the cadence yet.
    first_call = _gpu_poll_count == 1
    if (
        not force
        and not first_call
        and GPU_POLL_EVERY > 0
        and _gpu_poll_count % GPU_POLL_EVERY != 0
    ):
        return _gpu_cache

    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=temperature.gpu,memory.used,utilization.gpu,clocks.sm",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        # nvidia-smi prints one CSV row per GPU; only the first row is used so
        # multi-GPU output no longer breaks the four-way unpack below.
        rows = result.stdout.strip().splitlines()
        if rows:
            temp, mem, util, clock = (
                field.strip() for field in rows[0].split(",")
            )
            _gpu_cache = {
                "temp": int(temp), "mem": int(mem),
                "util": int(util), "clock": int(clock)
            }
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: nvidia-smi missing or not executable.
        # SubprocessError: the 5 second timeout expired.
        # ValueError: wrong column count or a non-integer field.
        # In every case the previous sample is kept.
        pass

    return _gpu_cache
|
||||
|
||||
|
||||
# ============================================
|
||||
# PARSE OLLAMA VERBOSE
|
||||
# ============================================
|
||||
|
||||
def parse_generation_speed(output):
    """
    Extract the generation (eval) rate from Ollama verbose output.

    Scans ``output`` for every "<decimal> tokens/s" figure and returns the
    final one as a float, since the last rate printed is the generation
    rate. Returns None when no such figure is present.
    """
    last_rate = None
    for hit in re.finditer(r'(\d+\.\d+)\s+tokens/s', output):
        last_rate = hit.group(1)

    if last_rate is None:
        return None
    return float(last_rate)
|
||||
|
||||
|
||||
# ============================================
|
||||
# RUN SINGLE MODEL + PROMPT
|
||||
# ============================================
|
||||
|
||||
def run_model(model, prompt):
    """
    Run one prompt through a model via ``ollama run ... --verbose``.

    Returns a dict with:
        output    -- stdout and stderr of the CLI call joined by a newline
        time      -- elapsed seconds for the CLI call, rounded to 2 decimals
        tok_s     -- last "tokens/s" figure found in ``output``, or None
        gpu_temp, gpu_mem, gpu_util, gpu_clock -- values from get_gpu(),
                     sampled right after the CLI call returns
    """
    # perf_counter is monotonic, so system clock adjustments during a long
    # generation cannot distort the measured duration.
    start = time.perf_counter()
    result = subprocess.run(
        ["ollama", "run", model, prompt, "--verbose"],
        capture_output=True,
        # Decode explicitly instead of relying on the locale encoding, and
        # replace undecodable bytes rather than raising mid-benchmark.
        # NOTE(review): assumes the CLI emits UTF-8 -- confirm on Windows.
        encoding="utf-8", errors="replace",
    )
    elapsed = round(time.perf_counter() - start, 2)
    gpu = get_gpu()
    # Both streams are kept: parse_generation_speed scans the combined text.
    output = result.stdout + "\n" + result.stderr

    return {
        "output": output,
        "time": elapsed,
        "tok_s": parse_generation_speed(output),
        "gpu_temp": gpu["temp"],
        "gpu_mem": gpu["mem"],
        "gpu_util": gpu["util"],
        "gpu_clock": gpu["clock"]
    }
|
||||
|
||||
|
||||
# ============================================
|
||||
# BENCHMARK A GROUP OF MODELS
|
||||
# ============================================
|
||||
|
||||
def run_benchmark(
    models,
    label,
    is_baseline,
    all_prompts,
    num_runs=1,
    no_cooldown=False
):
    """
    Run the benchmark for a list of models and persist the results.

    For each model: warm it up, run every test in ALL_TESTS ``num_runs``
    times, score each output, aggregate the scores, and store one run row,
    the per-test detail rows and (for multi-run benchmarks) per-test
    variance rows.

    Args:
        models: iterable of model names passed to ``ollama run``.
        label: group label; printed and stored as "type" in the rows.
        is_baseline: truthy to store is_baseline=1, otherwise 0.
        all_prompts: mapping of test name to prompt text. Tests whose
            prompt is missing, empty or whitespace-only are skipped.
        num_runs: how many times the full test list is repeated per model.
        no_cooldown: when true, skip the COOLDOWN_SECONDS pause after
            each test. The fixed pause after each model is not affected.

    Returns:
        List of run ids as returned by insert_run, one per model.
    """
    run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    run_ids = []

    for model in models:
        # Accumulate across runs
        sem_by_test = {t: [] for t in ALL_TESTS}
        fmt_by_test = {t: [] for t in ALL_TESTS}
        tok_s_all = []
        temp_all = []
        detail_rows = []

        print(f"\n[{label}] Model: {model} ({num_runs} run{'s' if num_runs > 1 else ''})")

        # Warmup: one throwaway prompt, then warm up the judge as well.
        subprocess.run(
            ["ollama", "run", model, "hello"],
            capture_output=True, text=True
        )
        time.sleep(5)
        warmup_judge()

        for run_num in range(1, num_runs + 1):
            if num_runs > 1:
                print(f"\n ── Run {run_num}/{num_runs} ──")

            for test_name in ALL_TESTS:
                prompt = all_prompts.get(test_name, "")
                if not prompt or not prompt.strip():
                    continue

                result = run_model(model, prompt)
                scores = score_test(test_name, prompt, result["output"])

                sem = scores["semantic_score"]
                fmt = scores["format_score"]

                sem_by_test[test_name].append(sem)
                fmt_by_test[test_name].append(fmt)

                # Only real readings feed the averages: tok_s is None when no
                # rate was parsed, and gpu_temp is -1 when no sample exists.
                if result["tok_s"]:
                    tok_s_all.append(result["tok_s"])
                if result["gpu_temp"] > 0:
                    temp_all.append(result["gpu_temp"])

                flag = "J" if scores["used_judge"] else "V"
                print(
                    f" [{run_num}] {test_name:<22} [{flag}] "
                    f"sem={sem:>2}/10 fmt={fmt:>2}/10 "
                    f"comb={scores['combined_score']:>5.2f} "
                    f"{scores['notes'][:52]}"
                )

                detail_rows.append({
                    "run_date": run_date,
                    "run_num": run_num,
                    "model": model,
                    "type": label,
                    "is_baseline": 1 if is_baseline else 0,
                    "test": test_name,
                    "weight": TEST_WEIGHTS.get(test_name, 0),
                    "time_s": result["time"],
                    "tok_s": result["tok_s"],
                    "gpu_temp": result["gpu_temp"],
                    "gpu_mem": result["gpu_mem"],
                    "gpu_util": result["gpu_util"],
                    "gpu_clock": result["gpu_clock"],
                    "output_length": len(result["output"]),
                    "semantic_score": sem,
                    "format_score": fmt,
                    "combined_score": scores["combined_score"],
                    "used_judge": 1 if scores["used_judge"] else 0,
                    "notes": scores["notes"][:120],
                })

                # Pause between tests unless the caller disabled it.
                if not no_cooldown:
                    time.sleep(COOLDOWN_SECONDS)

        # Aggregate
        avg_sem = {t: round(statistics.mean(v), 2) for t, v in sem_by_test.items() if v}
        avg_fmt = {t: round(statistics.mean(v), 2) for t, v in fmt_by_test.items() if v}
        w_total, w_avg = compute_weighted(avg_sem)
        cat_scores = compute_category_scores(avg_sem)
        compliance = compute_compliance(sem_by_test)
        var_stats = compute_variance_stats(sem_by_test)
        # Guard on the flattened scores, not on the dict: the dict has a key
        # for every test even when all of them were skipped, and
        # statistics.mean raises on an empty sequence.
        all_fmt = [s for v in fmt_by_test.values() for s in v]
        fmt_avg = round(statistics.mean(all_fmt), 2) if all_fmt else 0
        avg_tok = round(statistics.mean(tok_s_all), 1) if tok_s_all else 0
        avg_tmp = round(statistics.mean(temp_all), 1) if temp_all else 0

        print(f"\n ─── {model} ───")
        print(f" Weighted avg: {w_avg} (total={w_total})")
        print(f" Format avg: {fmt_avg}/10")
        print(f" Variance: mean={var_stats['mean']} σ={var_stats['stdev']} failures={var_stats['failure_rate']}%")
        print(f" Compliance: JSON={compliance.get('json_valid')}% YAML={compliance.get('yaml_valid')}% "
              f"tool={compliance.get('tool_format')}% hall={compliance.get('hallucination_free')}%")
        print(f" Categories: agent={cat_scores.get('agent_tool')} coding={cat_scores.get('coding')} "
              f"rag={cat_scores.get('rag_context')} struct={cat_scores.get('structured')} "
              f"hall={cat_scores.get('hallucination')} reason={cat_scores.get('reasoning')}")
        print(f" tok/s={avg_tok} temp={avg_tmp}°C")

        # Save to DB
        run_row = {
            "run_date": run_date,
            "model": model,
            "type": label,
            "is_baseline": 1 if is_baseline else 0,
            "num_runs": num_runs,
            "weighted_total": w_total,
            "weighted_avg": w_avg,
            "avg_format": fmt_avg,
            "mean_all": var_stats["mean"],
            "stdev_all": var_stats["stdev"],
            "min_score": var_stats["min"],
            "max_score": var_stats["max"],
            "failure_rate_pct": var_stats["failure_rate"],
            "compliance_json": compliance.get("json_valid"),
            "compliance_yaml": compliance.get("yaml_valid"),
            "compliance_tool": compliance.get("tool_format"),
            "compliance_hall": compliance.get("hallucination_free"),
            "cat_agent_tool": cat_scores.get("agent_tool"),
            "cat_coding": cat_scores.get("coding"),
            "cat_rag_context": cat_scores.get("rag_context"),
            "cat_structured": cat_scores.get("structured"),
            "cat_hallucination": cat_scores.get("hallucination"),
            "cat_reasoning": cat_scores.get("reasoning"),
            "avg_tok_s": avg_tok,
            "avg_gpu_temp": avg_tmp,
            "tests_run": len(avg_sem) * num_runs,
        }

        run_id = insert_run(run_row)
        insert_details(run_id, detail_rows)

        # Variance rows (only if multiple runs)
        if num_runs > 1:
            var_rows = []
            for test_name, scores_list in sem_by_test.items():
                # stdev needs at least two samples.
                if len(scores_list) > 1:
                    var_rows.append({
                        "run_date": run_date,
                        "model": model,
                        "test": test_name,
                        "num_runs": num_runs,
                        "mean": round(statistics.mean(scores_list), 2),
                        "stdev": round(statistics.stdev(scores_list), 2),
                        "min_score": min(scores_list),
                        "max_score": max(scores_list),
                        # A semantic score of 2 or lower counts as a failure.
                        "failure_rate_pct": round(
                            sum(1 for s in scores_list if s <= 2) / len(scores_list) * 100, 1
                        ),
                        "scores_raw": str(scores_list),
                    })
            if var_rows:
                insert_variance(var_rows)

        run_ids.append(run_id)

        # Fixed pause after every model; not controlled by no_cooldown.
        print(f"\nCooldown after {model}...\n")
        time.sleep(30)

    return run_ids
|
||||
Reference in New Issue
Block a user