RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/runner.py
+++ b/runner.py
@@ -0,0 +1,276 @@
+"""
+benchmark_v4/runner.py
+======================
+Executes models via Ollama CLI and orchestrates the benchmark loop.
+Handles: warmup, GPU polling, cooldown, multi-run variance.
+"""
+
+import subprocess
+import time
+import re
+import statistics
+from datetime import datetime
+
+from config import (
+    COOLDOWN_SECONDS, GPU_POLL_EVERY,
+    TEST_WEIGHTS, CATEGORIES,
+)
+from prompts import ALL_TESTS
+from validators import normalize_text
+from judge import warmup_judge
+from scoring import (
+    score_test, compute_weighted, compute_category_scores,
+    compute_compliance, compute_variance_stats
+)
+from storage import insert_run, insert_details, insert_variance
+
+
+# ============================================
+# GPU MONITORING
+# ============================================
+
+# Last successful GPU reading; every value stays at -1 until a poll succeeds.
+_gpu_cache      = {"temp": -1, "mem": -1, "util": -1, "clock": -1}
+# Number of get_gpu() calls so far; drives the "poll every N calls" schedule.
+_gpu_poll_count = 0
+
+
+def get_gpu(force=False):
+    """
+    Return the latest GPU stats, querying nvidia-smi only when a poll is due.
+
+    A poll is performed on the first call, on every GPU_POLL_EVERY-th call,
+    on every call when GPU_POLL_EVERY <= 0, or whenever ``force`` is true.
+    All other calls return the cached reading to reduce overhead.
+
+    Args:
+        force: Poll now regardless of the schedule.
+
+    Returns:
+        The module-level cache dict with integer "temp", "mem", "util" and
+        "clock" entries (nvidia-smi values with units stripped). When several
+        GPUs are reported, only the first row is used. A failed poll leaves
+        the previous reading (initially all -1) untouched.
+    """
+    global _gpu_cache, _gpu_poll_count
+
+    _gpu_poll_count += 1
+    # The "<= 0" test must come before the modulo so a zero interval never divides.
+    poll_due = (
+        force
+        or GPU_POLL_EVERY <= 0
+        or _gpu_poll_count == 1          # prime the cache instead of reporting -1
+        or _gpu_poll_count % GPU_POLL_EVERY == 0
+    )
+    if not poll_due:
+        return _gpu_cache
+
+    try:
+        result = subprocess.run(
+            ["nvidia-smi",
+             "--query-gpu=temperature.gpu,memory.used,utilization.gpu,clocks.sm",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        rows = result.stdout.strip().splitlines()
+        if result.returncode == 0 and rows:
+            # nvidia-smi prints one CSV row per GPU; read the first one.
+            temp, mem, util, clock = (
+                int(field.strip()) for field in rows[0].split(",")
+            )
+            _gpu_cache = {
+                "temp": temp, "mem": mem,
+                "util": util, "clock": clock
+            }
+    except (OSError, subprocess.SubprocessError, ValueError):
+        # Best effort: nvidia-smi missing, timed out, or printed something
+        # unparsable (e.g. "[N/A]"). Keep the previous reading.
+        pass
+
+    return _gpu_cache
+
+
+# ============================================
+# PARSE OLLAMA VERBOSE
+# ============================================
+
+def parse_generation_speed(output):
+    """
+    Extract the generation (eval) rate from Ollama verbose output.
+
+    Scans ``output`` for every "<decimal> tokens/s" figure and returns the
+    final one as a float, or None when no such figure is present.
+    """
+    last_hit = None
+    for last_hit in re.finditer(r'(\d+\.\d+)\s+tokens/s', output):
+        pass  # only the final occurrence matters
+
+    if last_hit is None:
+        return None
+    return float(last_hit.group(1))
+
+
+# ============================================
+# RUN SINGLE MODEL + PROMPT
+# ============================================
+
+def run_model(model, prompt):
+    """
+    Execute one prompt against a model via the Ollama CLI.
+
+    Args:
+        model:  Model name passed to ``ollama run``.
+        prompt: Prompt text passed as a single CLI argument.
+
+    Returns:
+        A dict with:
+          "output"    - stdout, a newline, then stderr of the CLI call
+          "time"      - elapsed seconds for the call, rounded to 2 decimals
+          "tok_s"     - generation speed parsed from the output, or None
+          "gpu_temp", "gpu_mem", "gpu_util", "gpu_clock" - values from get_gpu()
+    """
+    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
+    start  = time.monotonic()
+    result = subprocess.run(
+        ["ollama", "run", model, prompt, "--verbose"],
+        capture_output=True, text=True,
+        # Decode explicitly instead of relying on the locale encoding, and
+        # never fail the run because of a single undecodable byte.
+        encoding="utf-8", errors="replace"
+    )
+    elapsed = round(time.monotonic() - start, 2)
+    gpu     = get_gpu()
+    # stderr goes last so the verbose stats follow the model's answer.
+    output  = result.stdout + "\n" + result.stderr
+
+    return {
+        "output":    output,
+        "time":      elapsed,
+        "tok_s":     parse_generation_speed(output),
+        "gpu_temp":  gpu["temp"],
+        "gpu_mem":   gpu["mem"],
+        "gpu_util":  gpu["util"],
+        "gpu_clock": gpu["clock"]
+    }
+
+
+# ============================================
+# BENCHMARK A GROUP OF MODELS
+# ============================================
+
+def run_benchmark(
+    models,
+    label,
+    is_baseline,
+    all_prompts,
+    num_runs=1,
+    no_cooldown=False
+):
+    """
+    Run the benchmark for a list of models and persist the results.
+
+    For each model: warm it up, run every test in ALL_TESTS that has a
+    non-blank prompt ``num_runs`` times, score each output, print a summary,
+    and store the run, its per-test details and (for multi-run benchmarks)
+    per-test variance rows.
+
+    Args:
+        models:      Iterable of model names to benchmark.
+        label:       Group label; printed and stored in the "type" column.
+        is_baseline: Truthy if this group is the baseline (stored as 1/0).
+        all_prompts: Mapping of test name -> prompt text. Tests whose prompt
+                     is missing or blank are skipped.
+        num_runs:    Number of passes over the test list per model.
+        no_cooldown: Skip the COOLDOWN_SECONDS pause after each test.
+
+    Returns:
+        List of run_ids (one per model), in the order of ``models``.
+    """
+    run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    run_ids  = []
+
+    for model in models:
+        # Accumulate across runs
+        sem_by_test  = {t: [] for t in ALL_TESTS}
+        fmt_by_test  = {t: [] for t in ALL_TESTS}
+        tok_s_all    = []
+        temp_all     = []
+        detail_rows  = []
+
+        print(f"\n[{label}] Model: {model}  ({num_runs} run{'s' if num_runs > 1 else ''})")
+
+        # Warmup
+        subprocess.run(
+            ["ollama", "run", model, "hello"],
+            capture_output=True, text=True
+        )
+        time.sleep(5)
+        warmup_judge()
+
+        for run_num in range(1, num_runs + 1):
+            if num_runs > 1:
+                print(f"\n  ── Run {run_num}/{num_runs} ──")
+
+            for test_name in ALL_TESTS:
+                prompt = all_prompts.get(test_name, "")
+                if not prompt or not prompt.strip():
+                    continue
+
+                result  = run_model(model, prompt)
+                scores  = score_test(test_name, prompt, result["output"])
+
+                sem = scores["semantic_score"]
+                fmt = scores["format_score"]
+
+                sem_by_test[test_name].append(sem)
+                fmt_by_test[test_name].append(fmt)
+
+                # Skip missing readings (tok_s None, GPU placeholder -1) so
+                # they do not drag the averages down.
+                if result["tok_s"]:
+                    tok_s_all.append(result["tok_s"])
+                if result["gpu_temp"] > 0:
+                    temp_all.append(result["gpu_temp"])
+
+                flag = "J" if scores["used_judge"] else "V"
+                print(
+                    f"  [{run_num}] {test_name:<22} [{flag}]  "
+                    f"sem={sem:>2}/10  fmt={fmt:>2}/10  "
+                    f"comb={scores['combined_score']:>5.2f}  "
+                    f"{scores['notes'][:52]}"
+                )
+
+                detail_rows.append({
+                    "run_date":      run_date,
+                    "run_num":       run_num,
+                    "model":         model,
+                    "type":          label,
+                    "is_baseline":   1 if is_baseline else 0,
+                    "test":          test_name,
+                    "weight":        TEST_WEIGHTS.get(test_name, 0),
+                    "time_s":        result["time"],
+                    "tok_s":         result["tok_s"],
+                    "gpu_temp":      result["gpu_temp"],
+                    "gpu_mem":       result["gpu_mem"],
+                    "gpu_util":      result["gpu_util"],
+                    "gpu_clock":     result["gpu_clock"],
+                    "output_length": len(result["output"]),
+                    "semantic_score":sem,
+                    "format_score":  fmt,
+                    "combined_score":scores["combined_score"],
+                    "used_judge":    1 if scores["used_judge"] else 0,
+                    "notes":         scores["notes"][:120],
+                })
+
+                if not no_cooldown:
+                    time.sleep(COOLDOWN_SECONDS)
+
+        # Aggregate
+        avg_sem    = {t: round(statistics.mean(v), 2) for t, v in sem_by_test.items() if v}
+        avg_fmt    = {t: round(statistics.mean(v), 2) for t, v in fmt_by_test.items() if v}
+        w_total, w_avg = compute_weighted(avg_sem)
+        cat_scores = compute_category_scores(avg_sem)
+        compliance = compute_compliance(sem_by_test)
+        var_stats  = compute_variance_stats(sem_by_test)
+        # Guard on the flattened scores, not on the dict: the dict always has
+        # one key per test, so it is truthy even when no test produced a score.
+        fmt_flat   = [s for v in fmt_by_test.values() for s in v]
+        fmt_avg    = round(statistics.mean(fmt_flat), 2) if fmt_flat else 0
+        avg_tok    = round(statistics.mean(tok_s_all), 1) if tok_s_all else 0
+        avg_tmp    = round(statistics.mean(temp_all), 1) if temp_all else 0
+
+        print(f"\n  ─── {model} ───")
+        print(f"      Weighted avg:   {w_avg}  (total={w_total})")
+        print(f"      Format avg:     {fmt_avg}/10")
+        print(f"      Variance:       mean={var_stats['mean']} σ={var_stats['stdev']} failures={var_stats['failure_rate']}%")
+        print(f"      Compliance:     JSON={compliance.get('json_valid')}% YAML={compliance.get('yaml_valid')}% "
+              f"tool={compliance.get('tool_format')}% hall={compliance.get('hallucination_free')}%")
+        print(f"      Categories:     agent={cat_scores.get('agent_tool')} coding={cat_scores.get('coding')} "
+              f"rag={cat_scores.get('rag_context')} struct={cat_scores.get('structured')} "
+              f"hall={cat_scores.get('hallucination')} reason={cat_scores.get('reasoning')}")
+        print(f"      tok/s={avg_tok}  temp={avg_tmp}°C")
+
+        # Save to DB
+        run_row = {
+            "run_date":        run_date,
+            "model":           model,
+            "type":            label,
+            "is_baseline":     1 if is_baseline else 0,
+            "num_runs":        num_runs,
+            "weighted_total":  w_total,
+            "weighted_avg":    w_avg,
+            "avg_format":      fmt_avg,
+            "mean_all":        var_stats["mean"],
+            "stdev_all":       var_stats["stdev"],
+            "min_score":       var_stats["min"],
+            "max_score":       var_stats["max"],
+            "failure_rate_pct":var_stats["failure_rate"],
+            "compliance_json": compliance.get("json_valid"),
+            "compliance_yaml": compliance.get("yaml_valid"),
+            "compliance_tool": compliance.get("tool_format"),
+            "compliance_hall": compliance.get("hallucination_free"),
+            "cat_agent_tool":  cat_scores.get("agent_tool"),
+            "cat_coding":      cat_scores.get("coding"),
+            "cat_rag_context": cat_scores.get("rag_context"),
+            "cat_structured":  cat_scores.get("structured"),
+            "cat_hallucination":cat_scores.get("hallucination"),
+            "cat_reasoning":   cat_scores.get("reasoning"),
+            "avg_tok_s":       avg_tok,
+            "avg_gpu_temp":    avg_tmp,
+            # Tests with at least one score, times the number of passes.
+            "tests_run":       len(avg_sem) * num_runs,
+        }
+
+        run_id = insert_run(run_row)
+        insert_details(run_id, detail_rows)
+
+        # Variance rows (only if multiple runs)
+        if num_runs > 1:
+            var_rows = []
+            for test_name, scores_list in sem_by_test.items():
+                # stdev needs at least two samples.
+                if len(scores_list) > 1:
+                    var_rows.append({
+                        "run_date":        run_date,
+                        "model":           model,
+                        "test":            test_name,
+                        "num_runs":        num_runs,
+                        "mean":            round(statistics.mean(scores_list), 2),
+                        "stdev":           round(statistics.stdev(scores_list), 2),
+                        "min_score":       min(scores_list),
+                        "max_score":       max(scores_list),
+                        # A semantic score of 2 or less counts as a failure.
+                        "failure_rate_pct":round(
+                            sum(1 for s in scores_list if s <= 2) / len(scores_list) * 100, 1
+                        ),
+                        "scores_raw":      str(scores_list),
+                    })
+            if var_rows:
+                insert_variance(var_rows)
+
+        run_ids.append(run_id)
+
+        # NOTE(review): this 30 s pause runs even when no_cooldown is set and
+        # after the last model - confirm that is intended.
+        print(f"\nCooldown after {model}...\n")
+        time.sleep(30)
+
+    return run_ids