RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/reporting.py
+++ b/reporting.py
@@ -0,0 +1,170 @@
+"""
+benchmark_v4/reporting.py
+=========================
+All output formatting — terminal reports and CSV export.
+Completely separate from scoring and storage logic.
+"""
+
+from storage import load_latest_runs, load_all_runs, export_summary_csv
+from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING
+
+
+def _tag(model, all_thinking):
+    """Return "🧠" when *model* is contained in *all_thinking*, otherwise "⚡"."""
+    if model in all_thinking:
+        return "🧠"
+    return "⚡"
+
+
+def _base(row):
+    """Return "★" when the row's ``is_baseline`` value is truthy, else one space."""
+    marker = " "
+    if row.get("is_baseline"):
+        marker = "★"
+    return marker
+
+
+def print_weights():
+    """Print one line per category: its label and the summed weight of its tests.
+
+    ``CATEGORIES`` and ``TEST_WEIGHTS`` are imported from ``config`` at call
+    time. A test without an entry in ``TEST_WEIGHTS`` contributes 0, and a
+    category without a label below is printed under its raw key.
+    """
+    from config import CATEGORIES, TEST_WEIGHTS
+
+    pretty_names = {
+        "agent_tool":    "Agent/Tool reliability (25%)",
+        "coding":        "Coding/Infrastructure (25%)",
+        "rag_context":   "RAG/Context fidelity (20%)",
+        "structured":    "Structured outputs (15%)",
+        "hallucination": "Hallucination resistance (10%)",
+        "reasoning":     "Pure reasoning (5%)",
+    }
+
+    print("\n  TEST WEIGHTS:")
+    for category, test_names in CATEGORIES.items():
+        total = 0
+        for test_name in test_names:
+            total += TEST_WEIGHTS.get(test_name, 0)
+        shown = pretty_names.get(category, category)
+        print(f"    {shown:<42} {total*100:.0f}%")
+
+
+def print_comparison(new_run_ids, existing_baseline_rows):
+    """Print the current run's rows next to the existing baseline rows.
+
+    Args:
+        new_run_ids: ids of the ``runs`` table rows written by the current
+            run. When empty, the "THIS RUN" section is skipped and the
+            database is not touched.
+        existing_baseline_rows: dict-like rows of earlier baseline runs.
+            When empty, deltas are measured against 0.
+
+    A missing or ``None`` weighted average counts as 0; any other missing or
+    ``None`` field is printed as ``?`` so an incomplete row cannot abort the
+    report.
+    """
+    def score(row):
+        # `or 0` covers both an absent key and a stored None; float(None) raises.
+        return float(row.get("weighted_avg") or 0)
+
+    def cell(row, key, spec=""):
+        # dict.get's default only applies to a missing key, and
+        # format(None, ">4") raises TypeError, so substitute "?" explicitly.
+        value = row.get(key)
+        return format("?" if value is None else value, spec)
+
+    print("\n" + "=" * 68)
+    print("  📊 RESULTS vs BASELINE")
+    print("=" * 68)
+
+    best = 0
+    if existing_baseline_rows:
+        best = max(score(r) for r in existing_baseline_rows)
+        print(f"\n  EXISTING BASELINE (best w_avg: {best:.2f}):")
+        # Negated key keeps the original tie order while sorting high-to-low.
+        for r in sorted(existing_baseline_rows, key=lambda x: -score(x)):
+            print(
+                f"    {r['model']:<44} "
+                f"w={score(r):>5.2f}  "
+                f"σ={cell(r, 'stdev_all', '>4')}  "
+                f"fail={cell(r, 'failure_rate_pct')}%  "
+                f"[{str(r.get('run_date') or '')[:10]}]"
+            )
+
+    if not new_run_ids:
+        return
+
+    # Imported here so the baseline-only path needs no database access.
+    from storage import get_connection
+
+    # Copy to a list so any iterable of ids works with len() and as SQL parameters.
+    ids = list(new_run_ids)
+    # Only "?" placeholders are interpolated; the ids travel as bound parameters.
+    placeholders = ",".join("?" * len(ids))
+    with get_connection() as conn:
+        new_rows = [dict(r) for r in conn.execute(
+            f"SELECT * FROM runs WHERE id IN ({placeholders})",
+            ids
+        ).fetchall()]
+
+    print("\n  THIS RUN:")
+    for r in sorted(new_rows, key=lambda x: -score(x)):
+        diff = score(r) - best
+        # Differences within ±0.05 are reported as unchanged.
+        arrow = "▲" if diff > 0.05 else "▼" if diff < -0.05 else "="
+        tag = "BASE" if r.get("is_baseline") else "NEW "
+        print(
+            f"    [{tag}] {r['model']:<40} "
+            f"w={score(r):>5.2f}  {arrow}{abs(diff):.2f}  "
+            f"σ={cell(r, 'stdev_all', '>4')}  "
+            f"fail={cell(r, 'failure_rate_pct')}%"
+        )
+
+
+def print_full_ranking(best=False):
+    """Print a numbered table with one line per model row.
+
+    Args:
+        best: when true the rows come from ``load_best_runs()``, otherwise
+            from ``load_latest_runs()``; the title changes accordingly.
+
+    Rows are printed in the order the loader returns them. A missing or
+    ``None`` weighted average is shown as 0.00 and any other missing or
+    ``None`` value as ``?``, so one incomplete row cannot abort the report.
+    """
+    from storage import load_best_runs, load_latest_runs
+
+    def cell(row, key, spec):
+        # dict.get's default only applies to a missing key, and
+        # format(None, ">4") raises TypeError, so substitute "?" explicitly.
+        value = row.get(key)
+        return format("?" if value is None else value, spec)
+
+    print("\n" + "=" * 68)
+    title = "BEST RUN" if best else "LATEST RUN"
+    print(f"  🏆 FULL RANKING ({title} per model, weighted semantic avg)")
+    print("=" * 68)
+
+    all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING
+    rows = load_best_runs() if best else load_latest_runs()
+
+    for i, r in enumerate(rows, 1):
+        tag = _tag(r["model"], all_thinking)
+        base = _base(r)
+        print(
+            f"  {i:>2}. {tag}{base} {r['model']:<42} "
+            f"w={float(r.get('weighted_avg') or 0):>5.2f}  "
+            f"σ={cell(r, 'stdev_all', '>4')}  "
+            f"fail={cell(r, 'failure_rate_pct', '>4')}%  "
+            f"tok/s={cell(r, 'avg_tok_s', '>5')}  "
+            f"🌡={cell(r, 'avg_gpu_temp', '>2')}°C  "
+            f"[{str(r.get('run_date') or '')[:10]}]"
+        )
+
+    print("\n  ★=baseline w=weighted avg  σ=stdev(low better)  fail=failure rate  🌡=avg temps")
+
+
+def print_category_breakdown():
+    """Print the per-category score columns for every row of ``load_latest_runs()``.
+
+    Nothing beyond the banner is printed when the loader returns no rows.
+    A missing or ``None`` score is shown as ``?``. The header is built from
+    the same column table as the data rows so headings sit over their columns.
+    """
+    # (row key, heading, format spec) — shared by header and data rows.
+    columns = (
+        ("cat_agent_tool", "agent", ">6"),
+        ("cat_coding", "code", ">6"),
+        ("cat_rag_context", "rag", ">6"),
+        ("cat_structured", "struct", ">7"),
+        ("cat_hallucination", "hall", ">5"),
+        ("cat_reasoning", "reason", ">7"),
+    )
+
+    def cell(row, key, spec):
+        # dict.get's default only applies to a missing key, and
+        # format(None, ">6") raises TypeError, so substitute "?" explicitly.
+        value = row.get(key)
+        return format("?" if value is None else value, spec)
+
+    print("\n" + "=" * 68)
+    print("  📂 CATEGORY BREAKDOWN (latest run per model)")
+    print("=" * 68)
+
+    rows = load_latest_runs()
+    if not rows:
+        return
+
+    # 42 = 1 (baseline marker) + 41 (model field) used by the data rows.
+    headings = "  ".join(format(label, spec) for _, label, spec in columns)
+    header = f"  {'Model':<42} {headings}"
+    print(f"\n{header}")
+    print("  " + "-" * 64)
+
+    for r in rows:
+        base = _base(r)
+        cells = "  ".join(cell(r, key, spec) for key, _, spec in columns)
+        print(f"  {base}{r['model']:<41} {cells}")
+
+
+def print_compliance_table():
+    """Print the compliance percentage columns for every row of ``load_latest_runs()``.
+
+    Nothing beyond the banner is printed when the loader returns no rows.
+    A missing or ``None`` rate is shown as ``n/a``. Header and data rows use
+    the same column gaps so headings sit over their columns.
+    """
+    # (row key, heading) — shared by header and data rows.
+    columns = (
+        ("compliance_json", "JSON"),
+        ("compliance_yaml", "YAML"),
+        ("compliance_tool", "Tool"),
+        ("compliance_hall", "Hall"),
+    )
+
+    def fmt(v):
+        # Defined once rather than on every loop iteration.
+        return f"{v}%" if v is not None else "  n/a"
+
+    print("\n" + "=" * 68)
+    print("  ✅ COMPLIANCE RATES (latest run per model)")
+    print("=" * 68)
+
+    rows = load_latest_runs()
+    if not rows:
+        return
+
+    # 44 = 1 (baseline marker) + 43 (model field) used by the data rows.
+    headings = "  ".join(f"{label:>6}" for _, label in columns)
+    header = f"  {'Model':<44} {headings}"
+    print(f"\n{header}")
+    print("  " + "-" * 64)
+
+    for r in rows:
+        base = _base(r)
+        cells = "  ".join(f"{fmt(r.get(key)):>6}" for key, _ in columns)
+        print(f"  {base}{r['model']:<43} {cells}")
+
+
+def run_report():
+    """Run the report steps in order: ranking, categories, compliance, CSV export."""
+    steps = (
+        print_full_ranking,
+        print_category_breakdown,
+        print_compliance_table,
+        export_summary_csv,
+    )
+    for step in steps:
+        step()