""" benchmark_v4/reporting.py ========================= All output formatting — terminal reports and CSV export. Completely separate from scoring and storage logic. """ from storage import load_latest_runs, load_all_runs, export_summary_csv from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING def _tag(model, all_thinking): return "🧠" if model in all_thinking else "⚡" def _base(row): return "★" if row.get("is_baseline") else " " def print_weights(): from config import TEST_WEIGHTS, CATEGORIES print("\n TEST WEIGHTS:") category_labels = { "agent_tool": "Agent/Tool reliability (25%)", "coding": "Coding/Infrastructure (25%)", "rag_context": "RAG/Context fidelity (20%)", "structured": "Structured outputs (15%)", "hallucination": "Hallucination resistance (10%)", "reasoning": "Pure reasoning (5%)", } for cat, tests in CATEGORIES.items(): w = sum(TEST_WEIGHTS.get(t, 0) for t in tests) label = category_labels.get(cat, cat) print(f" {label:<42} {w*100:.0f}%") def print_comparison(new_run_ids, existing_baseline_rows): """Compare current run against existing baseline.""" from storage import load_all_runs, get_connection print("\n" + "=" * 68) print(" 📊 RESULTS vs BASELINE") print("=" * 68) all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING if existing_baseline_rows: best = max(float(r.get("weighted_avg") or 0) for r in existing_baseline_rows) print(f"\n EXISTING BASELINE (best w_avg: {best:.2f}):") for r in sorted(existing_baseline_rows, key=lambda x: -float(x.get("weighted_avg") or 0)): print( f" {r['model']:<44} " f"w={float(r.get('weighted_avg',0)):>5.2f} " f"σ={r.get('stdev_all','?'):>4} " f"fail={r.get('failure_rate_pct','?')}% " f"[{str(r.get('run_date',''))[:10]}]" ) else: best = 0 # Load current runs if new_run_ids: from storage import get_connection placeholders = ",".join("?" * len(new_run_ids)) with get_connection() as conn: new_rows = [dict(r) for r in conn.execute( f"SELECT * FROM runs WHERE id IN ({placeholders})", new_run_ids ).fetchall()] print(f"\n THIS RUN:") for r in sorted(new_rows, key=lambda x: -float(x.get("weighted_avg") or 0)): diff = float(r.get("weighted_avg") or 0) - best arrow = "▲" if diff > 0.05 else "▼" if diff < -0.05 else "=" tag = "BASE" if r.get("is_baseline") else "NEW " print( f" [{tag}] {r['model']:<40} " f"w={float(r.get('weighted_avg',0)):>5.2f} {arrow}{abs(diff):.2f} " f"σ={r.get('stdev_all','?'):>4} " f"fail={r.get('failure_rate_pct','?')}%" ) def print_full_ranking(best=False): """Print complete ranking of all models.""" from storage import load_best_runs, load_latest_runs print("\n" + "=" * 68) title = "BEST RUN" if best else "LATEST RUN" print(f" 🏆 FULL RANKING ({title} per model, weighted semantic avg)") print("=" * 68) all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING rows = load_best_runs() if best else load_latest_runs() for i, r in enumerate(rows, 1): tag = _tag(r["model"], all_thinking) base = "★" if r.get("is_baseline") else " " print( f" {i:>2}. {tag}{base} {r['model']:<42} " f"w={float(r.get('weighted_avg',0)):>5.2f} " f"σ={r.get('stdev_all','?'):>4} " f"fail={r.get('failure_rate_pct','?'):>4}% " f"tok/s={r.get('avg_tok_s','?'):>5} " f"🌡={r.get('avg_gpu_temp','?'):>2}°C " f"[{str(r.get('run_date',''))[:10]}]" ) print(f"\n ★=baseline w=weighted avg σ=stdev(low better) fail=failure rate 🌡=avg temps") def print_category_breakdown(): """Print category scores for latest run of each model.""" print("\n" + "=" * 68) print(" 📂 CATEGORY BREAKDOWN (latest run per model)") print("=" * 68) rows = load_latest_runs() if not rows: return header = f" {'Model':<40} {'agent':>6} {'code':>6} {'rag':>6} {'struct':>7} {'hall':>5} {'reason':>7}" print(f"\n{header}") print(" " + "-" * 64) for r in rows: base = "★" if r.get("is_baseline") else " " print( f" {base}{r['model']:<41} " f"{r.get('cat_agent_tool','?'):>6} " f"{r.get('cat_coding','?'):>6} " f"{r.get('cat_rag_context','?'):>6} " f"{r.get('cat_structured','?'):>7} " f"{r.get('cat_hallucination','?'):>5} " f"{r.get('cat_reasoning','?'):>7}" ) def print_compliance_table(): """Print compliance rates for latest run of each model.""" print("\n" + "=" * 68) print(" ✅ COMPLIANCE RATES (latest run per model)") print("=" * 68) rows = load_latest_runs() if not rows: return header = f" {'Model':<44} {'JSON':>6} {'YAML':>6} {'Tool':>6} {'Hall':>6}" print(f"\n{header}") print(" " + "-" * 64) for r in rows: base = "★" if r.get("is_baseline") else " " def fmt(v): return f"{v}%" if v is not None else " n/a" print( f" {base}{r['model']:<43} " f"{fmt(r.get('compliance_json')):>6} " f"{fmt(r.get('compliance_yaml')):>6} " f"{fmt(r.get('compliance_tool')):>6} " f"{fmt(r.get('compliance_hall')):>6}" ) def run_report(): """Full report: ranking + categories + compliance.""" print_full_ranking() print_category_breakdown() print_compliance_table() export_summary_csv()