Files
llm-benchmark/reporting.py

171 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
benchmark_v4/reporting.py
=========================
All output formatting — terminal reports and CSV export.
Completely separate from scoring and storage logic.
"""
from storage import load_latest_runs, load_all_runs, export_summary_csv
from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING
def _tag(model, all_thinking):
return "🧠" if model in all_thinking else ""
def _base(row):
return "" if row.get("is_baseline") else " "
def print_weights():
    """Print the summed test weight of every category as a percentage.

    Reads ``CATEGORIES`` (category -> test names) and ``TEST_WEIGHTS``
    (test name -> weight) from ``config``. Tests without an entry in
    ``TEST_WEIGHTS`` count as 0, and categories without a display label
    are printed under their raw key.
    """
    from config import CATEGORIES, TEST_WEIGHTS

    print("\n TEST WEIGHTS:")
    display_names = {
        "agent_tool": "Agent/Tool reliability (25%)",
        "coding": "Coding/Infrastructure (25%)",
        "rag_context": "RAG/Context fidelity (20%)",
        "structured": "Structured outputs (15%)",
        "hallucination": "Hallucination resistance (10%)",
        "reasoning": "Pure reasoning (5%)",
    }
    for category, test_names in CATEGORIES.items():
        # Accumulate left to right from 0, exactly as sum() would.
        total = 0
        for test_name in test_names:
            total += TEST_WEIGHTS.get(test_name, 0)
        heading = display_names.get(category, category)
        print(f" {heading:<42} {total*100:.0f}%")
def print_comparison(new_run_ids, existing_baseline_rows):
    """Print the existing baseline, then this run's rows compared against it.

    Args:
        new_run_ids: ids of the ``runs`` table rows produced by the current
            run. When empty or None the "THIS RUN" section is skipped.
        existing_baseline_rows: baseline rows as mappings supporting
            ``.get`` and ``["model"]``. May be empty, in which case the
            reference score is 0.

    Each new row shows its ``weighted_avg`` and the distance to the best
    baseline ``weighted_avg``: ``▲`` when more than 0.05 above it, ``▼``
    when more than 0.05 below it, ``=`` otherwise.
    """
    def score(row):
        # A missing key and a NULL column both count as 0.
        return float(row.get("weighted_avg") or 0)

    def cell(row, key):
        # format(None, ">4") raises TypeError, so NULL columns show as "?".
        value = row.get(key)
        return "?" if value is None else value

    print("\n" + "=" * 68)
    print(" 📊 RESULTS vs BASELINE")
    print("=" * 68)

    best = 0
    if existing_baseline_rows:
        best = max(score(r) for r in existing_baseline_rows)
        print(f"\n EXISTING BASELINE (best w_avg: {best:.2f}):")
        # reverse=True keeps the sort stable, so ties stay in input order.
        for r in sorted(existing_baseline_rows, key=score, reverse=True):
            print(
                f" {r['model']:<44} "
                f"w={score(r):>5.2f} "
                f"σ={cell(r, 'stdev_all'):>4} "
                f"fail={cell(r, 'failure_rate_pct')}% "
                f"[{str(r.get('run_date') or '')[:10]}]"
            )

    # Materialise the ids once: they are counted for the placeholders and
    # then bound as the query parameters.
    run_ids = list(new_run_ids or ())
    if not run_ids:
        return

    from storage import get_connection
    placeholders = ",".join("?" * len(run_ids))
    with get_connection() as conn:
        cursor = conn.execute(
            f"SELECT * FROM runs WHERE id IN ({placeholders})",
            run_ids,
        )
        new_rows = [dict(r) for r in cursor.fetchall()]

    print(f"\n THIS RUN:")
    for r in sorted(new_rows, key=score, reverse=True):
        diff = score(r) - best
        # The direction must be visible because only abs(diff) is printed.
        if diff > 0.05:
            arrow = "▲"
        elif diff < -0.05:
            arrow = "▼"
        else:
            arrow = "="
        tag = "BASE" if r.get("is_baseline") else "NEW "
        print(
            f" [{tag}] {r['model']:<40} "
            f"w={score(r):>5.2f} {arrow}{abs(diff):.2f} "
            f"σ={cell(r, 'stdev_all'):>4} "
            f"fail={cell(r, 'failure_rate_pct')}%"
        )
def print_full_ranking(best=False):
    """Print one numbered line per model, followed by a legend.

    Args:
        best: when True the rows come from ``load_best_runs()``, otherwise
            from ``load_latest_runs()``. Rows are printed in the order the
            loader returns them.

    Models listed in ``MODELS_BASELINE_THINKING`` or ``MODELS_NEW_THINKING``
    get the 🧠 tag; baseline rows get the ★ marker named in the legend.
    """
    from storage import load_best_runs, load_latest_runs

    def cell(row, key):
        # format(None, ">4") raises TypeError, so NULL columns show as "?".
        value = row.get(key)
        return "?" if value is None else value

    print("\n" + "=" * 68)
    title = "BEST RUN" if best else "LATEST RUN"
    print(f" 🏆 FULL RANKING ({title} per model, weighted semantic avg)")
    print("=" * 68)

    all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING
    rows = load_best_runs() if best else load_latest_runs()
    for i, r in enumerate(rows, 1):
        tag = _tag(r["model"], all_thinking)
        # Same width for both cases so the columns line up, and the star
        # matches the "★=baseline" legend printed below.
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {i:>2}. {tag}{base} {r['model']:<42} "
            f"w={float(r.get('weighted_avg') or 0):>5.2f} "
            f"σ={cell(r, 'stdev_all'):>4} "
            f"fail={cell(r, 'failure_rate_pct'):>4}% "
            f"tok/s={cell(r, 'avg_tok_s'):>5} "
            f"🌡={cell(r, 'avg_gpu_temp'):>2}°C "
            f"[{str(r.get('run_date') or '')[:10]}]"
        )
    print(f"\n ★=baseline w=weighted avg σ=stdev(low better) fail=failure rate 🌡=avg temps")
def print_category_breakdown():
    """Print the per-category scores of each row from ``load_latest_runs()``.

    Only the banner is printed when there are no rows. Baseline rows are
    prefixed with ★ and other rows with a space, so both kinds of row have
    the same width. Missing or NULL category values are shown as ``?``.
    """
    print("\n" + "=" * 68)
    print(" 📂 CATEGORY BREAKDOWN (latest run per model)")
    print("=" * 68)
    rows = load_latest_runs()
    if not rows:
        return

    header = f" {'Model':<40} {'agent':>6} {'code':>6} {'rag':>6} {'struct':>7} {'hall':>5} {'reason':>7}"
    print(f"\n{header}")
    print(" " + "-" * 64)

    # (row key, column width) in display order; widths match the header.
    columns = (
        ("cat_agent_tool", 6),
        ("cat_coding", 6),
        ("cat_rag_context", 6),
        ("cat_structured", 7),
        ("cat_hallucination", 5),
        ("cat_reasoning", 7),
    )

    def cell(row, key, width):
        # format(None, ">6") raises TypeError, so NULL columns show as "?".
        value = row.get(key)
        if value is None:
            value = "?"
        return f"{value:>{width}}"

    for r in rows:
        base = "★" if r.get("is_baseline") else " "
        scores = " ".join(cell(r, key, width) for key, width in columns)
        print(f" {base}{r['model']:<41} {scores}")
def print_compliance_table():
    """Print the compliance percentages of each row from ``load_latest_runs()``.

    Only the banner is printed when there are no rows. Baseline rows are
    prefixed with ★ and other rows with a space, so both kinds of row have
    the same width. Values that are missing or None are shown as ``n/a``.
    """
    print("\n" + "=" * 68)
    print(" ✅ COMPLIANCE RATES (latest run per model)")
    print("=" * 68)
    rows = load_latest_runs()
    if not rows:
        return

    header = f" {'Model':<44} {'JSON':>6} {'YAML':>6} {'Tool':>6} {'Hall':>6}"
    print(f"\n{header}")
    print(" " + "-" * 64)

    # Defined once, outside the loop, instead of once per row.
    def fmt(v):
        return f"{v}%" if v is not None else " n/a"

    for r in rows:
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {base}{r['model']:<43} "
            f"{fmt(r.get('compliance_json')):>6} "
            f"{fmt(r.get('compliance_yaml')):>6} "
            f"{fmt(r.get('compliance_tool')):>6} "
            f"{fmt(r.get('compliance_hall')):>6}"
        )
def run_report():
    """Run the whole report: ranking, categories, compliance, CSV export.

    The steps run in that order and each is called without arguments.
    """
    report_steps = (
        print_full_ranking,
        print_category_breakdown,
        print_compliance_table,
        export_summary_csv,
    )
    for step in report_steps:
        step()