RC: (add) python script files
This commit is contained in:
170
reporting.py
Normal file
170
reporting.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
benchmark_v4/reporting.py
|
||||
=========================
|
||||
All output formatting — terminal reports and CSV export.
|
||||
Completely separate from scoring and storage logic.
|
||||
"""
|
||||
|
||||
from storage import load_latest_runs, load_all_runs, export_summary_csv
|
||||
from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING
|
||||
|
||||
|
||||
def _tag(model, all_thinking):
    """Return the one-glyph marker printed next to a model name.

    Args:
        model: Model identifier to look up.
        all_thinking: Container of model identifiers; only ``in`` is used.

    Returns:
        "🧠" when ``model`` is in ``all_thinking``, otherwise "⚡".
    """
    # Both literals appeared mis-decoded (UTF-8 bytes read as ISO-8859-7);
    # they are restored to the characters those bytes encode.
    return "🧠" if model in all_thinking else "⚡"
|
||||
|
||||
|
||||
def _base(row):
    """Return the baseline marker for a result row.

    Args:
        row: Mapping with an optional ``is_baseline`` entry.

    Returns:
        "★" when ``row.get("is_baseline")`` is truthy, otherwise a single
        space so that columns keep their width.
    """
    # NOTE(review): the marker glyph was garbled and split across two lines
    # by a decoding error; "★" is the most plausible original - confirm.
    return "★" if row.get("is_baseline") else " "
|
||||
|
||||
|
||||
def print_weights():
    """Print the summed test weight of each category as a percentage.

    ``CATEGORIES`` and ``TEST_WEIGHTS`` are imported from ``config`` at call
    time. For every category the weights of its tests are added up (a test
    without an entry in ``TEST_WEIGHTS`` counts as 0) and printed next to a
    display name; categories without a display name are shown by their key.
    """
    from config import TEST_WEIGHTS, CATEGORIES

    display_names = {
        "agent_tool": "Agent/Tool reliability (25%)",
        "coding": "Coding/Infrastructure (25%)",
        "rag_context": "RAG/Context fidelity (20%)",
        "structured": "Structured outputs (15%)",
        "hallucination": "Hallucination resistance (10%)",
        "reasoning": "Pure reasoning (5%)",
    }

    print("\n TEST WEIGHTS:")
    for category, test_names in CATEGORIES.items():
        total = 0
        for test_name in test_names:
            total += TEST_WEIGHTS.get(test_name, 0)
        heading = display_names.get(category, category)
        print(f" {heading:<42} {total*100:.0f}%")
|
||||
|
||||
|
||||
def print_comparison(new_run_ids, existing_baseline_rows):
    """Compare the current run against the existing baseline.

    Prints the baseline rows (highest weighted average first), then the rows
    of the current run together with their distance to the best baseline
    score.

    Args:
        new_run_ids: Ids of ``runs`` rows that belong to the current run.
            Any iterable is accepted; when it is empty or ``None`` the
            "THIS RUN" section is skipped and ``storage`` is not imported.
        existing_baseline_rows: Mappings with a ``model`` key and optional
            ``weighted_avg``, ``stdev_all``, ``failure_rate_pct`` and
            ``run_date`` entries. May be empty or ``None``.
    """
    def score(row):
        # A missing or None weighted_avg counts as 0 everywhere, so the
        # printed value always agrees with the sort order.
        return float(row.get("weighted_avg") or 0)

    def cell(row, key):
        # None cannot be formatted with a width spec (TypeError), so a
        # present-but-None value gets the same "?" as a missing key.
        value = row.get(key)
        return "?" if value is None else value

    print("\n" + "=" * 68)
    # NOTE(review): the leading emoji was lost to a decoding error; the one
    # used here is a stand-in - confirm against the original file.
    print(" 📊 RESULTS vs BASELINE")
    print("=" * 68)

    best = 0
    if existing_baseline_rows:
        best = max(score(r) for r in existing_baseline_rows)
        print(f"\n EXISTING BASELINE (best w_avg: {best:.2f}):")
        for r in sorted(existing_baseline_rows, key=score, reverse=True):
            print(
                f" {r['model']:<44} "
                f"w={score(r):>5.2f} "
                f"σ={cell(r, 'stdev_all'):>4} "
                f"fail={cell(r, 'failure_rate_pct')}% "
                f"[{str(r.get('run_date') or '')[:10]}]"
            )

    # Materialise once: len() and the SQL parameter binding both need a
    # sequence, and this also tolerates None.
    run_ids = list(new_run_ids or ())
    if not run_ids:
        return

    # Imported lazily so the baseline-only path has no storage dependency.
    from storage import get_connection

    # Only "?" markers are interpolated; the ids themselves are bound as
    # parameters.
    placeholders = ",".join("?" * len(run_ids))
    with get_connection() as conn:
        new_rows = [
            dict(r)
            for r in conn.execute(
                f"SELECT * FROM runs WHERE id IN ({placeholders})",
                run_ids,
            ).fetchall()
        ]

    print("\n THIS RUN:")
    for r in sorted(new_rows, key=score, reverse=True):
        diff = score(r) - best
        # Differences within +/-0.05 are reported as unchanged.
        arrow = "▲" if diff > 0.05 else "▼" if diff < -0.05 else "="
        tag = "BASE" if r.get("is_baseline") else "NEW "
        print(
            f" [{tag}] {r['model']:<40} "
            f"w={score(r):>5.2f} {arrow}{abs(diff):.2f} "
            f"σ={cell(r, 'stdev_all'):>4} "
            f"fail={cell(r, 'failure_rate_pct')}%"
        )
|
||||
|
||||
|
||||
def print_full_ranking(best=False):
    """Print the complete ranking of all models.

    Rows are printed in the order the storage loader returns them, numbered
    from 1, followed by a one-line legend.

    Args:
        best: When true, rows come from ``load_best_runs()``; otherwise from
            ``load_latest_runs()``.
    """
    from storage import load_best_runs, load_latest_runs

    def cell(row, key):
        # None cannot be formatted with a width spec (TypeError), so a
        # present-but-None value gets the same "?" as a missing key.
        value = row.get(key)
        return "?" if value is None else value

    print("\n" + "=" * 68)
    title = "BEST RUN" if best else "LATEST RUN"
    # NOTE(review): the leading emoji was lost to a decoding error; the one
    # used here is a stand-in - confirm against the original file.
    print(f" 📊 FULL RANKING ({title} per model, weighted semantic avg)")
    print("=" * 68)

    all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING
    rows = load_best_runs() if best else load_latest_runs()

    for i, r in enumerate(rows, 1):
        tag = _tag(r["model"], all_thinking)
        # NOTE(review): baseline glyph restored from a garbled literal;
        # "★" is the most plausible original - confirm.
        base = "★" if r.get("is_baseline") else " "
        # A missing or None weighted_avg is shown as 0.00 instead of raising.
        weighted = float(r.get("weighted_avg") or 0)
        print(
            f" {i:>2}. {tag}{base} {r['model']:<42} "
            f"w={weighted:>5.2f} "
            f"σ={cell(r, 'stdev_all'):>4} "
            f"fail={cell(r, 'failure_rate_pct'):>4}% "
            f"tok/s={cell(r, 'avg_tok_s'):>5} "
            f"🌡={cell(r, 'avg_gpu_temp'):>2}°C "
            f"[{str(r.get('run_date') or '')[:10]}]"
        )

    print("\n ★=baseline w=weighted avg σ=stdev(low better) fail=failure rate 🌡=avg temps")
|
||||
|
||||
|
||||
def print_category_breakdown():
    """Print the per-category scores of each row from ``load_latest_runs()``.

    The banner is always printed; the column header and the rows are only
    printed when the loader returns at least one row.
    """
    print("\n" + "=" * 68)
    # NOTE(review): the leading emoji was lost to a decoding error; the one
    # used here is a stand-in - confirm against the original file.
    print(" 📊 CATEGORY BREAKDOWN (latest run per model)")
    print("=" * 68)

    rows = load_latest_runs()
    if not rows:
        return

    header = f" {'Model':<40} {'agent':>6} {'code':>6} {'rag':>6} {'struct':>7} {'hall':>5} {'reason':>7}"
    print(f"\n{header}")
    print(" " + "-" * 64)

    # (row key, column width) in display order; widths match the header.
    columns = (
        ("cat_agent_tool", 6),
        ("cat_coding", 6),
        ("cat_rag_context", 6),
        ("cat_structured", 7),
        ("cat_hallucination", 5),
        ("cat_reasoning", 7),
    )

    for r in rows:
        # NOTE(review): baseline glyph restored from a garbled literal;
        # "★" is the most plausible original - confirm.
        base = "★" if r.get("is_baseline") else " "
        cells = []
        for key, width in columns:
            value = r.get(key)
            # None cannot be formatted with a width spec (TypeError), so a
            # present-but-None score is shown as "?" like a missing key.
            shown = "?" if value is None else value
            cells.append(f"{shown:>{width}}")
        print(f" {base}{r['model']:<41} " + " ".join(cells))
|
||||
|
||||
|
||||
def print_compliance_table():
    """Print the compliance rates of each row from ``load_latest_runs()``.

    The banner is always printed; the column header and the rows are only
    printed when the loader returns at least one row. A rate that is
    ``None`` or absent is shown as "n/a".
    """
    def fmt(v):
        # Defined once instead of on every loop iteration.
        return f"{v}%" if v is not None else " n/a"

    print("\n" + "=" * 68)
    # NOTE(review): the leading glyph was garbled by a decoding error;
    # "✅" is the most plausible original - confirm.
    print(" ✅ COMPLIANCE RATES (latest run per model)")
    print("=" * 68)

    rows = load_latest_runs()
    if not rows:
        return

    header = f" {'Model':<44} {'JSON':>6} {'YAML':>6} {'Tool':>6} {'Hall':>6}"
    print(f"\n{header}")
    print(" " + "-" * 64)

    for r in rows:
        # NOTE(review): baseline glyph restored from a garbled literal;
        # "★" is the most plausible original - confirm.
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {base}{r['model']:<43} "
            f"{fmt(r.get('compliance_json')):>6} "
            f"{fmt(r.get('compliance_yaml')):>6} "
            f"{fmt(r.get('compliance_tool')):>6} "
            f"{fmt(r.get('compliance_hall')):>6}"
        )
|
||||
|
||||
|
||||
def run_report():
    """Produce the full report.

    Prints the ranking, the category breakdown and the compliance table, in
    that order, then calls ``export_summary_csv()`` (imported from
    ``storage``).
    """
    # Terminal sections, in display order.
    print_full_ranking(best=False)
    print_category_breakdown()
    print_compliance_table()

    # The export runs last, after all terminal output.
    export_summary_csv()
|
||||
Reference in New Issue
Block a user