Files
llm-benchmark/storage.py

280 lines
8.0 KiB
Python

"""
benchmark_v4/storage.py
=======================
SQLite persistence for benchmark results.
Three tables:
- runs: one row per model per benchmark run
- details: one row per test per model per run
- variance: one row per test per model (multi-run stats)
Query examples:
SELECT model, weighted_avg, stdev_all
FROM runs
WHERE is_baseline = 1
ORDER BY weighted_avg DESC;
SELECT model, test, semantic_score
FROM details
WHERE run_id = (SELECT MAX(id) FROM runs WHERE model = 'granite4.1:8b');
"""
import sqlite3
import json
from datetime import datetime
from config import DB_FILE
# ============================================
# SCHEMA
# ============================================
SCHEMA = """
CREATE TABLE IF NOT EXISTS runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_date TEXT NOT NULL,
model TEXT NOT NULL,
type TEXT NOT NULL,
is_baseline INTEGER NOT NULL DEFAULT 0,
num_runs INTEGER NOT NULL DEFAULT 1,
-- Weighted scores
weighted_total REAL,
weighted_avg REAL,
-- Format
avg_format REAL,
-- Variance
mean_all REAL,
stdev_all REAL,
min_score REAL,
max_score REAL,
failure_rate_pct REAL,
-- Compliance (%)
compliance_json REAL,
compliance_yaml REAL,
compliance_tool REAL,
compliance_hall REAL,
-- Category scores
cat_agent_tool REAL,
cat_coding REAL,
cat_rag_context REAL,
cat_structured REAL,
cat_hallucination REAL,
cat_reasoning REAL,
-- Performance
avg_tok_s REAL,
avg_gpu_temp REAL,
tests_run INTEGER
);
CREATE TABLE IF NOT EXISTS details (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL REFERENCES runs(id),
run_date TEXT NOT NULL,
run_num INTEGER NOT NULL DEFAULT 1,
model TEXT NOT NULL,
type TEXT NOT NULL,
is_baseline INTEGER NOT NULL DEFAULT 0,
test TEXT NOT NULL,
weight REAL,
time_s REAL,
tok_s REAL,
gpu_temp INTEGER,
gpu_mem INTEGER,
gpu_util INTEGER,
gpu_clock INTEGER,
output_length INTEGER,
semantic_score INTEGER,
format_score INTEGER,
combined_score REAL,
used_judge INTEGER,
notes TEXT
);
CREATE TABLE IF NOT EXISTS variance (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_date TEXT NOT NULL,
model TEXT NOT NULL,
test TEXT NOT NULL,
num_runs INTEGER NOT NULL,
mean REAL,
stdev REAL,
min_score INTEGER,
max_score INTEGER,
failure_rate_pct REAL,
scores_raw TEXT
);
CREATE INDEX IF NOT EXISTS idx_runs_model ON runs(model);
CREATE INDEX IF NOT EXISTS idx_details_run ON details(run_id);
CREATE INDEX IF NOT EXISTS idx_details_model ON details(model);
CREATE INDEX IF NOT EXISTS idx_details_test ON details(test);
"""
# ============================================
# CONNECTION
# ============================================
def get_connection():
conn = sqlite3.connect(DB_FILE)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA foreign_keys=ON")
return conn
def init_db():
"""Create tables if they don't exist."""
with get_connection() as conn:
conn.executescript(SCHEMA)
# ============================================
# WRITE
# ============================================
def insert_run(run_data):
"""Insert a run summary row. Returns the run_id."""
sql = """
INSERT INTO runs (
run_date, model, type, is_baseline, num_runs,
weighted_total, weighted_avg, avg_format,
mean_all, stdev_all, min_score, max_score, failure_rate_pct,
compliance_json, compliance_yaml, compliance_tool, compliance_hall,
cat_agent_tool, cat_coding, cat_rag_context,
cat_structured, cat_hallucination, cat_reasoning,
avg_tok_s, avg_gpu_temp, tests_run
) VALUES (
:run_date, :model, :type, :is_baseline, :num_runs,
:weighted_total, :weighted_avg, :avg_format,
:mean_all, :stdev_all, :min_score, :max_score, :failure_rate_pct,
:compliance_json, :compliance_yaml, :compliance_tool, :compliance_hall,
:cat_agent_tool, :cat_coding, :cat_rag_context,
:cat_structured, :cat_hallucination, :cat_reasoning,
:avg_tok_s, :avg_gpu_temp, :tests_run
)
"""
with get_connection() as conn:
cursor = conn.execute(sql, run_data)
return cursor.lastrowid
def insert_details(run_id, detail_rows):
"""Insert detail rows for a run."""
sql = """
INSERT INTO details (
run_id, run_date, run_num, model, type, is_baseline,
test, weight, time_s, tok_s,
gpu_temp, gpu_mem, gpu_util, gpu_clock, output_length,
semantic_score, format_score, combined_score, used_judge, notes
) VALUES (
:run_id, :run_date, :run_num, :model, :type, :is_baseline,
:test, :weight, :time_s, :tok_s,
:gpu_temp, :gpu_mem, :gpu_util, :gpu_clock, :output_length,
:semantic_score, :format_score, :combined_score, :used_judge, :notes
)
"""
rows = [{**r, "run_id": run_id} for r in detail_rows]
with get_connection() as conn:
conn.executemany(sql, rows)
def insert_variance(variance_rows):
"""Insert variance rows."""
sql = """
INSERT INTO variance (
run_date, model, test, num_runs,
mean, stdev, min_score, max_score, failure_rate_pct, scores_raw
) VALUES (
:run_date, :model, :test, :num_runs,
:mean, :stdev, :min_score, :max_score, :failure_rate_pct, :scores_raw
)
"""
with get_connection() as conn:
conn.executemany(sql, variance_rows)
# ============================================
# READ
# ============================================
def load_best_runs():
"""Load best scoring run per model."""
with get_connection() as conn:
rows = conn.execute("""
SELECT r.*
FROM runs r
INNER JOIN (
SELECT model, MAX(weighted_avg) AS best_w
FROM runs
GROUP BY model
) best ON r.model = best.model
AND r.weighted_avg = best.best_w
ORDER BY r.weighted_avg DESC
""").fetchall()
return [dict(r) for r in rows]
def load_latest_runs(is_baseline=None):
"""Load latest run per model."""
sql = """
SELECT r.*
FROM runs r
INNER JOIN (
SELECT model, MAX(run_date) AS latest
FROM runs
GROUP BY model
) latest ON r.model = latest.model AND r.run_date = latest.latest
"""
params = []
if is_baseline is not None:
sql += " WHERE r.is_baseline = ?"
params.append(1 if is_baseline else 0)
sql += " ORDER BY r.weighted_avg DESC"
with get_connection() as conn:
rows = conn.execute(sql, params).fetchall()
return [dict(r) for r in rows]
def load_all_runs():
"""Load all run summaries."""
with get_connection() as conn:
rows = conn.execute(
"SELECT * FROM runs ORDER BY run_date DESC"
).fetchall()
return [dict(r) for r in rows]
def load_details_for_run(run_id):
"""Load all test details for a specific run."""
with get_connection() as conn:
rows = conn.execute(
"SELECT * FROM details WHERE run_id = ? ORDER BY test",
(run_id,)
).fetchall()
return [dict(r) for r in rows]
def export_summary_csv(filepath="benchmark_summary.csv"):
"""Export latest run per model to CSV for Excel analysis."""
import csv
rows = load_latest_runs()
if not rows:
print("No runs to export.")
return
with open(filepath, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=rows[0].keys())
writer.writeheader()
writer.writerows(rows)
print(f" Exported {len(rows)} rows to {filepath}")