RC: (add) python script files
This commit is contained in:
279
storage.py
Normal file
279
storage.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
benchmark_v4/storage.py
|
||||
=======================
|
||||
SQLite persistence for benchmark results.
|
||||
Three tables:
|
||||
- runs: one row per model per benchmark run
|
||||
- details: one row per test per model per run
|
||||
- variance: one row per test per model (multi-run stats)
|
||||
|
||||
Query examples:
|
||||
SELECT model, weighted_avg, stdev_all
|
||||
FROM runs
|
||||
WHERE is_baseline = 1
|
||||
ORDER BY weighted_avg DESC;
|
||||
|
||||
SELECT model, test, semantic_score
|
||||
FROM details
|
||||
WHERE run_id = (SELECT MAX(id) FROM runs WHERE model = 'granite4.1:8b');
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
from datetime import datetime
|
||||
from config import DB_FILE
|
||||
|
||||
|
||||
# ============================================
|
||||
# SCHEMA
|
||||
# ============================================
|
||||
|
||||
SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS runs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
run_date TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
type TEXT NOT NULL,
|
||||
is_baseline INTEGER NOT NULL DEFAULT 0,
|
||||
num_runs INTEGER NOT NULL DEFAULT 1,
|
||||
|
||||
-- Weighted scores
|
||||
weighted_total REAL,
|
||||
weighted_avg REAL,
|
||||
|
||||
-- Format
|
||||
avg_format REAL,
|
||||
|
||||
-- Variance
|
||||
mean_all REAL,
|
||||
stdev_all REAL,
|
||||
min_score REAL,
|
||||
max_score REAL,
|
||||
failure_rate_pct REAL,
|
||||
|
||||
-- Compliance (%)
|
||||
compliance_json REAL,
|
||||
compliance_yaml REAL,
|
||||
compliance_tool REAL,
|
||||
compliance_hall REAL,
|
||||
|
||||
-- Category scores
|
||||
cat_agent_tool REAL,
|
||||
cat_coding REAL,
|
||||
cat_rag_context REAL,
|
||||
cat_structured REAL,
|
||||
cat_hallucination REAL,
|
||||
cat_reasoning REAL,
|
||||
|
||||
-- Performance
|
||||
avg_tok_s REAL,
|
||||
avg_gpu_temp REAL,
|
||||
|
||||
tests_run INTEGER
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS details (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
run_id INTEGER NOT NULL REFERENCES runs(id),
|
||||
run_date TEXT NOT NULL,
|
||||
run_num INTEGER NOT NULL DEFAULT 1,
|
||||
model TEXT NOT NULL,
|
||||
type TEXT NOT NULL,
|
||||
is_baseline INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
test TEXT NOT NULL,
|
||||
weight REAL,
|
||||
time_s REAL,
|
||||
tok_s REAL,
|
||||
gpu_temp INTEGER,
|
||||
gpu_mem INTEGER,
|
||||
gpu_util INTEGER,
|
||||
gpu_clock INTEGER,
|
||||
output_length INTEGER,
|
||||
|
||||
semantic_score INTEGER,
|
||||
format_score INTEGER,
|
||||
combined_score REAL,
|
||||
used_judge INTEGER,
|
||||
notes TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS variance (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
run_date TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
test TEXT NOT NULL,
|
||||
num_runs INTEGER NOT NULL,
|
||||
mean REAL,
|
||||
stdev REAL,
|
||||
min_score INTEGER,
|
||||
max_score INTEGER,
|
||||
failure_rate_pct REAL,
|
||||
scores_raw TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_runs_model ON runs(model);
|
||||
CREATE INDEX IF NOT EXISTS idx_details_run ON details(run_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_details_model ON details(model);
|
||||
CREATE INDEX IF NOT EXISTS idx_details_test ON details(test);
|
||||
"""
|
||||
|
||||
|
||||
# ============================================
|
||||
# CONNECTION
|
||||
# ============================================
|
||||
|
||||
def get_connection():
|
||||
conn = sqlite3.connect(DB_FILE)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
return conn
|
||||
|
||||
|
||||
def init_db():
|
||||
"""Create tables if they don't exist."""
|
||||
with get_connection() as conn:
|
||||
conn.executescript(SCHEMA)
|
||||
|
||||
|
||||
# ============================================
|
||||
# WRITE
|
||||
# ============================================
|
||||
|
||||
def insert_run(run_data):
|
||||
"""Insert a run summary row. Returns the run_id."""
|
||||
sql = """
|
||||
INSERT INTO runs (
|
||||
run_date, model, type, is_baseline, num_runs,
|
||||
weighted_total, weighted_avg, avg_format,
|
||||
mean_all, stdev_all, min_score, max_score, failure_rate_pct,
|
||||
compliance_json, compliance_yaml, compliance_tool, compliance_hall,
|
||||
cat_agent_tool, cat_coding, cat_rag_context,
|
||||
cat_structured, cat_hallucination, cat_reasoning,
|
||||
avg_tok_s, avg_gpu_temp, tests_run
|
||||
) VALUES (
|
||||
:run_date, :model, :type, :is_baseline, :num_runs,
|
||||
:weighted_total, :weighted_avg, :avg_format,
|
||||
:mean_all, :stdev_all, :min_score, :max_score, :failure_rate_pct,
|
||||
:compliance_json, :compliance_yaml, :compliance_tool, :compliance_hall,
|
||||
:cat_agent_tool, :cat_coding, :cat_rag_context,
|
||||
:cat_structured, :cat_hallucination, :cat_reasoning,
|
||||
:avg_tok_s, :avg_gpu_temp, :tests_run
|
||||
)
|
||||
"""
|
||||
with get_connection() as conn:
|
||||
cursor = conn.execute(sql, run_data)
|
||||
return cursor.lastrowid
|
||||
|
||||
|
||||
def insert_details(run_id, detail_rows):
|
||||
"""Insert detail rows for a run."""
|
||||
sql = """
|
||||
INSERT INTO details (
|
||||
run_id, run_date, run_num, model, type, is_baseline,
|
||||
test, weight, time_s, tok_s,
|
||||
gpu_temp, gpu_mem, gpu_util, gpu_clock, output_length,
|
||||
semantic_score, format_score, combined_score, used_judge, notes
|
||||
) VALUES (
|
||||
:run_id, :run_date, :run_num, :model, :type, :is_baseline,
|
||||
:test, :weight, :time_s, :tok_s,
|
||||
:gpu_temp, :gpu_mem, :gpu_util, :gpu_clock, :output_length,
|
||||
:semantic_score, :format_score, :combined_score, :used_judge, :notes
|
||||
)
|
||||
"""
|
||||
rows = [{**r, "run_id": run_id} for r in detail_rows]
|
||||
with get_connection() as conn:
|
||||
conn.executemany(sql, rows)
|
||||
|
||||
|
||||
def insert_variance(variance_rows):
|
||||
"""Insert variance rows."""
|
||||
sql = """
|
||||
INSERT INTO variance (
|
||||
run_date, model, test, num_runs,
|
||||
mean, stdev, min_score, max_score, failure_rate_pct, scores_raw
|
||||
) VALUES (
|
||||
:run_date, :model, :test, :num_runs,
|
||||
:mean, :stdev, :min_score, :max_score, :failure_rate_pct, :scores_raw
|
||||
)
|
||||
"""
|
||||
with get_connection() as conn:
|
||||
conn.executemany(sql, variance_rows)
|
||||
|
||||
|
||||
# ============================================
|
||||
# READ
|
||||
# ============================================
|
||||
def load_best_runs():
|
||||
"""Load best scoring run per model."""
|
||||
with get_connection() as conn:
|
||||
rows = conn.execute("""
|
||||
SELECT r.*
|
||||
FROM runs r
|
||||
INNER JOIN (
|
||||
SELECT model, MAX(weighted_avg) AS best_w
|
||||
FROM runs
|
||||
GROUP BY model
|
||||
) best ON r.model = best.model
|
||||
AND r.weighted_avg = best.best_w
|
||||
ORDER BY r.weighted_avg DESC
|
||||
""").fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def load_latest_runs(is_baseline=None):
|
||||
"""Load latest run per model."""
|
||||
sql = """
|
||||
SELECT r.*
|
||||
FROM runs r
|
||||
INNER JOIN (
|
||||
SELECT model, MAX(run_date) AS latest
|
||||
FROM runs
|
||||
GROUP BY model
|
||||
) latest ON r.model = latest.model AND r.run_date = latest.latest
|
||||
"""
|
||||
params = []
|
||||
if is_baseline is not None:
|
||||
sql += " WHERE r.is_baseline = ?"
|
||||
params.append(1 if is_baseline else 0)
|
||||
|
||||
sql += " ORDER BY r.weighted_avg DESC"
|
||||
|
||||
with get_connection() as conn:
|
||||
rows = conn.execute(sql, params).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
|
||||
def load_all_runs():
|
||||
"""Load all run summaries."""
|
||||
with get_connection() as conn:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM runs ORDER BY run_date DESC"
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
|
||||
def load_details_for_run(run_id):
|
||||
"""Load all test details for a specific run."""
|
||||
with get_connection() as conn:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM details WHERE run_id = ? ORDER BY test",
|
||||
(run_id,)
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
|
||||
def export_summary_csv(filepath="benchmark_summary.csv"):
|
||||
"""Export latest run per model to CSV for Excel analysis."""
|
||||
import csv
|
||||
rows = load_latest_runs()
|
||||
if not rows:
|
||||
print("No runs to export.")
|
||||
return
|
||||
|
||||
with open(filepath, "w", newline="", encoding="utf-8") as f:
|
||||
writer = csv.DictWriter(f, fieldnames=rows[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
|
||||
print(f" Exported {len(rows)} rows to {filepath}")
|
||||
Reference in New Issue
Block a user