RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/storage.py
+++ b/storage.py
@@ -0,0 +1,279 @@
+"""
+benchmark_v4/storage.py
+=======================
+SQLite persistence for benchmark results.
+Three tables:
+  - runs:     one row per model per benchmark run
+  - details:  one row per test per model per run
+  - variance: one row per test per model (multi-run stats)
+
+Query examples:
+  SELECT model, weighted_avg, stdev_all
+  FROM runs
+  WHERE is_baseline = 1
+  ORDER BY weighted_avg DESC;
+
+  SELECT model, test, semantic_score
+  FROM details
+  WHERE run_id = (SELECT MAX(id) FROM runs WHERE model = 'granite4.1:8b');
+"""
+
+import sqlite3
+import json
+from datetime import datetime
+from config import DB_FILE
+
+
+# ============================================
+# SCHEMA
+# ============================================
+
+SCHEMA = """
+CREATE TABLE IF NOT EXISTS runs (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_date        TEXT    NOT NULL,
+    model           TEXT    NOT NULL,
+    type            TEXT    NOT NULL,
+    is_baseline     INTEGER NOT NULL DEFAULT 0,
+    num_runs        INTEGER NOT NULL DEFAULT 1,
+
+    -- Weighted scores
+    weighted_total  REAL,
+    weighted_avg    REAL,
+
+    -- Format
+    avg_format      REAL,
+
+    -- Variance
+    mean_all        REAL,
+    stdev_all       REAL,
+    min_score       REAL,
+    max_score       REAL,
+    failure_rate_pct REAL,
+
+    -- Compliance (%)
+    compliance_json  REAL,
+    compliance_yaml  REAL,
+    compliance_tool  REAL,
+    compliance_hall  REAL,
+
+    -- Category scores
+    cat_agent_tool   REAL,
+    cat_coding       REAL,
+    cat_rag_context  REAL,
+    cat_structured   REAL,
+    cat_hallucination REAL,
+    cat_reasoning    REAL,
+
+    -- Performance
+    avg_tok_s       REAL,
+    avg_gpu_temp    REAL,
+
+    tests_run       INTEGER
+);
+
+CREATE TABLE IF NOT EXISTS details (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id          INTEGER NOT NULL REFERENCES runs(id),
+    run_date        TEXT    NOT NULL,
+    run_num         INTEGER NOT NULL DEFAULT 1,
+    model           TEXT    NOT NULL,
+    type            TEXT    NOT NULL,
+    is_baseline     INTEGER NOT NULL DEFAULT 0,
+
+    test            TEXT    NOT NULL,
+    weight          REAL,
+    time_s          REAL,
+    tok_s           REAL,
+    gpu_temp        INTEGER,
+    gpu_mem         INTEGER,
+    gpu_util        INTEGER,
+    gpu_clock       INTEGER,
+    output_length   INTEGER,
+
+    semantic_score  INTEGER,
+    format_score    INTEGER,
+    combined_score  REAL,
+    used_judge      INTEGER,
+    notes           TEXT
+);
+
+CREATE TABLE IF NOT EXISTS variance (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_date        TEXT    NOT NULL,
+    model           TEXT    NOT NULL,
+    test            TEXT    NOT NULL,
+    num_runs        INTEGER NOT NULL,
+    mean            REAL,
+    stdev           REAL,
+    min_score       INTEGER,
+    max_score       INTEGER,
+    failure_rate_pct REAL,
+    scores_raw      TEXT
+);
+
+CREATE INDEX IF NOT EXISTS idx_runs_model    ON runs(model);
+CREATE INDEX IF NOT EXISTS idx_details_run   ON details(run_id);
+CREATE INDEX IF NOT EXISTS idx_details_model ON details(model);
+CREATE INDEX IF NOT EXISTS idx_details_test  ON details(test);
+"""
+
+
+# ============================================
+# CONNECTION
+# ============================================
+
+def get_connection():
+    conn = sqlite3.connect(DB_FILE)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA foreign_keys=ON")
+    return conn
+
+
+def init_db():
+    """Create tables if they don't exist."""
+    with get_connection() as conn:
+        conn.executescript(SCHEMA)
+
+
+# ============================================
+# WRITE
+# ============================================
+
+def insert_run(run_data):
+    """Insert a run summary row. Returns the run_id."""
+    sql = """
+    INSERT INTO runs (
+        run_date, model, type, is_baseline, num_runs,
+        weighted_total, weighted_avg, avg_format,
+        mean_all, stdev_all, min_score, max_score, failure_rate_pct,
+        compliance_json, compliance_yaml, compliance_tool, compliance_hall,
+        cat_agent_tool, cat_coding, cat_rag_context,
+        cat_structured, cat_hallucination, cat_reasoning,
+        avg_tok_s, avg_gpu_temp, tests_run
+    ) VALUES (
+        :run_date, :model, :type, :is_baseline, :num_runs,
+        :weighted_total, :weighted_avg, :avg_format,
+        :mean_all, :stdev_all, :min_score, :max_score, :failure_rate_pct,
+        :compliance_json, :compliance_yaml, :compliance_tool, :compliance_hall,
+        :cat_agent_tool, :cat_coding, :cat_rag_context,
+        :cat_structured, :cat_hallucination, :cat_reasoning,
+        :avg_tok_s, :avg_gpu_temp, :tests_run
+    )
+    """
+    with get_connection() as conn:
+        cursor = conn.execute(sql, run_data)
+        return cursor.lastrowid
+
+
+def insert_details(run_id, detail_rows):
+    """Insert detail rows for a run."""
+    sql = """
+    INSERT INTO details (
+        run_id, run_date, run_num, model, type, is_baseline,
+        test, weight, time_s, tok_s,
+        gpu_temp, gpu_mem, gpu_util, gpu_clock, output_length,
+        semantic_score, format_score, combined_score, used_judge, notes
+    ) VALUES (
+        :run_id, :run_date, :run_num, :model, :type, :is_baseline,
+        :test, :weight, :time_s, :tok_s,
+        :gpu_temp, :gpu_mem, :gpu_util, :gpu_clock, :output_length,
+        :semantic_score, :format_score, :combined_score, :used_judge, :notes
+    )
+    """
+    rows = [{**r, "run_id": run_id} for r in detail_rows]
+    with get_connection() as conn:
+        conn.executemany(sql, rows)
+
+
+def insert_variance(variance_rows):
+    """Insert variance rows."""
+    sql = """
+    INSERT INTO variance (
+        run_date, model, test, num_runs,
+        mean, stdev, min_score, max_score, failure_rate_pct, scores_raw
+    ) VALUES (
+        :run_date, :model, :test, :num_runs,
+        :mean, :stdev, :min_score, :max_score, :failure_rate_pct, :scores_raw
+    )
+    """
+    with get_connection() as conn:
+        conn.executemany(sql, variance_rows)
+
+
+# ============================================
+# READ
+# ============================================
+def load_best_runs():
+    """Load best scoring run per model."""
+    with get_connection() as conn:
+        rows = conn.execute("""
+            SELECT r.*
+            FROM runs r
+            INNER JOIN (
+                SELECT model, MAX(weighted_avg) AS best_w
+                FROM runs
+                GROUP BY model
+            ) best ON r.model = best.model 
+            AND r.weighted_avg = best.best_w
+            ORDER BY r.weighted_avg DESC
+        """).fetchall()
+    return [dict(r) for r in rows]
+
+def load_latest_runs(is_baseline=None):
+    """Load latest run per model."""
+    sql = """
+    SELECT r.*
+    FROM runs r
+    INNER JOIN (
+        SELECT model, MAX(run_date) AS latest
+        FROM runs
+        GROUP BY model
+    ) latest ON r.model = latest.model AND r.run_date = latest.latest
+    """
+    params = []
+    if is_baseline is not None:
+        sql += " WHERE r.is_baseline = ?"
+        params.append(1 if is_baseline else 0)
+
+    sql += " ORDER BY r.weighted_avg DESC"
+
+    with get_connection() as conn:
+        rows = conn.execute(sql, params).fetchall()
+    return [dict(r) for r in rows]
+
+
+def load_all_runs():
+    """Load all run summaries."""
+    with get_connection() as conn:
+        rows = conn.execute(
+            "SELECT * FROM runs ORDER BY run_date DESC"
+        ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def load_details_for_run(run_id):
+    """Load all test details for a specific run."""
+    with get_connection() as conn:
+        rows = conn.execute(
+            "SELECT * FROM details WHERE run_id = ? ORDER BY test",
+            (run_id,)
+        ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def export_summary_csv(filepath="benchmark_summary.csv"):
+    """Export latest run per model to CSV for Excel analysis."""
+    import csv
+    rows = load_latest_runs()
+    if not rows:
+        print("No runs to export.")
+        return
+
+    with open(filepath, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
+        writer.writeheader()
+        writer.writerows(rows)
+
+    print(f"  Exported {len(rows)} rows to {filepath}")