RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/config.py
+++ b/config.py
@@ -0,0 +1,100 @@
+"""
+benchmark_v4/config.py
+======================
+All configuration in one place. Edit this file to change models,
+weights, judge, and runtime settings.
+"""
+
+# ============================================
+# MODELS
+# ============================================
+
+# Ollama model tags, grouped by how main.py runs them:
+#   *_DIRECT   lists are run with label "direct"
+#   *_THINKING lists are run with label "thinking"
+#   BASELINE lists are run with is_baseline=True, NEW lists with is_baseline=False
+MODELS_BASELINE_DIRECT = [
+    "granite4.1:8b",
+    "qwen2.5-coder:14b",
+
+]
+
+MODELS_BASELINE_THINKING = [
+    "nemotron-3-nano:4b",
+    "gemma4:e4b",
+]
+
+# Empty by default: with no NEW models configured, "--mode all" only runs the
+# baseline lists and "--mode new" runs nothing.
+MODELS_NEW_DIRECT   = []
+MODELS_NEW_THINKING = []
+
+# ============================================
+# JUDGE + EMBEDDINGS
+# ============================================
+
+# JUDGE_MODEL and EMBED_MODEL are also excluded from "--test-all" discovery in
+# main.py, so the judge and the embedder are never benchmarked themselves.
+JUDGE_MODEL = "qwen2.5:14b"  # model judge.py sends rubric prompts to (/api/generate)
+EMBED_MODEL = "nomic-embed-text"  # model judge.py requests vectors from (/api/embed)
+OLLAMA_URL  = "http://localhost:11434"  # base URL prefixed to every Ollama API path
+
+# ============================================
+# RUNTIME
+# ============================================
+
+COOLDOWN_SECONDS = 20   # between tests (thermal normalization)
+GPU_POLL_EVERY   = 3    # poll nvidia-smi every N tests (0 = every test)
+
+# ============================================
+# TEST WEIGHTS (must sum to 1.0)
+# ============================================
+
+TEST_WEIGHTS = {
+    # Agent / tool reliability — 25%
+    "tool_calling":     0.13,
+    "multi_step_agent": 0.12,
+
+    # Coding / infrastructure — 25%
+    "coding":           0.10,
+    "yaml_generation":  0.08,
+    "artifact_mermaid": 0.04,
+    "json_schema":      0.03,
+
+    # RAG / context fidelity — 20%
+    "rag":              0.07,
+    "context_begin":    0.04,
+    "context_middle":   0.05,
+    "context_end":      0.04,
+
+    # Structured outputs — 15%
+    "structured":       0.08,
+    "compression":      0.07,
+
+    # Hallucination resistance — 10%
+    "hallucination":    0.10,
+
+    # Pure reasoning — 5%
+    "reasoning":        0.03,
+    "agent":            0.01,
+    "math":             0.01,
+}
+
+# Validate with an explicit raise instead of `assert`: assert statements are
+# removed under `python -O`, which would silently disable this sanity check.
+# The 0.001 tolerance absorbs floating-point rounding in the sum.
+_weights_total = sum(TEST_WEIGHTS.values())
+if abs(_weights_total - 1.0) >= 0.001:
+    raise ValueError(f"Weights must sum to 1.0 (got {_weights_total:.4f})")
+
+# Category groupings for category-level scores.
+# Every key of TEST_WEIGHTS appears in exactly one category below, so a
+# category's weight is the sum of its members' weights. A test added to
+# TEST_WEIGHTS must also be added to one of these lists.
+CATEGORIES = {
+    "agent_tool":    ["tool_calling", "multi_step_agent"],
+    "coding":        ["coding", "yaml_generation", "artifact_mermaid", "json_schema"],
+    "rag_context":   ["rag", "context_begin", "context_middle", "context_end"],
+    "structured":    ["structured", "compression"],
+    "hallucination": ["hallucination"],
+    "reasoning":     ["reasoning", "agent", "math"],
+}
+
+# Compliance groups — pass if semantic_score >= 8
+# Unlike CATEGORIES these cover only a subset of the tests; a test that is not
+# listed here belongs to no compliance group.
+COMPLIANCE_GROUPS = {
+    "json_valid":         ["structured", "json_schema"],
+    "yaml_valid":         ["yaml_generation"],
+    "tool_format":        ["tool_calling", "multi_step_agent"],
+    "hallucination_free": ["hallucination"],
+}
+
+# Context files
+CONTEXT_FILE = "./rag_samples/context_test.md"
+RAG_FILE     = "./rag_samples/note1.md"
+
+# Database
+DB_FILE = "benchmark_v4.db"
--- a/judge.py
+++ b/judge.py
@@ -0,0 +1,178 @@
+"""
+benchmark_v4/judge.py
+=====================
+Layer 2: Semantic judge (rubric-based).
+Layer 3: Embedding similarity via Ollama nomic-embed-text.
+
+Judge is only called when validator gives a partial score.
+Embedding similarity used for RAG test.
+"""
+
+import re
+import json
+import math
+import requests
+from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
+from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
+
+
+# ============================================
+# JUDGE PROMPT TEMPLATE
+# ============================================
+
+JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.
+
+ABSOLUTE RULES:
+1. Judge ONLY what the prompt asked for. Nothing else.
+2. NEVER penalise for missing information not requested.
+3. NEVER penalise for being concise — brevity is correct.
+4. NEVER invent requirements. Only the prompt counts.
+5. Minimal correct answers score 8-10.
+6. Extra unnecessary content scores lower, not higher.
+
+SCORING:
+10 = perfect  |  8 = correct, trivial issue  |  6 = mostly correct
+4  = partial  |  2 = major error  |  0 = wrong or hallucination
+
+TEST: {test_name}
+WHAT TO JUDGE: {rubric}
+GROUND TRUTH: {ground_truth}
+
+PROMPT (what was asked):
+{prompt}
+
+OUTPUT (what model answered):
+{output}
+
+Return ONLY this JSON on one line, nothing else:
+{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
+
+
+# ============================================
+# JUDGE WARMUP
+# ============================================
+
+def warmup_judge():
+    """
+    Ask Ollama to load the judge model before the first real judge call.
+
+    Sends a minimal generation request (prompt "hi", num_predict=1) for
+    JUDGE_MODEL and discards the reply. Best-effort: any failure, including
+    an HTTP error status, is printed and swallowed so that a failed warmup
+    never aborts the benchmark. Returns None.
+    """
+    print(f"  Warming up judge: {JUDGE_MODEL}")
+    try:
+        response = requests.post(
+            f"{OLLAMA_URL}/api/generate",
+            json={
+                "model": JUDGE_MODEL,
+                "prompt": "hi",
+                "stream": False,
+                "options": {"num_predict": 1}
+            },
+            timeout=120
+        )
+        # Without this check an HTTP error reply would be indistinguishable
+        # from a successful warmup and the problem would only surface later
+        # as midpoint scores from call_judge.
+        response.raise_for_status()
+    except Exception as e:
+        print(f"  Judge warmup error: {e}")
+
+
+# ============================================
+# JUDGE CALL
+# ============================================
+
+def _clip_middle(text, limit):
+    """Shorten text to roughly `limit` chars by dropping the middle, keeping head and tail."""
+    if len(text) <= limit:
+        return text
+    head = limit // 2
+    tail = limit - head
+    return text[:head] + "\n[...]\n" + text[-tail:]
+
+
+def _clamp_score(value):
+    """Coerce a judge score (int, float or numeric string) to an int in 0..10."""
+    return max(0, min(10, int(float(value))))
+
+
+def call_judge(test_name, prompt, output):
+    """
+    Call LLM judge with strict rubric.
+
+    Args:
+        test_name: key used to look up the rubric and the ground truth;
+            unknown names fall back to DEFAULT_RUBRIC and a generic
+            ground-truth placeholder.
+        prompt: the prompt the benchmarked model received (str).
+        output: the benchmarked model's answer (str); only the first 1500
+            characters are shown to the judge.
+
+    Returns (semantic_score 0-10, reason str).
+    Falls back to midpoint (5) on failure to avoid corrupting results.
+    """
+    rubric       = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
+    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")
+
+    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
+        test_name=test_name,
+        rubric=rubric,
+        ground_truth=ground_truth,
+        # The context_* prompts place the QUESTION after the document, so a
+        # plain head cut at 500 chars would hide the question from the judge.
+        # Keeping head and tail preserves both instruction and question.
+        prompt=_clip_middle(prompt, 500),
+        output=output[:1500],
+    )
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_URL}/api/generate",
+            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
+            timeout=180
+        )
+        # Report HTTP failures as a judge error instead of letting the error
+        # body fall through as an "unparseable" empty answer.
+        response.raise_for_status()
+        raw = response.json().get("response", "").strip()
+    except requests.exceptions.Timeout:
+        return 5, "judge timeout — midpoint"
+    except Exception as e:
+        print(f"  Judge error: {e}")
+        return 5, "judge error — midpoint"
+
+    # Try clean JSON parse
+    m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
+    if m:
+        try:
+            parsed = json.loads(m.group())
+            score  = _clamp_score(parsed.get("semantic_score", 5))
+            reason = str(parsed.get("reason", ""))[:80]
+            return score, reason
+        except (json.JSONDecodeError, ValueError, TypeError, OverflowError):
+            # TypeError covers a null / non-scalar score; in every case the
+            # regex fallbacks below still get their chance.
+            pass
+
+    # Fallback: extract score number
+    sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
+    rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
+    if sm:
+        score  = max(0, min(10, int(sm.group(1))))
+        reason = rm.group(1) if rm else "extracted"
+        return score, reason
+
+    # Last resort
+    last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
+    if last:
+        return max(0, min(10, int(last.group(1)))), "score extracted"
+
+    print(f"  Judge unparseable: {raw[:80]}")
+    return 5, "judge unparseable — midpoint"
+
+
+# ============================================
+# EMBEDDING SIMILARITY
+# ============================================
+
+def get_embedding(text):
+    """
+    Fetch one embedding vector for `text` from Ollama's /api/embed endpoint.
+
+    Only the first 2000 characters of `text` are sent, using EMBED_MODEL.
+    Returns the first entry of the reply's "embeddings" list, or [] when the
+    request, the JSON decoding or the lookup raises.
+    """
+    try:
+        payload = {"model": EMBED_MODEL, "input": text[:2000]}
+        reply = requests.post(f"{OLLAMA_URL}/api/embed", json=payload, timeout=30)
+        vectors = reply.json().get("embeddings", [[]])
+        return vectors[0]
+    except Exception:
+        return []
+
+
+def cosine_similarity(v1, v2):
+    """
+    Return the cosine similarity of two equal-length numeric sequences.
+
+    Returns 0.0 when either sequence is empty, when the lengths differ, or
+    when either vector has zero magnitude.
+    """
+    if not (v1 and v2) or len(v1) != len(v2):
+        return 0.0
+    norm_1 = math.sqrt(sum(x**2 for x in v1))
+    norm_2 = math.sqrt(sum(y**2 for y in v2))
+    scale = norm_1 * norm_2
+    if not scale:
+        return 0.0
+    return sum(x * y for x, y in zip(v1, v2)) / scale
+
+
+def embedding_score(text, reference):
+    """
+    Map the embedding similarity of `text` and `reference` to a 0-10 score.
+
+    `text` is cut to its first 1000 characters before embedding. The cosine
+    similarity is converted through fixed steps (>=0.92 -> 10, >=0.85 -> 8,
+    >=0.78 -> 6, >=0.70 -> 4, >=0.60 -> 2, otherwise 0). Returns 0 without
+    embedding anything when either argument is empty.
+    """
+    if not (text and reference):
+        return 0
+
+    similarity = cosine_similarity(
+        get_embedding(text[:1000]),
+        get_embedding(reference),
+    )
+
+    # Stepped mapping — more discriminating than linear
+    ladder = ((0.92, 10), (0.85, 8), (0.78, 6), (0.70, 4), (0.60, 2))
+    for floor, points in ladder:
+        if similarity >= floor:
+            return points
+    return 0
--- a/main.py
+++ b/main.py
@@ -0,0 +1,299 @@
+"""
+benchmark_v4/main.py
+====================
+Entry point. CLI argument parsing and orchestration.
+
+Usage:
+  python3 main.py                       # run all configured models (--mode all is the default)
+  python3 main.py --test-all            # auto-discover and test all ollama models
+  python3 main.py --mode baseline       # baseline only
+  python3 main.py --mode new            # new models only
+  python3 main.py --model granite4.1:8b # single model
+  python3 main.py --runs 3              # variance analysis
+  python3 main.py --no-cooldown         # fast run (no thermal wait)
+  python3 main.py --report              # show reports of latest run
+  python3 main.py --report --report-best # show best run per model
+  python3 main.py --export              # export CSV from DB
+"""
+
+import argparse
+import sys
+import subprocess
+import requests
+
+from config import (
+    MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
+    MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
+    JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
+)
+from storage import init_db, load_latest_runs, export_summary_csv
+from prompts import build_all_prompts
+from runner import run_benchmark
+from reporting import (
+    print_weights, print_comparison,
+    print_full_ranking, print_category_breakdown,
+    print_compliance_table, run_report
+)
+
+try:
+    import yaml
+    YAML_AVAILABLE = True
+except ImportError:
+    YAML_AVAILABLE = False
+
+try:
+    from rapidfuzz import fuzz
+    FUZZY_AVAILABLE = True
+except ImportError:
+    FUZZY_AVAILABLE = False
+
+
+# ============================================
+# THINKING MODEL DETECTION
+# ============================================
+
+def detect_thinking_model(model_name):
+    """
+    Report whether Ollama lists "thinking" among a model's capabilities.
+
+    Issues one POST to /api/show for `model_name` (10 s timeout, no
+    generation) and inspects the "capabilities" entry of the JSON reply.
+    Returns False when the entry is missing or when anything raises.
+    """
+    show_url = f"{OLLAMA_URL}/api/show"
+    try:
+        reply = requests.post(show_url, json={"name": model_name}, timeout=10)
+        capabilities = reply.json().get("capabilities", [])
+        return "thinking" in capabilities
+    except Exception:
+        return False
+
+
+# ============================================
+# MAIN
+# ============================================
+
+def _print_banner(title, leading_newline=True):
+    """Print a 50-column '=' banner around `title`, optionally preceded by a blank line."""
+    bar = "=" * 50
+    print("\n" + bar if leading_newline else bar)
+    print(f"  {title}")
+    print(bar)
+
+
+def _list_ollama_models(skip):
+    """
+    Return the model tags printed by `ollama list`, minus excluded ones.
+
+    A tag is excluded when any string in `skip` occurs in it. Returns [] when
+    the `ollama` binary cannot be started or exits with a non-zero status;
+    the reason is printed in both cases.
+    """
+    try:
+        result = subprocess.run(
+            ["ollama", "list"],
+            capture_output=True, text=True
+        )
+    except OSError as e:
+        # e.g. the ollama CLI is not installed or not on PATH
+        print(f"Could not run 'ollama list': {e}")
+        return []
+
+    if result.returncode != 0:
+        print(f"'ollama list' failed: {result.stderr.strip()}")
+        return []
+
+    discovered = []
+    # First output line is the column header.
+    for line in result.stdout.strip().split('\n')[1:]:
+        parts = line.split()
+        if parts and not any(s in parts[0] for s in skip):
+            discovered.append(parts[0])
+    return discovered
+
+
+def main():
+    """
+    Parse the command line and run the requested benchmark or report.
+
+    Order of precedence: --report / --report-best and --export only read the
+    database and return; --test-all benchmarks every model found by
+    `ollama list`; --model benchmarks a single tag; otherwise the model
+    lists from config.py are run according to --mode. Every benchmark path
+    ends with the comparison against the previously stored baseline and
+    the run report.
+    """
+    parser = argparse.ArgumentParser(
+        description="LLM Benchmark V4 — Modular, SQLite-backed",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python3 main.py                        # run all configured models
+  python3 main.py --test-all             # auto-discover all ollama models
+  python3 main.py --model granite4.1:8b  # single model
+  python3 main.py --mode new             # new models only
+  python3 main.py --runs 3               # variance analysis (3 runs)
+  python3 main.py --no-cooldown          # fast run, no thermal wait
+  python3 main.py --report               # show latest run reports
+  python3 main.py --report --report-best # show best run per model
+  python3 main.py --export               # export CSV from DB
+        """
+    )
+
+    parser.add_argument(
+        "--test-all", action="store_true", default=False,
+        help="Auto-discover and benchmark all models in ollama list"
+    )
+    parser.add_argument(
+        "--mode", choices=["baseline", "new", "all"],
+        default="all",
+        help="Which model group to run (default: all)"
+    )
+    parser.add_argument(
+        "--model", type=str, default=None,
+        help="Run a single model by Ollama tag"
+    )
+    parser.add_argument(
+        "--thinking", action="store_true", default=False,
+        help="Override: mark single --model as thinking type"
+    )
+    parser.add_argument(
+        "--runs", type=int, default=1,
+        help="Number of runs per model for variance analysis (default: 1)"
+    )
+    parser.add_argument(
+        "--no-cooldown", action="store_true", default=False,
+        help="Skip cooldown between tests (faster but no thermal normalization)"
+    )
+    parser.add_argument(
+        "--report", action="store_true", default=False,
+        help="Show ranking reports from DB without running any models"
+    )
+    parser.add_argument(
+        "--report-best", action="store_true", default=False,
+        help="Show best run per model instead of latest (use with --report)"
+    )
+    parser.add_argument(
+        "--export", action="store_true", default=False,
+        help="Export latest results to benchmark_summary.csv and exit"
+    )
+
+    args = parser.parse_args()
+
+    # Reject a non-positive run count up front instead of handing it on.
+    if args.runs < 1:
+        parser.error("--runs must be at least 1")
+
+    # Init database
+    init_db()
+
+    # ── Report / export only modes ─────────────────────────────────
+    # Must come before benchmark logic
+    if args.report or args.report_best:
+        print_full_ranking(best=args.report_best)
+        print_category_breakdown()
+        print_compliance_table()
+        export_summary_csv()
+        return
+
+    if args.export:
+        export_summary_csv()
+        return
+
+    # ── Setup ──────────────────────────────────────────────────────
+    # Snapshot the stored baseline BEFORE running anything, so the final
+    # comparison is made against the state that existed at start-up.
+    existing_baseline = load_latest_runs(is_baseline=True)
+    all_prompts       = build_all_prompts()
+
+    print("\nLLM Benchmark V4")
+    print(f"Judge:   {JUDGE_MODEL}")
+    print(f"Embed:   {EMBED_MODEL}")
+    print(f"DB:      {DB_FILE}")
+    print(f"Runs:    {args.runs}")
+    print(f"Fuzzy:   {FUZZY_AVAILABLE}  |  YAML: {YAML_AVAILABLE}")
+    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
+    print(f"Previous baseline runs: {len(existing_baseline)}")
+
+    print_weights()
+
+    all_new_run_ids = []
+    baseline_tags   = set(MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING)
+
+    def _run(models, label, baseline):
+        """Benchmark `models` and remember the run ids for the final comparison."""
+        ids = run_benchmark(
+            models=models,
+            label=label,
+            is_baseline=baseline,
+            all_prompts=all_prompts,
+            num_runs=args.runs,
+            no_cooldown=args.no_cooldown,
+        )
+        all_new_run_ids.extend(ids)
+
+    def _run_new_groups():
+        """Run the configured NEW model lists (shared by --mode all and --mode new)."""
+        if MODELS_NEW_DIRECT:
+            _print_banner("NEW — DIRECT")
+            _run(MODELS_NEW_DIRECT, "direct", False)
+
+        if MODELS_NEW_THINKING:
+            _print_banner("NEW — THINKING")
+            _run(MODELS_NEW_THINKING, "thinking", False)
+
+    # ── Auto-discover all Ollama models ────────────────────────────
+    if args.test_all:
+        # The judge and the embedder are infrastructure, not candidates.
+        discovered = _list_ollama_models(
+            skip=(EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text")
+        )
+
+        if not discovered:
+            print("No models found in ollama list.")
+            return
+
+        # Auto-detect thinking capability for each model
+        print("\nDetecting model capabilities...")
+        model_info = {}
+        for m in discovered:
+            is_thinking = detect_thinking_model(m)
+            is_baseline = m in baseline_tags
+            model_info[m] = {
+                "thinking":    is_thinking,
+                "is_baseline": is_baseline,
+                "label":       "thinking" if is_thinking else "direct",
+            }
+            tag  = "🧠" if is_thinking else "⚡"
+            base = "★" if is_baseline else " "
+            print(f"  {tag}{base} {m}")
+
+        print()
+
+        # Run baseline models first, then new
+        baseline_models = [m for m in discovered if model_info[m]["is_baseline"]]
+        new_models      = [m for m in discovered if not model_info[m]["is_baseline"]]
+
+        if baseline_models:
+            _print_banner("KNOWN BASELINE MODELS", leading_newline=False)
+            for m in baseline_models:
+                _run([m], model_info[m]["label"], True)
+
+        if new_models:
+            _print_banner("NEW / UNKNOWN MODELS", leading_newline=False)
+            for m in new_models:
+                _run([m], model_info[m]["label"], False)
+
+        print_comparison(all_new_run_ids, existing_baseline)
+        run_report()
+        return
+
+    # ── Single model mode ──────────────────────────────────────────
+    if args.model:
+        # Auto-detect thinking unless --thinking flag explicitly set
+        if args.thinking:
+            label = "thinking"
+        else:
+            label = "thinking" if detect_thinking_model(args.model) else "direct"
+
+        is_baseline = args.model in baseline_tags
+        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
+        _run([args.model], label, is_baseline)
+
+    # ── Baseline models ────────────────────────────────────────────
+    elif args.mode in ("baseline", "all"):
+        if MODELS_BASELINE_DIRECT:
+            _print_banner("BASELINE — DIRECT")
+            _run(MODELS_BASELINE_DIRECT, "direct", True)
+
+        if MODELS_BASELINE_THINKING:
+            _print_banner("BASELINE — THINKING")
+            _run(MODELS_BASELINE_THINKING, "thinking", True)
+
+        if args.mode == "all":
+            _run_new_groups()
+
+    # ── New models only ────────────────────────────────────────────
+    elif args.mode == "new":
+        _run_new_groups()
+
+    # ── Final reports ──────────────────────────────────────────────
+    print_comparison(all_new_run_ids, existing_baseline)
+    run_report()
+
+
+if __name__ == "__main__":
+    main()
--- a/prompts.py
+++ b/prompts.py
@@ -0,0 +1,388 @@
+"""
+benchmark_v4/prompts.py
+=======================
+All prompts and ground truths in one place.
+
+BENCHMARK PURPOSE (explicit):
+  This benchmark measures operational reliability for agentic and
+  automated pipelines. It rewards: format obedience, structured output
+  correctness, tool call precision, and hallucination resistance.
+  It intentionally penalises verbosity, creative deviation, and
+  formatting non-compliance. It is NOT a general intelligence benchmark.
+
+Changes in this version:
+  - Agent prompt rewritten with explicit constraints (better differentiation)
+  - Structured JSON upgraded to nested array (stronger discriminator)
+  - ALL_TESTS is now explicit ordered list (not dict.keys())
+  - Ground truths reformulated as criteria (avoid judge anchoring)
+  - JSON Schema validator criteria made explicit
+"""
+
+import os
+from config import CONTEXT_FILE, RAG_FILE
+
+# ============================================
+# STATIC PROMPTS
+# ============================================
+
+PROMPTS = {
+    "reasoning": """Be concise. Maximum 150 words.
+
+Solve step by step:
+A company has 120 employees.
+30% are remote. Of those, 25% are contractors.
+How many full-time remote employees?""",
+
+    "coding": """Be concise. Maximum 150 words.
+
+Write a Python function that returns the Longest Increasing Subsequence.
+Explain time complexity.""",
+
+    # UPGRADED: explicit constraints force meaningful differentiation
+    "agent": """Maximum 120 words. You must:
+1. Identify 2-3 specific GPU candidates by name
+2. Compare their VRAM capacity and memory bandwidth
+3. Acknowledge 2026 pricing uncertainty explicitly
+4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
+5. Exclude any GPU with less than 16GB VRAM
+
+Task: Find the best GPU under 500€ for running local LLMs in 2026.""",
+
+    "math": """Be concise. Maximum 100 words.
+
+Solve step by step:
+If 3 machines take 5 minutes to make 3 widgets,
+how long would 100 machines take to make 100 widgets?""",
+
+    "hallucination": """Be concise. Maximum 100 words.
+
+Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
+If you are not 100% certain, say so clearly.
+Do NOT describe or summarise it if you have any doubt.""",
+
+    # UPGRADED: nested structure, typed fields — much stronger discriminator
+    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.
+
+{
+  "recommendations": [
+    {
+      "gpu": "",
+      "price_eur": 0,
+      "vram_gb": 0,
+      "pros": [],
+      "cons": []
+    }
+  ]
+}
+
+Question: Best GPU under 500€ for local LLMs.
+Return exactly 2 recommendations.""",
+
+    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)
+
+Return ONLY the single tool call needed:
+"What is the best local LLM for 16GB VRAM?"
+
+Example format: web_search("your query here")
+No explanation. No other text.""",
+
+    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
+Preserve key statistics. No extra text before or after the bullets.
+
+AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
+finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
+education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",
+
+    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.
+
+Create a Kubernetes Deployment:
+name is my-app
+image is nginx:1.25
+replicas is 2
+containerPort is 80
+memory limit is 256Mi
+cpu limit is 250m
+readinessProbe uses httpGet on path /healthz port 80""",
+
+    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
+No explanation before or after.
+
+Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",
+
+    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)
+
+Show exactly 3 chained tool calls then a final answer for:
+"Top 3 most downloaded Python packages this month"
+
+Format:
+1. web_search("...")
+2. scrape_page("...")
+3. summarize("...")
+Final: [answer]""",
+
+    "json_schema": """Return ONLY valid JSON Schema. No explanation.
+
+Schema for:
+- apiVersion: string, required
+- kind: string, required, enum: [Deployment, Service, ConfigMap]
+- metadata: object, required, properties: name (string, required), namespace (string, required)
+- spec: object, required, additionalProperties: true""",
+}
+
+# ============================================
+# EXPLICIT TEST ORDERING
+# Never use dict.keys() — order must be stable
+# for CSV consistency and longitudinal comparisons.
+# ============================================
+
+ALL_TESTS = [
+    # Reasoning (5%)
+    "reasoning",
+    "math",
+    "agent",
+    # Coding / Infrastructure (25%)
+    "coding",
+    "yaml_generation",
+    "artifact_mermaid",
+    "json_schema",
+    # Structured outputs (15%)
+    "structured",
+    "compression",
+    # Agent / Tool (25%)
+    "tool_calling",
+    "multi_step_agent",
+    # Hallucination (10%)
+    "hallucination",
+    # RAG / Context (20%)
+    "rag",
+    "context_begin",
+    "context_middle",
+    "context_end",
+]
+
+# ============================================
+# GROUND TRUTHS — criteria-based, not canonical
+# Avoid embedding exact phrasing to prevent
+# judge anchoring and over-literal scoring.
+# ============================================
+
+GROUND_TRUTHS = {
+    "reasoning": (
+        "Correct answer is 27 full-time remote employees. "
+        "Verify: calculation uses 30% of 120 = 36 remote, "
+        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
+        "Award full marks if logic is correct even if phrased differently."
+    ),
+    "coding": (
+        "A working Python function that implements LIS. "
+        "Should mention O(n²) for basic DP approach. "
+        "Bonus if O(n log n) with binary search is mentioned. "
+        "Do not penalise for code style choices."
+    ),
+    "agent": (
+        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
+        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "
+        "Must acknowledge pricing uncertainty for 2026. "
+        "Must name a benchmarking source. "
+        "Score 0 if recommends GPUs under 16GB VRAM."
+    ),
+    "math": (
+        "Correct answer is 5 minutes. "
+        "Key insight: each machine independently makes 1 widget in 5 minutes, "
+        "so 100 machines make 100 widgets in the same 5 minutes. "
+        "Award marks if the rate-independence reasoning is clearly stated."
+    ),
+    "hallucination": (
+        "The book does not exist. "
+        "Full marks: model refuses or clearly states it cannot verify existence. "
+        "Zero marks: model describes the book's plot, themes, or content as if real. "
+        "Partial marks: model hedges without clear refusal."
+    ),
+    "structured": (
+        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
+        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
+        "pros (array of strings), cons (array of strings). "
+        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
+        "Do not score on quality of GPU choices."
+    ),
+    "tool_calling": (
+        "Must return exactly one function call in the format: name(\"query\"). "
+        "No explanation before or after. "
+        "Correct function names: web_search, scrape_page, or calculate. "
+        "Score 0 if any text accompanies the call."
+    ),
+    "compression": (
+        "Must have exactly 10 bullet points starting with '- '. "
+        "All 7 industries must appear: healthcare, finance, transport, "
+        "manufacturing, education, energy, agriculture. "
+        "Key statistics must be preserved where mentioned in source."
+    ),
+    "yaml_generation": (
+        "Must be parseable YAML. "
+        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
+        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
+        "readinessProbe httpGet /healthz port 80. "
+        "Do not penalise for additional valid YAML fields not specified."
+    ),
+    "artifact_mermaid": (
+        "Must be a valid Mermaid code block with opening and closing fences. "
+        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
+        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
+        "Stages should appear in the correct pipeline order."
+    ),
+    "multi_step_agent": (
+        "Must show 3 distinct tool calls using different functions. "
+        "Preferred sequence: web_search → scrape_page → summarize. "
+        "Must end with 'Final: [answer]'. "
+        "Score based on: correct tool names, distinct calls, final answer present."
+    ),
+    "json_schema": (
+        "Must be valid JSON Schema (parseable JSON). "
+        "Must define: apiVersion as string required, "
+        "kind as string required with enum [Deployment, Service, ConfigMap], "
+        "metadata as object required with name and namespace as string properties, "
+        "spec as object required with additionalProperties allowed. "
+        "Award marks proportionally to how many of these are correctly specified."
+    ),
+    "context_begin":   "The project name is Project Aurora.",
+    "context_middle":  "The budget allocated to Phase 2 is $2.4 million.",
+    "context_end":     "The selected vendor is Nexora Systems (Vendor B).",
+    "rag": (
+        "A structured summary that covers the main topics in the provided notes. "
+        "Should be under 200 words. "
+        "Should preserve key facts without inventing new information. "
+        "Do not penalise for including accurate details from the source."
+    ),
+}
+
+# ============================================
+# JUDGE RUBRICS (per test — what to evaluate)
+# Criteria-based, not answer-anchored.
+# ============================================
+
+JUDGE_RUBRICS = {
+    "reasoning": (
+        "Check: Is the final number 27? Are the three calculation steps "
+        "(120×0.30, 36×0.25, 36-9) present and correct? Is it within 150 words?"
+    ),
+    "agent": (
+        "Check each requirement: "
+        "(1) At least 2 named GPU models? "
+        "(2) VRAM and bandwidth compared? "
+        "(3) 2026 pricing uncertainty acknowledged? "
+        "(4) Benchmarking source named? "
+        "(5) No GPU under 16GB VRAM recommended? "
+        "Score 2 points per requirement met (max 10). "
+        "Score 0 if any GPU under 16GB is recommended."
+    ),
+    "math": (
+        "Check: Is the answer 5 minutes? "
+        "Does the explanation correctly state that each machine's rate "
+        "is independent of quantity? Is it within 100 words?"
+    ),
+    "rag": (
+        "Check: Does it cover the main topics from the notes? "
+        "Is it under 200 words? "
+        "Does it avoid inventing facts not in the source? "
+        "Is it clearly structured?"
+    ),
+}
+
+DEFAULT_RUBRIC = (
+    "Check whether the output correctly fulfils all requirements stated "
+    "in the original prompt. Score based on correctness and completeness, "
+    "not on style or verbosity beyond what the prompt requires."
+)
+
+
+# ============================================
+# DYNAMIC PROMPT BUILDERS
+# ============================================
+
+def _write_sample_if_missing(path, content):
+    """Create `path` (and its parent directory) holding `content`, unless the file exists."""
+    # Derive the directory from the configured path instead of hard-coding
+    # "./rag_samples", so changing CONTEXT_FILE / RAG_FILE in config.py works.
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+    if os.path.exists(path):
+        return
+    # Explicit UTF-8: the samples contain non-ASCII characters ("—") and
+    # must not depend on the platform's locale encoding.
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content)
+    print(f"  Created: {path}")
+
+
+def _read_text(path):
+    """Read `path` as UTF-8; undecodable bytes are replaced rather than raising."""
+    # errors="replace" keeps files written by an earlier run under a
+    # different locale encoding readable instead of aborting the benchmark.
+    with open(path, encoding="utf-8", errors="replace") as f:
+        return f.read()
+
+
+def ensure_context_file():
+    """Write the default long-context document to CONTEXT_FILE if it does not exist yet."""
+    content = """# Project Aurora — Strategic Initiative Report
+
+## Executive Summary
+Project Aurora is a digital transformation initiative launched January 2024.
+Proposed by CTO Maria Chen. Budget: $8.7M over three years.
+
+## Phase 2 — Cloud Migration
+Phase 2 budget allocation: $2.4 million.
+
+## Vendor Recommendation
+Vendor A (CloudScale) — $1.8M, limited EU.
+Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
+Vendor C (PrimeHost) — $1.4M, no SOC2.
+Vendor D (Stratos) — $2.8M, over budget.
+
+Final recommendation: proceed with Vendor B (Nexora Systems).
+"""
+    _write_sample_if_missing(CONTEXT_FILE, content)
+
+
+def ensure_rag_file():
+    """Write the default RAG notes to RAG_FILE if it does not exist yet."""
+    content = """# Homelab Infrastructure Notes
+
+## K8s Cluster
+- 4 nodes, Longhorn storage, Traefik ingress
+- FluxCD for GitOps, prune: false on llm namespace
+- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker
+
+## Ollama VM
+- hostname: chat.h0melab.uk, IP: 10.0.20.57
+- GPU: RTX 5060 Ti 16GB, port 11434
+- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b
+
+## Services
+- Gitea at gitea.int, SSH port 3333
+- Netdata + VictoriaMetrics for monitoring
+- Signal bot with Whisper for voice transcription
+- wiki-processor auto-generates Obsidian wiki
+"""
+    _write_sample_if_missing(RAG_FILE, content)
+
+
+def build_all_prompts():
+    """
+    Return complete prompt dict including dynamic context and RAG prompts.
+
+    Starts from a copy of PROMPTS, makes sure both sample files exist, then
+    adds "context_begin", "context_middle" and "context_end" (the same
+    document followed by three different questions) and "rag" (a
+    summarisation instruction followed by the notes). If RAG_FILE is
+    missing, "rag" becomes a placeholder prompt; if CONTEXT_FILE is
+    missing, the three context prompts are left out.
+    """
+    ensure_context_file()
+    ensure_rag_file()
+    prompts = dict(PROMPTS)
+
+    # Context prompts
+    if os.path.exists(CONTEXT_FILE):
+        context = _read_text(CONTEXT_FILE)
+        base = (
+            "Answer in ONE sentence only. "
+            "Use ONLY information from the document below. "
+            "Do not add explanation or context.\n\n"
+            f"DOCUMENT:\n{context}\n\n"
+        )
+        prompts["context_begin"]  = base + "QUESTION: What is the name of the project?"
+        prompts["context_middle"] = base + "QUESTION: What budget was allocated to Phase 2?"
+        prompts["context_end"]    = base + "QUESTION: Which vendor was selected and what is their company name?"
+
+    # RAG prompt
+    if os.path.exists(RAG_FILE):
+        rag_content = _read_text(RAG_FILE)
+        prompts["rag"] = (
+            "Maximum 200 words. Summarize and structure the following notes. "
+            "Preserve all specific facts (IPs, model names, service names). "
+            "Do not add information not present in the notes.\n\n"
+            + rag_content
+        )
+    else:
+        prompts["rag"] = "Maximum 200 words. Summarize: No RAG file found."
+
+    return prompts
--- a/reporting.py
+++ b/reporting.py
@@ -0,0 +1,170 @@
+"""
+benchmark_v4/reporting.py
+=========================
+All output formatting — terminal reports and CSV export.
+Completely separate from scoring and storage logic.
+"""
+
+from storage import load_latest_runs, load_all_runs, export_summary_csv
+from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING
+
+
+def _tag(model, all_thinking):
+    """Return the brain marker when model is in all_thinking, else the bolt marker."""
+    if model in all_thinking:
+        return "🧠"
+    return "⚡"
+
+
+def _base(row):
+    """Return "★" for a row whose is_baseline value is truthy, else one space."""
+    if row.get("is_baseline"):
+        return "★"
+    return " "
+
+
+def print_weights():
+    """Print, for every category in CATEGORIES, the sum of its test weights as a percentage."""
+    from config import TEST_WEIGHTS, CATEGORIES
+
+    # Human-readable headings; categories without an entry print their raw key
+    headings = {
+        "agent_tool":    "Agent/Tool reliability (25%)",
+        "coding":        "Coding/Infrastructure (25%)",
+        "rag_context":   "RAG/Context fidelity (20%)",
+        "structured":    "Structured outputs (15%)",
+        "hallucination": "Hallucination resistance (10%)",
+        "reasoning":     "Pure reasoning (5%)",
+    }
+
+    print("\n  TEST WEIGHTS:")
+    for category, test_names in CATEGORIES.items():
+        total_weight = 0
+        for test_name in test_names:
+            total_weight += TEST_WEIGHTS.get(test_name, 0)
+        heading = headings.get(category, category)
+        print(f"    {heading:<42} {total_weight*100:.0f}%")
+
+
+def print_comparison(new_run_ids, existing_baseline_rows):
+    """
+    Print the runs just produced next to the existing baseline rows.
+
+    new_run_ids            -- ids of rows in the runs table, shown under "THIS RUN"
+    existing_baseline_rows -- list of run dicts, printed best-first
+
+    The delta arrow of each new run is measured against the best baseline
+    weighted_avg (0 when there is no baseline). NULL or missing numeric
+    columns are rendered as 0.00 / "?" instead of raising while formatting.
+    """
+    from storage import get_connection
+
+    def w_avg(row):
+        # weighted_avg can be NULL in the DB; treat it as 0 everywhere
+        return float(row.get("weighted_avg") or 0)
+
+    def shown(row, key):
+        # None cannot take a width spec such as ":>4", so map it to "?"
+        value = row.get(key)
+        return "?" if value is None else value
+
+    print("\n" + "=" * 68)
+    print("  📊 RESULTS vs BASELINE")
+    print("=" * 68)
+
+    if existing_baseline_rows:
+        best = max(w_avg(r) for r in existing_baseline_rows)
+        print(f"\n  EXISTING BASELINE (best w_avg: {best:.2f}):")
+        for r in sorted(existing_baseline_rows, key=lambda x: -w_avg(x)):
+            print(
+                f"    {r['model']:<44} "
+                f"w={w_avg(r):>5.2f}  "
+                f"σ={shown(r, 'stdev_all'):>4}  "
+                f"fail={shown(r, 'failure_rate_pct')}%  "
+                f"[{str(r.get('run_date',''))[:10]}]"
+            )
+    else:
+        best = 0
+
+    # Load current runs
+    if new_run_ids:
+        # Only the "?" placeholders are interpolated; the ids are bound as parameters
+        placeholders = ",".join("?" * len(new_run_ids))
+        with get_connection() as conn:
+            new_rows = [dict(r) for r in conn.execute(
+                f"SELECT * FROM runs WHERE id IN ({placeholders})",
+                new_run_ids
+            ).fetchall()]
+
+        print("\n  THIS RUN:")
+        for r in sorted(new_rows, key=lambda x: -w_avg(x)):
+            diff  = w_avg(r) - best
+            # Differences within ±0.05 are reported as "="
+            arrow = "▲" if diff > 0.05 else "▼" if diff < -0.05 else "="
+            tag   = "BASE" if r.get("is_baseline") else "NEW "
+            print(
+                f"    [{tag}] {r['model']:<40} "
+                f"w={w_avg(r):>5.2f}  {arrow}{abs(diff):.2f}  "
+                f"σ={shown(r, 'stdev_all'):>4}  "
+                f"fail={shown(r, 'failure_rate_pct')}%"
+            )
+
+
+def print_full_ranking(best=False):
+    """
+    Print one ranking line per model, in the order the loader returns rows.
+
+    best -- when true, rank each model's best-scoring run (load_best_runs);
+            otherwise rank its latest run (load_latest_runs).
+
+    NULL or missing numeric columns are rendered as 0.00 (weighted avg) or
+    "?" instead of raising while formatting.
+    """
+    from storage import load_best_runs
+
+    def shown(row, key):
+        # None cannot take a width spec such as ":>4", so map it to "?"
+        value = row.get(key)
+        return "?" if value is None else value
+
+    print("\n" + "=" * 68)
+    title = "BEST RUN" if best else "LATEST RUN"
+    print(f"  🏆 FULL RANKING ({title} per model, weighted semantic avg)")
+    print("=" * 68)
+
+    all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING
+    rows = load_best_runs() if best else load_latest_runs()
+
+    for i, r in enumerate(rows, 1):
+        tag  = _tag(r["model"], all_thinking)
+        base = _base(r)
+        print(
+            f"  {i:>2}. {tag}{base} {r['model']:<42} "
+            f"w={float(r.get('weighted_avg') or 0):>5.2f}  "
+            f"σ={shown(r, 'stdev_all'):>4}  "
+            f"fail={shown(r, 'failure_rate_pct'):>4}%  "
+            f"tok/s={shown(r, 'avg_tok_s'):>5}  "
+            f"🌡={shown(r, 'avg_gpu_temp'):>2}°C  "
+            f"[{str(r.get('run_date',''))[:10]}]"
+        )
+
+    print(f"\n  ★=baseline w=weighted avg  σ=stdev(low better)  fail=failure rate  🌡=avg temps")
+
+
+def print_category_breakdown():
+    """Print the six per-category scores of each model's latest run as a table."""
+    print("\n" + "=" * 68)
+    print("  📂 CATEGORY BREAKDOWN (latest run per model)")
+    print("=" * 68)
+
+    latest = load_latest_runs()
+    if not latest:
+        return
+
+    header = f"  {'Model':<40} {'agent':>6} {'code':>6} {'rag':>6} {'struct':>7} {'hall':>5} {'reason':>7}"
+    print(f"\n{header}")
+    print("  " + "-" * 64)
+
+    # (row key, right-aligned column width) in display order
+    columns = (
+        ("cat_agent_tool", 6),
+        ("cat_coding", 6),
+        ("cat_rag_context", 6),
+        ("cat_structured", 7),
+        ("cat_hallucination", 5),
+        ("cat_reasoning", 7),
+    )
+
+    for row in latest:
+        star = "★" if row.get("is_baseline") else " "
+        cells = "  ".join(
+            f"{row.get(key, '?'):>{width}}" for key, width in columns
+        )
+        print(f"  {star}{row['model']:<41} " + cells)
+
+
+def print_compliance_table():
+    """Print the four compliance percentages of each model's latest run as a table."""
+    print("\n" + "=" * 68)
+    print("  ✅ COMPLIANCE RATES (latest run per model)")
+    print("=" * 68)
+
+    latest = load_latest_runs()
+    if not latest:
+        return
+
+    header = f"  {'Model':<44} {'JSON':>6} {'YAML':>6} {'Tool':>6} {'Hall':>6}"
+    print(f"\n{header}")
+    print("  " + "-" * 64)
+
+    def as_percent(value):
+        # A compliance value of None is shown as n/a rather than "None%"
+        if value is None:
+            return "  n/a"
+        return f"{value}%"
+
+    keys = ("compliance_json", "compliance_yaml", "compliance_tool", "compliance_hall")
+
+    for row in latest:
+        star = "★" if row.get("is_baseline") else " "
+        cells = "  ".join(f"{as_percent(row.get(key)):>6}" for key in keys)
+        print(f"  {star}{row['model']:<43} " + cells)
+
+
+def run_report():
+    """Print ranking, category and compliance sections, then export the summary CSV."""
+    sections = (
+        print_full_ranking,
+        print_category_breakdown,
+        print_compliance_table,
+    )
+    for render in sections:
+        render()
+    export_summary_csv()
--- a/runner.py
+++ b/runner.py
@@ -0,0 +1,276 @@
+"""
+benchmark_v4/runner.py
+======================
+Executes models via Ollama CLI and orchestrates the benchmark loop.
+Handles: warmup, GPU polling, cooldown, multi-run variance.
+"""
+
+import subprocess
+import time
+import re
+import statistics
+from datetime import datetime
+
+from config import (
+    COOLDOWN_SECONDS, GPU_POLL_EVERY,
+    TEST_WEIGHTS, CATEGORIES,
+)
+from prompts import ALL_TESTS
+from validators import normalize_text
+from judge import warmup_judge
+from scoring import (
+    score_test, compute_weighted, compute_category_scores,
+    compute_compliance, compute_variance_stats
+)
+from storage import insert_run, insert_details, insert_variance
+
+
+# ============================================
+# GPU MONITORING
+# ============================================
+
+# Last reading stored by get_gpu(); the initial -1 values mean "no reading yet".
+_gpu_cache      = {"temp": -1, "mem": -1, "util": -1, "clock": -1}
+# Number of get_gpu() calls so far; get_gpu() uses it to decide when to re-poll.
+_gpu_poll_count = 0
+
+
+def get_gpu(force=False):
+    """
+    Return the most recent GPU reading as a dict with keys temp, mem, util, clock.
+
+    nvidia-smi is invoked when force is true, when no successful reading has
+    been cached yet, on every GPU_POLL_EVERY-th call, or on every call when
+    GPU_POLL_EVERY is 0. Otherwise the cached dict is returned. If the query
+    fails, the previous cache is returned unchanged, so callers see -1 values
+    while no reading has ever succeeded.
+    """
+    global _gpu_cache, _gpu_poll_count
+
+    _gpu_poll_count += 1
+    # Without this check the first GPU_POLL_EVERY-1 calls return -1 sentinels
+    have_reading = _gpu_cache["temp"] >= 0
+    off_cycle = GPU_POLL_EVERY > 0 and _gpu_poll_count % GPU_POLL_EVERY != 0
+    if not force and have_reading and off_cycle:
+        return _gpu_cache
+
+    try:
+        result = subprocess.run(
+            ["nvidia-smi",
+             "--query-gpu=temperature.gpu,memory.used,utilization.gpu,clocks.sm",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        # Multi-GPU hosts print one CSV line per device; report the first one
+        first_line = result.stdout.strip().splitlines()[0]
+        temp, mem, util, clock = (
+            int(field.strip()) for field in first_line.split(",")
+        )
+    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
+        # Best effort: missing binary, timeout, or unparsable output
+        # (e.g. empty stdout or "[N/A]" fields) keeps the previous reading.
+        return _gpu_cache
+
+    _gpu_cache = {"temp": temp, "mem": mem, "util": util, "clock": clock}
+    return _gpu_cache
+
+
+# ============================================
+# PARSE OLLAMA VERBOSE
+# ============================================
+
+def parse_generation_speed(output):
+    """
+    Parse the generation (eval) speed from Ollama verbose output.
+
+    Every "<number> tokens/s" occurrence is collected and the LAST one is
+    returned as a float, since the generation rate is printed after the
+    prompt-eval rate. Both decimal ("42.50") and integer ("100") rates are
+    accepted. Returns None when no rate is present.
+    """
+    matches = re.findall(r'(\d+(?:\.\d+)?)\s+tokens/s', output)
+    return float(matches[-1]) if matches else None
+
+
+# ============================================
+# RUN SINGLE MODEL + PROMPT
+# ============================================
+
+def run_model(model, prompt):
+    """
+    Run one prompt through `ollama run <model> <prompt> --verbose`.
+
+    Returns a dict with the combined stdout+stderr text ("output"), the
+    wall-clock duration in seconds ("time"), the parsed generation speed
+    ("tok_s", None when absent) and the GPU reading taken after the call
+    ("gpu_temp", "gpu_mem", "gpu_util", "gpu_clock").
+    """
+    started = time.time()
+    proc = subprocess.run(
+        ["ollama", "run", model, prompt, "--verbose"],
+        capture_output=True, text=True
+    )
+    elapsed = round(time.time() - started, 2)
+    gpu = get_gpu()
+
+    # stderr is appended so the speed parser sees the --verbose stats too
+    combined = "\n".join((proc.stdout, proc.stderr))
+
+    stats = {
+        "output": combined,
+        "time":   elapsed,
+        "tok_s":  parse_generation_speed(combined),
+    }
+    for field in ("temp", "mem", "util", "clock"):
+        stats[f"gpu_{field}"] = gpu[field]
+    return stats
+
+
+# ============================================
+# BENCHMARK A GROUP OF MODELS
+# ============================================
+
+def run_benchmark(
+    models,
+    label,
+    is_baseline,
+    all_prompts,
+    num_runs=1,
+    no_cooldown=False
+):
+    """
+    Run the benchmark for a list of models and persist the results.
+
+    models      -- model names passed to `ollama run`
+    label       -- stored in the "type" column and shown in the console
+    is_baseline -- stored as 1/0 in the is_baseline column
+    all_prompts -- {test_name: prompt}; tests with an empty prompt are skipped
+    num_runs    -- how many times the whole test list is repeated per model
+    no_cooldown -- when true, skip the COOLDOWN_SECONDS pause between tests
+
+    For every model: warm up the model and the judge, run every test in
+    ALL_TESTS num_runs times, aggregate the semantic/format scores, print a
+    summary, and insert one runs row plus its details rows (and variance
+    rows when num_runs > 1).
+
+    Returns the list of inserted run ids, one per model.
+    """
+    run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    run_ids  = []
+
+    for model in models:
+        # Accumulate across runs
+        sem_by_test  = {t: [] for t in ALL_TESTS}
+        fmt_by_test  = {t: [] for t in ALL_TESTS}
+        tok_s_all    = []
+        temp_all     = []
+        detail_rows  = []
+
+        print(f"\n[{label}] Model: {model}  ({num_runs} run{'s' if num_runs > 1 else ''})")
+
+        # Warmup: one throwaway prompt before anything is timed
+        subprocess.run(
+            ["ollama", "run", model, "hello"],
+            capture_output=True, text=True
+        )
+        time.sleep(5)
+        warmup_judge()
+
+        for run_num in range(1, num_runs + 1):
+            if num_runs > 1:
+                print(f"\n  ── Run {run_num}/{num_runs} ──")
+
+            for test_name in ALL_TESTS:
+                prompt = all_prompts.get(test_name, "")
+                if not prompt or not prompt.strip():
+                    continue
+
+                result  = run_model(model, prompt)
+                scores  = score_test(test_name, prompt, result["output"])
+
+                sem = scores["semantic_score"]
+                fmt = scores["format_score"]
+
+                sem_by_test[test_name].append(sem)
+                fmt_by_test[test_name].append(fmt)
+
+                # Unparsed speeds (None) and -1 "no reading" temps are left
+                # out of the averages
+                if result["tok_s"]:
+                    tok_s_all.append(result["tok_s"])
+                if result["gpu_temp"] > 0:
+                    temp_all.append(result["gpu_temp"])
+
+                # J = judge was consulted, V = validator only
+                flag = "J" if scores["used_judge"] else "V"
+                print(
+                    f"  [{run_num}] {test_name:<22} [{flag}]  "
+                    f"sem={sem:>2}/10  fmt={fmt:>2}/10  "
+                    f"comb={scores['combined_score']:>5.2f}  "
+                    f"{scores['notes'][:52]}"
+                )
+
+                detail_rows.append({
+                    "run_date":      run_date,
+                    "run_num":       run_num,
+                    "model":         model,
+                    "type":          label,
+                    "is_baseline":   1 if is_baseline else 0,
+                    "test":          test_name,
+                    "weight":        TEST_WEIGHTS.get(test_name, 0),
+                    "time_s":        result["time"],
+                    "tok_s":         result["tok_s"],
+                    "gpu_temp":      result["gpu_temp"],
+                    "gpu_mem":       result["gpu_mem"],
+                    "gpu_util":      result["gpu_util"],
+                    "gpu_clock":     result["gpu_clock"],
+                    "output_length": len(result["output"]),
+                    "semantic_score":sem,
+                    "format_score":  fmt,
+                    "combined_score":scores["combined_score"],
+                    "used_judge":    1 if scores["used_judge"] else 0,
+                    "notes":         scores["notes"][:120],
+                })
+
+                if not no_cooldown:
+                    time.sleep(COOLDOWN_SECONDS)
+
+        # Aggregate
+        avg_sem    = {t: round(statistics.mean(v), 2) for t, v in sem_by_test.items() if v}
+        avg_fmt    = {t: round(statistics.mean(v), 2) for t, v in fmt_by_test.items() if v}
+        w_total, w_avg = compute_weighted(avg_sem)
+        cat_scores = compute_category_scores(avg_sem)
+        compliance = compute_compliance(sem_by_test)
+        var_stats  = compute_variance_stats(sem_by_test)
+        # fmt_by_test always has a key per test, so test the flattened list:
+        # statistics.mean([]) raises when every test was skipped.
+        all_fmt    = [s for v in fmt_by_test.values() for s in v]
+        fmt_avg    = round(statistics.mean(all_fmt), 2) if all_fmt else 0
+        avg_tok    = round(statistics.mean(tok_s_all), 1) if tok_s_all else 0
+        avg_tmp    = round(statistics.mean(temp_all), 1) if temp_all else 0
+
+        print(f"\n  ─── {model} ───")
+        print(f"      Weighted avg:   {w_avg}  (total={w_total})")
+        print(f"      Format avg:     {fmt_avg}/10")
+        print(f"      Variance:       mean={var_stats['mean']} σ={var_stats['stdev']} failures={var_stats['failure_rate']}%")
+        print(f"      Compliance:     JSON={compliance.get('json_valid')}% YAML={compliance.get('yaml_valid')}% "
+              f"tool={compliance.get('tool_format')}% hall={compliance.get('hallucination_free')}%")
+        print(f"      Categories:     agent={cat_scores.get('agent_tool')} coding={cat_scores.get('coding')} "
+              f"rag={cat_scores.get('rag_context')} struct={cat_scores.get('structured')} "
+              f"hall={cat_scores.get('hallucination')} reason={cat_scores.get('reasoning')}")
+        print(f"      tok/s={avg_tok}  temp={avg_tmp}°C")
+
+        # Save to DB
+        run_row = {
+            "run_date":        run_date,
+            "model":           model,
+            "type":            label,
+            "is_baseline":     1 if is_baseline else 0,
+            "num_runs":        num_runs,
+            "weighted_total":  w_total,
+            "weighted_avg":    w_avg,
+            "avg_format":      fmt_avg,
+            "mean_all":        var_stats["mean"],
+            "stdev_all":       var_stats["stdev"],
+            "min_score":       var_stats["min"],
+            "max_score":       var_stats["max"],
+            "failure_rate_pct":var_stats["failure_rate"],
+            "compliance_json": compliance.get("json_valid"),
+            "compliance_yaml": compliance.get("yaml_valid"),
+            "compliance_tool": compliance.get("tool_format"),
+            "compliance_hall": compliance.get("hallucination_free"),
+            "cat_agent_tool":  cat_scores.get("agent_tool"),
+            "cat_coding":      cat_scores.get("coding"),
+            "cat_rag_context": cat_scores.get("rag_context"),
+            "cat_structured":  cat_scores.get("structured"),
+            "cat_hallucination":cat_scores.get("hallucination"),
+            "cat_reasoning":   cat_scores.get("reasoning"),
+            "avg_tok_s":       avg_tok,
+            "avg_gpu_temp":    avg_tmp,
+            "tests_run":       len(avg_sem) * num_runs,
+        }
+
+        run_id = insert_run(run_row)
+        insert_details(run_id, detail_rows)
+
+        # Variance rows (only if multiple runs)
+        if num_runs > 1:
+            var_rows = []
+            for test_name, scores_list in sem_by_test.items():
+                # statistics.stdev needs at least two samples
+                if len(scores_list) > 1:
+                    var_rows.append({
+                        "run_date":        run_date,
+                        "model":           model,
+                        "test":            test_name,
+                        "num_runs":        num_runs,
+                        "mean":            round(statistics.mean(scores_list), 2),
+                        "stdev":           round(statistics.stdev(scores_list), 2),
+                        "min_score":       min(scores_list),
+                        "max_score":       max(scores_list),
+                        "failure_rate_pct":round(
+                            sum(1 for s in scores_list if s <= 2) / len(scores_list) * 100, 1
+                        ),
+                        "scores_raw":      str(scores_list),
+                    })
+            if var_rows:
+                insert_variance(var_rows)
+
+        run_ids.append(run_id)
+
+        # NOTE(review): this 30 s pause runs even when no_cooldown is true;
+        # confirm whether that is intended.
+        print(f"\nCooldown after {model}...\n")
+        time.sleep(30)
+
+    return run_ids
--- a/scoring.py
+++ b/scoring.py
@@ -0,0 +1,193 @@
+"""
+benchmark_v4/scoring.py
+=======================
+Combines validator, judge, and embedding into final scores.
+Computes: format_score, semantic_score, combined_score.
+Computes: category scores, weighted total, compliance, variance.
+"""
+
+import re
+import statistics
+from config import TEST_WEIGHTS, CATEGORIES, COMPLIANCE_GROUPS
+from validators import normalize_text, run_validator
+from judge import call_judge, embedding_score
+from prompts import GROUND_TRUTHS
+
+
+# ============================================
+# FORMAT SCORE
+# ============================================
+
+def compute_format_score(output, prompt):
+    """
+    Score format obedience only (0-10), independent of semantic quality.
+
+    Starts at 10 and subtracts:
+      - 2 when the raw output contains an ANSI CSI escape
+      - up to 3 when the prompt says "Maximum N words" and the cleaned
+        text exceeds N by more than 30%
+      - 2 when the prompt says "no markdown" / "no explanation" and the
+        cleaned text contains at least two code-fence markers
+    """
+    cleaned = normalize_text(output)
+    prompt_lower = prompt.lower()
+    penalty = 0
+
+    # ANSI escape codes in output (model is polluting its output)
+    if re.search(r'\x1b\[', output) is not None:
+        penalty += 2
+
+    # Word limit
+    limit_match = re.search(r'Maximum (\d+) words?', prompt, re.IGNORECASE)
+    if limit_match:
+        allowed = int(limit_match.group(1))
+        actual = len(cleaned.split())
+        if actual > allowed * 1.3:
+            overshoot = int((actual - allowed) / allowed * 5)
+            penalty += min(3, overshoot)
+
+    # Markdown when prompt says "No markdown" or "No explanation"
+    forbids_markdown = "no markdown" in prompt_lower or "no explanation" in prompt_lower
+    # Two or more fence markers means at least one fenced section
+    if forbids_markdown and cleaned.count("```") >= 2:
+        penalty += 2
+
+    return max(0, 10 - penalty)
+
+
+# ============================================
+# COMBINED SCORE
+# ============================================
+
+def score_test(test_name, prompt, raw_output):
+    """
+    Score one test output and return all score components.
+
+    Pipeline:
+      1. Format score from the raw output (always computed)
+      2. Deterministic validator on the normalized output
+      3. Validator verdict is used as-is when it says to skip the judge,
+         or when a high-confidence test scores >= 8
+      4. Otherwise a validator score is blended 80/20 with the judge
+      5. With no validator score: "rag" blends embedding similarity (30%)
+         with the judge (70%); every other test is pure judge
+      6. Combined = semantic * 0.8 + format * 0.2
+
+    Returns a dict with semantic_score, format_score, combined_score,
+    used_judge and notes.
+    """
+    # Tests whose validator is trusted without the judge once it scores >= 8
+    trusted_tests = {"compression", "artifact_mermaid", "tool_calling",
+                     "yaml_generation", "multi_step_agent"}
+
+    cleaned   = normalize_text(raw_output)
+    fmt_score = compute_format_score(raw_output, prompt)
+    val_score, skip_judge, val_notes = run_validator(test_name, cleaned)
+
+    if val_score is None:
+        if test_name == "rag":
+            reference = GROUND_TRUTHS.get("rag", "")
+            similarity = embedding_score(cleaned, reference)
+            judge_score, judge_reason = call_judge(test_name, prompt, cleaned)
+            # Weight judge more — embedding unreliable for technical content
+            if similarity == 0:
+                # embedding failed, use judge only
+                semantic = judge_score
+            else:
+                semantic = round(similarity * 0.3 + judge_score * 0.7)
+            notes = f"embed={similarity} j={judge_score} → {judge_reason[:50]}"
+        else:
+            # Pure judge
+            judge_score, judge_reason = call_judge(test_name, prompt, cleaned)
+            semantic = judge_score
+            notes    = judge_reason[:80]
+        used_judge = True
+
+    elif skip_judge or (test_name in trusted_tests and val_score >= 8):
+        # Definitive validator verdict, or a trusted validator with a high score
+        semantic   = val_score
+        used_judge = False
+        notes      = val_notes
+
+    else:
+        # Partial validator score — blend with judge (80/20)
+        judge_score, judge_reason = call_judge(test_name, prompt, cleaned)
+        semantic   = round(val_score * 0.8 + judge_score * 0.2)
+        used_judge = True
+        notes      = f"val={val_score} j={judge_score} → {judge_reason[:55]}"
+
+    # Combined: 80% semantic, 20% format
+    combined = round(semantic * 0.8 + fmt_score * 0.2, 2)
+
+    return {
+        "semantic_score": int(semantic),
+        "format_score":   fmt_score,
+        "combined_score": combined,
+        "used_judge":     used_judge,
+        "notes":          notes,
+    }
+
+
+# ============================================
+# WEIGHTED + CATEGORY SCORES
+# ============================================
+
+def compute_weighted(semantic_scores):
+    """
+    Fold per-test semantic scores into a weighted total and average.
+
+    Each score is divided by 10, multiplied by its TEST_WEIGHTS entry
+    (0 for unknown tests) and by 7, then summed. The average is that sum
+    divided by the sum of the weights used.
+
+    Returns (weighted_total, weighted_avg), both rounded to 2 decimals,
+    or (0, 0) when the weights sum to zero.
+    """
+    weighted_sum = 0.0
+    weight_sum = 0.0
+    for test_name, score in semantic_scores.items():
+        weight = TEST_WEIGHTS.get(test_name, 0)
+        # NOTE(review): the factor 7 presumably rescales results to a
+        # 0-7 range — confirm the intended scale.
+        weighted_sum += (score / 10) * weight * 7
+        weight_sum += weight
+
+    if weight_sum == 0:
+        return 0, 0
+
+    average = weighted_sum / weight_sum
+    return round(weighted_sum, 2), round(average, 2)
+
+
+def compute_category_scores(semantic_scores):
+    """
+    Average the semantic scores of the tests belonging to each category.
+
+    Tests missing from semantic_scores are ignored; a category with no
+    scored test gets 0. Returns {category_name: avg_score}.
+    """
+    averages = {}
+    for category, members in CATEGORIES.items():
+        present = [semantic_scores[name] for name in members if name in semantic_scores]
+        if present:
+            averages[category] = round(sum(present) / len(present), 2)
+        else:
+            averages[category] = 0
+    return averages
+
+
+def compute_compliance(semantic_scores_by_run):
+    """
+    Compute, per compliance group, the share of scores that are >= 8.
+
+    Input:   {test_name: [score_run1, score_run2, ...]}
+    Returns: {group_name: percentage rounded to 1 decimal}, with None for
+             a group that has no scores at all.
+    """
+    rates = {}
+    for group, members in COMPLIANCE_GROUPS.items():
+        # Pool the scores of every test in the group across all runs
+        pooled = [
+            score
+            for name in members
+            if name in semantic_scores_by_run
+            for score in semantic_scores_by_run[name]
+        ]
+        if not pooled:
+            rates[group] = None
+            continue
+        passed = len([score for score in pooled if score >= 8])
+        rates[group] = round(passed / len(pooled) * 100, 1)
+    return rates
+
+
+def compute_variance_stats(scores_by_test):
+    """
+    Summarise every score of every test as one flat sample.
+
+    Input:   {test_name: [score_run1, score_run2, ...]}
+    Returns: dict with keys mean, stdev, min, max and failure_rate, where
+             failure_rate is the percentage of scores <= 2. All values are
+             0 for empty input; stdev is 0 for a single score.
+    """
+    flat = []
+    for run_scores in scores_by_test.values():
+        flat.extend(run_scores)
+
+    if not flat:
+        return {"mean": 0, "stdev": 0, "min": 0, "max": 0, "failure_rate": 0}
+
+    # Sample standard deviation is undefined for fewer than two values
+    spread = round(statistics.stdev(flat), 2) if len(flat) > 1 else 0
+    failures = sum(1 for score in flat if score <= 2)
+
+    return {
+        "mean":         round(statistics.mean(flat), 2),
+        "stdev":        spread,
+        "min":          min(flat),
+        "max":          max(flat),
+        "failure_rate": round(failures / len(flat) * 100, 1),
+    }
--- a/storage.py
+++ b/storage.py
@@ -0,0 +1,279 @@
+"""
+benchmark_v4/storage.py
+=======================
+SQLite persistence for benchmark results.
+Three tables:
+  - runs:     one row per model per benchmark run
+  - details:  one row per test per model per run
+  - variance: one row per test per model (multi-run stats)
+
+Query examples:
+  SELECT model, weighted_avg, stdev_all
+  FROM runs
+  WHERE is_baseline = 1
+  ORDER BY weighted_avg DESC;
+
+  SELECT model, test, semantic_score
+  FROM details
+  WHERE run_id = (SELECT MAX(id) FROM runs WHERE model = 'granite4.1:8b');
+"""
+
+import sqlite3
+import json
+from datetime import datetime
+from config import DB_FILE
+
+
+# ============================================
+# SCHEMA
+# ============================================
+
+# DDL script executed by init_db(): creates the runs, details and variance
+# tables plus their indexes. Every statement uses IF NOT EXISTS.
+SCHEMA = """
+CREATE TABLE IF NOT EXISTS runs (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_date        TEXT    NOT NULL,
+    model           TEXT    NOT NULL,
+    type            TEXT    NOT NULL,
+    is_baseline     INTEGER NOT NULL DEFAULT 0,
+    num_runs        INTEGER NOT NULL DEFAULT 1,
+
+    -- Weighted scores
+    weighted_total  REAL,
+    weighted_avg    REAL,
+
+    -- Format
+    avg_format      REAL,
+
+    -- Variance
+    mean_all        REAL,
+    stdev_all       REAL,
+    min_score       REAL,
+    max_score       REAL,
+    failure_rate_pct REAL,
+
+    -- Compliance (%)
+    compliance_json  REAL,
+    compliance_yaml  REAL,
+    compliance_tool  REAL,
+    compliance_hall  REAL,
+
+    -- Category scores
+    cat_agent_tool   REAL,
+    cat_coding       REAL,
+    cat_rag_context  REAL,
+    cat_structured   REAL,
+    cat_hallucination REAL,
+    cat_reasoning    REAL,
+
+    -- Performance
+    avg_tok_s       REAL,
+    avg_gpu_temp    REAL,
+
+    tests_run       INTEGER
+);
+
+CREATE TABLE IF NOT EXISTS details (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id          INTEGER NOT NULL REFERENCES runs(id),
+    run_date        TEXT    NOT NULL,
+    run_num         INTEGER NOT NULL DEFAULT 1,
+    model           TEXT    NOT NULL,
+    type            TEXT    NOT NULL,
+    is_baseline     INTEGER NOT NULL DEFAULT 0,
+
+    test            TEXT    NOT NULL,
+    weight          REAL,
+    time_s          REAL,
+    tok_s           REAL,
+    gpu_temp        INTEGER,
+    gpu_mem         INTEGER,
+    gpu_util        INTEGER,
+    gpu_clock       INTEGER,
+    output_length   INTEGER,
+
+    semantic_score  INTEGER,
+    format_score    INTEGER,
+    combined_score  REAL,
+    used_judge      INTEGER,
+    notes           TEXT
+);
+
+CREATE TABLE IF NOT EXISTS variance (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_date        TEXT    NOT NULL,
+    model           TEXT    NOT NULL,
+    test            TEXT    NOT NULL,
+    num_runs        INTEGER NOT NULL,
+    mean            REAL,
+    stdev           REAL,
+    min_score       INTEGER,
+    max_score       INTEGER,
+    failure_rate_pct REAL,
+    scores_raw      TEXT
+);
+
+CREATE INDEX IF NOT EXISTS idx_runs_model    ON runs(model);
+CREATE INDEX IF NOT EXISTS idx_details_run   ON details(run_id);
+CREATE INDEX IF NOT EXISTS idx_details_model ON details(model);
+CREATE INDEX IF NOT EXISTS idx_details_test  ON details(test);
+"""
+
+
+# ============================================
+# CONNECTION
+# ============================================
+
+class _ClosingConnection(sqlite3.Connection):
+    """sqlite3 connection whose ``with`` block also closes it on exit."""
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        try:
+            # The base class commits on success and rolls back on error
+            return super().__exit__(exc_type, exc_value, traceback)
+        finally:
+            # The stock context manager never closes the connection
+            self.close()
+
+
+def get_connection():
+    """
+    Open a connection to DB_FILE configured for this module.
+
+    Rows are returned as sqlite3.Row, the journal mode is set to WAL and
+    foreign-key enforcement is switched on. When the connection is used as
+    a context manager (`with get_connection() as conn:`), the transaction
+    is committed or rolled back AND the connection is closed on exit, so
+    rows must be fetched inside the `with` block.
+    """
+    conn = sqlite3.connect(DB_FILE, factory=_ClosingConnection)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA foreign_keys=ON")
+    return conn
+
+
+def init_db():
+    """Run the SCHEMA script; its statements only create what is missing."""
+    connection = get_connection()
+    with connection:
+        connection.executescript(SCHEMA)
+
+
+# ============================================
+# WRITE
+# ============================================
+
+def insert_run(run_data):
+    """
+    Insert one summary row into runs.
+
+    run_data -- dict supplying every named parameter of the statement below.
+    Returns the id of the inserted row.
+    """
+    statement = """
+    INSERT INTO runs (
+        run_date, model, type, is_baseline, num_runs,
+        weighted_total, weighted_avg, avg_format,
+        mean_all, stdev_all, min_score, max_score, failure_rate_pct,
+        compliance_json, compliance_yaml, compliance_tool, compliance_hall,
+        cat_agent_tool, cat_coding, cat_rag_context,
+        cat_structured, cat_hallucination, cat_reasoning,
+        avg_tok_s, avg_gpu_temp, tests_run
+    ) VALUES (
+        :run_date, :model, :type, :is_baseline, :num_runs,
+        :weighted_total, :weighted_avg, :avg_format,
+        :mean_all, :stdev_all, :min_score, :max_score, :failure_rate_pct,
+        :compliance_json, :compliance_yaml, :compliance_tool, :compliance_hall,
+        :cat_agent_tool, :cat_coding, :cat_rag_context,
+        :cat_structured, :cat_hallucination, :cat_reasoning,
+        :avg_tok_s, :avg_gpu_temp, :tests_run
+    )
+    """
+    with get_connection() as db:
+        return db.execute(statement, run_data).lastrowid
+
+
+def insert_details(run_id, detail_rows):
+    """
+    Insert the per-test rows of one run into details.
+
+    run_id      -- id of the owning runs row, stamped onto every detail row
+    detail_rows -- list of dicts supplying the remaining named parameters
+    """
+    statement = """
+    INSERT INTO details (
+        run_id, run_date, run_num, model, type, is_baseline,
+        test, weight, time_s, tok_s,
+        gpu_temp, gpu_mem, gpu_util, gpu_clock, output_length,
+        semantic_score, format_score, combined_score, used_judge, notes
+    ) VALUES (
+        :run_id, :run_date, :run_num, :model, :type, :is_baseline,
+        :test, :weight, :time_s, :tok_s,
+        :gpu_temp, :gpu_mem, :gpu_util, :gpu_clock, :output_length,
+        :semantic_score, :format_score, :combined_score, :used_judge, :notes
+    )
+    """
+    # Copy each row so the caller's dicts are left untouched
+    stamped = [dict(row, run_id=run_id) for row in detail_rows]
+    with get_connection() as db:
+        db.executemany(statement, stamped)
+
+
+def insert_variance(variance_rows):
+    """Insert per-test multi-run statistics rows into variance."""
+    statement = """
+    INSERT INTO variance (
+        run_date, model, test, num_runs,
+        mean, stdev, min_score, max_score, failure_rate_pct, scores_raw
+    ) VALUES (
+        :run_date, :model, :test, :num_runs,
+        :mean, :stdev, :min_score, :max_score, :failure_rate_pct, :scores_raw
+    )
+    """
+    with get_connection() as db:
+        db.executemany(statement, variance_rows)
+
+
+# ============================================
+# READ
+# ============================================
+def load_best_runs():
+    """
+    Load the best-scoring run of every model, best model first.
+
+    Exactly one row is returned per model: when several runs of a model
+    tie on the highest weighted_avg, the most recently inserted one
+    (highest id) is chosen. Runs with a NULL weighted_avg are never
+    selected, so a model with only such runs is omitted.
+
+    Returns a list of dicts, one per runs row.
+    """
+    with get_connection() as conn:
+        rows = conn.execute("""
+            SELECT r.*
+            FROM runs r
+            WHERE r.id = (
+                SELECT r2.id
+                FROM runs r2
+                WHERE r2.model = r.model
+                  AND r2.weighted_avg IS NOT NULL
+                ORDER BY r2.weighted_avg DESC, r2.id DESC
+                LIMIT 1
+            )
+            ORDER BY r.weighted_avg DESC
+        """).fetchall()
+    return [dict(r) for r in rows]
+
+
+def load_latest_runs(is_baseline=None):
+    """
+    Load each model's run with the greatest run_date, highest weighted_avg first.
+
+    is_baseline -- None returns every model; otherwise the result is
+                   narrowed to rows whose is_baseline flag is 1 (truthy
+                   argument) or 0 (falsy argument).
+
+    Returns a list of dicts, one per runs row.
+    """
+    clauses = ["""
+    SELECT r.*
+    FROM runs r
+    INNER JOIN (
+        SELECT model, MAX(run_date) AS latest
+        FROM runs
+        GROUP BY model
+    ) latest ON r.model = latest.model AND r.run_date = latest.latest
+    """]
+    bindings = []
+
+    if is_baseline is not None:
+        clauses.append(" WHERE r.is_baseline = ?")
+        bindings.append(1 if is_baseline else 0)
+
+    clauses.append(" ORDER BY r.weighted_avg DESC")
+    query = "".join(clauses)
+
+    with get_connection() as db:
+        fetched = db.execute(query, bindings).fetchall()
+    return [dict(row) for row in fetched]
+
+
+def load_all_runs():
+    """Return every runs row as a dict, ordered by run_date descending."""
+    query = "SELECT * FROM runs ORDER BY run_date DESC"
+    with get_connection() as db:
+        fetched = db.execute(query).fetchall()
+    return [dict(row) for row in fetched]
+
+
+def load_details_for_run(run_id):
+    """Return the details rows of one run as dicts, ordered by test name."""
+    query = "SELECT * FROM details WHERE run_id = ? ORDER BY test"
+    with get_connection() as db:
+        fetched = db.execute(query, (run_id,)).fetchall()
+    return [dict(row) for row in fetched]
+
+
+def export_summary_csv(filepath="benchmark_summary.csv"):
+    """
+    Write the latest run of every model to a CSV file.
+
+    filepath -- destination path; the file is overwritten.
+    The column order is the key order of the first row. Nothing is written
+    when there are no runs.
+    """
+    import csv
+
+    latest = load_latest_runs()
+    if not latest:
+        print("No runs to export.")
+        return
+
+    columns = latest[0].keys()
+    with open(filepath, "w", newline="", encoding="utf-8") as out:
+        sheet = csv.DictWriter(out, fieldnames=columns)
+        sheet.writeheader()
+        sheet.writerows(latest)
+
+    print(f"  Exported {len(latest)} rows to {filepath}")
--- a/validators.py
+++ b/validators.py
@@ -0,0 +1,467 @@
+"""
+benchmark_v4/validators.py
+==========================
+Layer 1: Deterministic validators.
+No LLM judge needed. Returns (score 0-10, notes str).
+A score of 0 or 10 is definitive — judge is skipped.
+Partial scores (1-9) trigger judge blending.
+"""
+
+import re
+import json
+
+try:
+    import yaml
+    YAML_AVAILABLE = True
+except ImportError:
+    YAML_AVAILABLE = False
+
+try:
+    from rapidfuzz import fuzz
+    FUZZY_AVAILABLE = True
+except ImportError:
+    FUZZY_AVAILABLE = False
+
+
+# ============================================
+# TEXT NORMALIZATION
+# ============================================
+
+def normalize_text(text, mode="plain"):
+    """
+    Centralized cleanup of raw model output.
+
+    mode="plain" — remove ANSI escapes, control characters, spinner glyphs,
+                   thinking blocks and Ollama verbose-stat lines
+    mode="json"  — plain cleanup, then remove markdown fences and trim
+    mode="yaml"  — same as "json"
+    Any other mode value behaves like "plain".
+    """
+    # Escapes and control bytes go first so the later patterns see clean text.
+    text = re.sub(r'\x1b(?:[@-Z\\-_]|\[[0-9;?]*[A-Za-z])', '', text)
+    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+
+    # Ollama spinner/progress glyphs.
+    text = re.sub(r'[⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏]+', '', text)
+
+    # Narrow / non-breaking spaces become ordinary spaces.
+    for odd_space in ('\u202f', '\u00a0', '\u2009'):
+        text = text.replace(odd_space, ' ')
+
+    # Thinking traces: tagged form first, then the "Thinking... done" form.
+    for thinking in (r'<think>.*?</think>',
+                     r'Thinking\.\.\..*?\.\.\.done thinking\.?'):
+        text = re.sub(thinking, '', text, flags=re.DOTALL)
+
+    # Ollama verbose stats are dropped line by line, after all other cleanup.
+    stat_markers = (
+        "total duration:", "load duration:", "prompt eval",
+        "eval count:", "eval duration:", "eval rate:", "tokens/s", "token(s)",
+    )
+    kept = []
+    for line in text.split("\n"):
+        lowered = line.lower()
+        if not any(marker in lowered for marker in stat_markers):
+            kept.append(line)
+    text = "\n".join(kept)
+
+    if mode not in ("json", "yaml"):
+        return text
+
+    # Fence markers at line start / line end, then any leftover junk lines.
+    text = re.sub(r'^```(?:json|yaml|)?\s*', '', text, flags=re.MULTILINE)
+    text = re.sub(r'```\s*$', '', text, flags=re.MULTILINE)
+    survivors = []
+    for line in text.split('\n'):
+        stripped = line.strip()
+        if stripped.startswith('[?') or stripped.startswith('```'):
+            continue
+        if '\x1b' in line or re.search(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', line):
+            continue
+        survivors.append(line)
+    return '\n'.join(survivors).strip()
+
+
+# ============================================
+# JSON EXTRACTION
+# ============================================
+
+def _scan_json_objects(text, start):
+    """Return every JSON object (dict) decodable from text[start:], in order."""
+    decoder = json.JSONDecoder()
+    objects = []
+    idx = start
+    while idx < len(text):
+        brace = text.find('{', idx)
+        if brace == -1:
+            break
+        try:
+            obj, end = decoder.raw_decode(text, brace)
+        except json.JSONDecodeError:
+            # Not a valid object at this brace — retry from the next character.
+            idx = brace + 1
+            continue
+        if isinstance(obj, dict):
+            objects.append(obj)
+        # Resume after the decoded object so nested objects are not re-reported.
+        idx = end
+    return objects
+
+
+def _last_populated_answer(objects):
+    """Return the last dict whose "recommendations" list has an entry with a truthy "gpu", else None."""
+    for candidate in reversed(objects):
+        recs = candidate.get("recommendations")
+        if isinstance(recs, list) and any(
+            isinstance(r, dict) and r.get("gpu") for r in recs
+        ):
+            return candidate
+    return None
+
+
+def extract_json_object(text):
+    """
+    Advanced JSON extractor that handles prompt-echoing,
+    large whitespace blocks, and multiple JSON objects.
+
+    Returns the chosen dict, or None when no JSON object can be decoded.
+    Preference order:
+      1. last populated "recommendations" object at/after the last
+         '"recommendations"' key (skips a leading prompt echo)
+      2. last populated "recommendations" object anywhere in the text
+         (covers a template echoed AFTER the real answer)
+      3. last decodable object at/after the last key, else anywhere
+    """
+    # 1. Aggressive normalization to strip fences and leading/trailing junk
+    text = normalize_text(text, mode="json")
+
+    # 2. Collapse newlines inside JSON strings — fixes mid-value line breaks
+    text = re.sub(r'\n\s*', ' ', text)
+
+    # 3. Skip the prompt-echo/template if the model repeats it: start at the
+    #    brace that opens the object holding the last "recommendations" key.
+    search_start = 0
+    last_keyword_pos = text.rfind('"recommendations"')
+    if last_keyword_pos != -1:
+        search_start = max(text.rfind('{', 0, last_keyword_pos), 0)
+
+    # 4. Parse all valid JSON objects starting from search_start
+    found_objs = _scan_json_objects(text, search_start)
+
+    # 5. Prefer a populated answer over an empty template
+    populated = _last_populated_answer(found_objs)
+    if populated is not None:
+        return populated
+
+    # 6. The tail scan can miss the real answer when the last key occurrence
+    #    belongs to a trailing template echo or to prose; look at the whole
+    #    text before settling for a template or giving up.
+    if search_start > 0:
+        all_objs = _scan_json_objects(text, 0)
+        populated = _last_populated_answer(all_objs)
+        if populated is not None:
+            return populated
+        if not found_objs:
+            found_objs = all_objs
+
+    return found_objs[-1] if found_objs else None
+
+
+# ============================================
+# VALIDATORS
+# ============================================
+
+def validate_tool_calling(text):
+    """
+    Single tool call, no extras.
+
+    Returns (score, notes):
+      0  — more than 3 non-empty lines (explanation added), or no call found
+      10 — call to web_search / scrape_page / calculate with a quoted argument
+      5  — quoted-argument call to some other function name
+    """
+    text = normalize_text(text)
+    lines = [l for l in text.split('\n') if l.strip()]
+
+    if len(lines) > 3:
+        return 0, "multiple lines — explanation added"
+
+    # \b anchors the tool name so identifiers that merely END in a tool name
+    # (e.g. do_web_search) fall through to the "wrong name" branch below.
+    if re.search(r'\b(web_search|scrape_page|calculate)\s*\(["\'].*["\']\)', text):
+        return 10, "valid tool call syntax"
+
+    if re.search(r'\w+\s*\(["\'].*["\']\)', text):
+        return 5, "function call but wrong name"
+
+    return 0, "no valid function call found"
+
+
+def validate_yaml(text):
+    """
+    Must parse as valid YAML Deployment.
+
+    Returns (score, notes):
+      5  — pyyaml is not installed (cannot validate)
+      0  — text is not parseable YAML
+      3  — parses, but the top level is not a mapping
+      otherwise 2 base points, +2 kind == Deployment, +2 spec present,
+      +1 spec.replicas == 2, +1 apiVersion present; a total of 8 is
+      promoted to 10.
+    """
+    if not YAML_AVAILABLE:
+        return 5, "pyyaml not installed"
+
+    # mode="yaml" already removes fences, terminal remnants and control
+    # characters and trims the result, so no second filtering pass is needed.
+    text = normalize_text(text, mode="yaml")
+
+    try:
+        parsed = yaml.safe_load(text)
+    except yaml.YAMLError as e:
+        return 0, f"invalid YAML: {str(e)[:60]}"
+
+    if not isinstance(parsed, dict):
+        return 3, "parsed but not a dict"
+
+    score = 2
+    if parsed.get('kind') == 'Deployment':
+        score += 2
+    if 'spec' in parsed:
+        score += 2
+        spec = parsed['spec']
+        # spec may be null or a scalar in malformed output; only a mapping
+        # can carry a replicas field.
+        if isinstance(spec, dict) and spec.get('replicas') == 2:
+            score += 1
+    if 'apiVersion' in parsed:
+        score += 1
+    if score >= 8:
+        score = 10
+
+    return min(score, 10), f"valid YAML score={score}"
+
+
+def validate_json_output(text):
+    """
+    Nested structured JSON with recommendations array.
+    Expected: {"recommendations": [{"gpu":"","price_eur":0,"vram_gb":0,"pros":[],"cons":[]}]}
+    Scores based on: valid JSON, correct structure, field types, 2 recommendations.
+
+    Returns (score, notes):
+      0 — no JSON object, or no recommendations and no old-format fields
+      4 — old flat format ("gpu" / "price" / "reason") with some content
+      2 — recommendations present but empty or not a list
+      otherwise 4 base points, +2 for two or more entries (+1 for one),
+      up to +3 for required fields on the first entry, +1 for correct types.
+    """
+    parsed = extract_json_object(text)
+    if parsed is None:
+        return 0, "no valid JSON object found"
+
+    # Check top-level structure
+    if "recommendations" not in parsed:
+        # Fallback: old flat format still gets partial credit
+        old_fields = ["gpu", "price", "reason"]
+        present = [k for k in old_fields if k in parsed and str(parsed[k]).strip()]
+        if present:
+            return 4, "flat JSON found (old format), missing nested structure"
+        return 0, "no recommendations array found"
+
+    recs = parsed["recommendations"]
+    if not isinstance(recs, list) or len(recs) == 0:
+        return 2, "recommendations present but empty or not a list"
+
+    required_fields = {"gpu", "price_eur", "vram_gb", "pros", "cons"}
+    score = 4  # base for having recommendations array
+
+    # Check count (the list is known to be non-empty here)
+    score += 2 if len(recs) >= 2 else 1
+
+    # Check field completeness on first recommendation. A non-object entry
+    # (string, number, ...) is treated as having no fields instead of crashing.
+    first = recs[0] if isinstance(recs[0], dict) else {}
+    present = required_fields & set(first)
+    score += int((len(present) / len(required_fields)) * 3)
+
+    def is_number(value):
+        # bool is a subclass of int; JSON true/false is not a numeric value.
+        return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+    # Check type correctness
+    type_ok = (
+        is_number(first.get("price_eur")) and
+        is_number(first.get("vram_gb")) and
+        isinstance(first.get("pros"), list) and
+        isinstance(first.get("cons"), list)
+    )
+    if type_ok:
+        score += 1
+
+    score = min(score, 10)
+    # sorted() keeps the notes stable from run to run (set order is not).
+    return score, f"nested JSON: {len(recs)} recs, fields={sorted(present)}, types_ok={type_ok}"
+
+
+def validate_json_schema(text):
+    """
+    Valid JSON Schema with all required properties.
+
+    Returns (score, notes). Points: apiVersion +2, kind +1 (or +3 when its
+    enum lists Deployment, Service and ConfigMap), metadata +2, spec +2,
+    non-empty "required" +1. No decodable JSON object scores 0.
+    """
+    parsed = extract_json_object(text)
+    if parsed is None:
+        return 0, "no valid JSON Schema found"
+
+    # Model output is untrusted: "properties" may be missing, null or a list.
+    props = parsed.get('properties')
+    if not isinstance(props, dict):
+        props = {}
+    score = 0
+
+    if 'apiVersion' in props:
+        score += 2
+    if 'kind' in props:
+        kind_spec = props['kind']
+        enum_values = kind_spec.get('enum') if isinstance(kind_spec, dict) else None
+        # List membership instead of set(): enum entries may be unhashable.
+        has_enum = isinstance(enum_values, list) and all(
+            wanted in enum_values
+            for wanted in ('Deployment', 'Service', 'ConfigMap')
+        )
+        score += 3 if has_enum else 1
+    if 'metadata' in props:
+        score += 2
+    if 'spec' in props:
+        score += 2
+    if parsed.get('required'):
+        score += 1
+
+    return min(score, 10), f"JSON Schema score={score}/10"
+
+
+def validate_mermaid(text):
+    """
+    Valid Mermaid block with all 8 stages.
+
+    Without a mermaid fence (or a generic fence plus a graph/flowchart
+    keyword) the score is 2; otherwise it is the share of stage names
+    found, scaled to 0-10 and truncated.
+    """
+    cleaned = normalize_text(text)
+    lowered = cleaned.lower()
+
+    if '```mermaid' in lowered:
+        fenced = True
+    else:
+        # A plain fence counts only when the diagram keyword is present too.
+        fenced = '```' in cleaned and ('graph' in lowered or 'flowchart' in lowered)
+    if not fenced:
+        return 2, "no mermaid fence found"
+
+    expected = (
+        "code push", "lint", "unit test", "build",
+        "integration test", "deploy staging", "smoke test", "deploy production",
+    )
+    hits = len([stage for stage in expected if stage in lowered])
+    return int((hits / len(expected)) * 10), f"{hits}/{len(expected)} stages found"
+
+
+def validate_compression(text):
+    """
+    Exactly 10 bullets, all 7 industries.
+
+    A bullet is a line that starts with "- " after stripping. The bullet
+    count selects a base (10 -> 5, 8-12 -> 3, 7 or 13 -> 2) to which the
+    number of industries mentioned is added; the result is capped at 10.
+    """
+    cleaned = normalize_text(text)
+    bullet_total = 0
+    for line in cleaned.strip().split('\n'):
+        if line.strip().startswith('- '):
+            bullet_total += 1
+
+    # Stems, so "finance"/"financial" and similar variants all match.
+    stems = ("healthcare", "financ", "transport", "manufactur",
+             "education", "energy", "agricultur")
+    lowered = cleaned.lower()
+    covered = len([stem for stem in stems if stem in lowered])
+
+    if bullet_total == 10:
+        base = 5
+    elif 8 <= bullet_total <= 12:
+        base = 3
+    elif bullet_total in (7, 13):
+        base = 2
+    else:
+        base = None
+
+    if base is None:
+        # Far from 10 bullets: industries earn nothing here.
+        total = max(0, 2 - abs(bullet_total - 10))
+    else:
+        total = base + covered
+
+    return min(total, 10), f"{bullet_total} bullets, {covered}/7 industries"
+
+
+def validate_multi_step(text):
+    """
+    3 distinct tool calls + final answer.
+
+    A call is a (tool name, quoted argument) pair, so the same tool invoked
+    with different arguments counts as separate calls.
+
+    Returns (score, notes):
+      10 — at least 3 distinct calls, at least 2 different tools, final answer
+      7  — at least 2 distinct calls and a final answer
+      4  — at least 1 call
+      0  — no valid tool call
+    """
+    text = normalize_text(text)
+
+    # Two capture groups make findall return (name, argument) tuples. With a
+    # single group it returns only the tool name, which would collapse
+    # repeated use of one tool into a single "call".
+    tool_calls = re.findall(
+        r'(web_search|scrape_page|summarize)\s*\((["\'][^"\']*["\'])\)', text
+    )
+    # Drop the surrounding quotes so 'x' and "x" are the same argument.
+    distinct     = len({(name, arg[1:-1]) for name, arg in tool_calls})
+    unique_tools = set(re.findall(r'(web_search|scrape_page|summarize)\s*\(', text))
+    has_final    = bool(re.search(r'final\s*[:.]', text, re.IGNORECASE))
+
+    if distinct >= 3 and len(unique_tools) >= 2 and has_final:
+        return 10, f"{distinct} distinct calls, final answer present"
+    elif distinct >= 2 and has_final:
+        return 7, f"only {distinct} distinct tool calls"
+    elif distinct >= 1:
+        return 4, "tool calls found but incomplete"
+    return 0, "no valid tool calls found"
+
+
+def validate_hallucination(text):
+    """
+    The book does not exist.
+    Reward uncertainty. Penalize confident invented specifics.
+
+    Returns (score, notes):
+      0  — hallucination phrasing only
+      10 — refusal / uncertainty phrasing only
+      5  — both kinds present, or neither (left for the judge to settle)
+    """
+    # Clean first, like the other validators, so a thinking trace that
+    # mentions "this book" is not scored as part of the answer.
+    text_lower = normalize_text(text).lower()
+
+    # Good refusal signals
+    refuse_signals = [
+        "does not exist", "doesn't exist", "not exist",
+        "cannot verify", "can't verify", "cannot confirm",
+        "not aware", "no record", "not familiar",
+        "unable to find", "could not find", "not in my",
+        "not certain", "uncertain", "cannot find",
+        "don't have", "do not have", "not sure",
+        "may not exist", "might not exist", "no evidence",
+        "i cannot", "i can't", "i don't know",
+        "fictional", "fabricated", "no such book",
+        "not able to verify", "cannot be verified",
+        "not a real", "i'm not aware", "i am not aware",
+    ]
+
+    # Hard hallucination signals — confident invented content
+    hallucination_signals = [
+        "the book explores", "kessler argues", "the main ideas",
+        "the novel", "in the book", "the author presents",
+        "the story follows", "themes include", "the book examines",
+        "kessler's work", "the book describes",
+        "this book", "the plot", "the protagonist",
+    ]
+
+    hallucinated = next((s for s in hallucination_signals if s in text_lower), None)
+    refused      = next((s for s in refuse_signals if s in text_lower), None)
+
+    # A refusal such as "I'm not aware of this book" contains a hallucination
+    # phrase too. A definitive 0 would be wrong there, so mixed evidence
+    # gets a partial score.
+    if hallucinated and refused:
+        return 5, f"mixed signals: refusal '{refused}' but also '{hallucinated}'"
+    if hallucinated:
+        return 0, f"hallucinated: '{hallucinated}' found"
+    if refused:
+        return 10, f"correctly refused: '{refused}'"
+
+    return 5, "ambiguous — no clear refusal or hallucination"
+
+
+def validate_coding(text):
+    """
+    Python function with LIS and complexity.
+
+    Returns (10, notes) when a def, a return, an LIS mention and a complexity
+    mention are all present; otherwise a partial score (def 3, return 1,
+    LIS 2, complexity 2) capped at 9.
+    """
+    text = normalize_text(text)
+    lowered = text.lower()
+
+    has_def    = 'def ' in text
+    has_return = 'return' in text
+    # "lis" must stand alone (lis, lis(, lis_length, LIS): as a bare substring
+    # it also matches "list", which nearly every Python answer contains.
+    has_lis = (
+        'subsequence' in lowered
+        or 'longest' in lowered
+        or re.search(r'(?<![a-z])lis(?![a-z])', lowered) is not None
+    )
+    has_complexity = any(w in lowered for w in ['o(n', 'n log n', 'n²', 'n^2', 'complexity'])
+
+    if has_def and has_return and has_lis and has_complexity:
+        return 10, "function correct with complexity"
+
+    score = (3 if has_def else 0) + (1 if has_return else 0) + \
+            (2 if has_lis else 0) + (2 if has_complexity else 0)
+
+    return min(score, 9), f"def={has_def} lis={has_lis} complexity={has_complexity}"
+
+
+def validate_context(text, expected_phrase):
+    """
+    Score how well the output contains expected_phrase.
+
+    A case-insensitive substring hit scores 10. Otherwise the score comes
+    from rapidfuzz similarity when it is installed, or from the share of
+    expected words longer than 3 characters that appear in the output.
+    """
+    haystack = normalize_text(text).lower()
+    needle   = expected_phrase.lower()
+
+    if needle in haystack:
+        return 10, "exact match"
+
+    if not FUZZY_AVAILABLE:
+        # Fallback token matching
+        significant = [word for word in needle.split() if len(word) > 3]
+        if not significant:
+            return 5, "no key words to match"
+        hits = len([word for word in significant if word in haystack])
+        return int((hits / len(significant)) * 10), f"{hits}/{len(significant)} tokens"
+
+    best = max(
+        fuzz.partial_ratio(needle, haystack),
+        fuzz.token_set_ratio(needle, haystack),
+    )
+    # (minimum similarity, score, label) — first band reached wins.
+    bands = (
+        (90, 10, "fuzzy match"),
+        (80, 9, "fuzzy match"),
+        (70, 7, "partial match"),
+        (55, 5, "weak match"),
+    )
+    for floor, points, label in bands:
+        if best >= floor:
+            return points, f"{label} {best}%"
+    return max(0, int(best / 12)), f"poor match {best}%"
+
+def validate_agent(text):
+    """
+    Flag answers that mention a GPU or VRAM size from the sub-16GB list.
+
+    Returns (2, notes) naming the first listed term found, else (7, notes).
+    Neither score is definitive, so the judge still evaluates quality.
+    """
+    lowered = normalize_text(text).lower()
+    # Order matters for the notes: the first listed term that matches is reported.
+    below_16gb = (
+        "rtx 4070 ti", "rtx 4070", "rtx 3060", "rtx 3070",
+        "rtx 4060", "rx 6700", "rx 7700", "rx 6600",
+        "12gb", "10gb", "8gb vram",
+    )
+    offender = next((term for term in below_16gb if term in lowered), None)
+    if offender is not None:
+        return 2, f"sub-16GB GPU found: '{offender}'"
+    # No bad GPU — let judge evaluate quality
+    return 7, "no sub-16GB GPU — judge for quality"
+
+# ============================================
+# DISPATCHER
+# ============================================
+
+# Maps a test name to its deterministic validator. Every value is a callable
+# that takes the raw model output and returns (score, notes). Test names that
+# are absent from this map have no validator.
+VALIDATOR_MAP = {
+    "tool_calling":     validate_tool_calling,
+    "yaml_generation":  validate_yaml,
+    "structured":       validate_json_output,
+    "json_schema":      validate_json_schema,
+    "artifact_mermaid": validate_mermaid,
+    "compression":      validate_compression,
+    "multi_step_agent": validate_multi_step,
+    "hallucination":    validate_hallucination,
+    "coding":           validate_coding,
+    "agent":            validate_agent,
+    # The context tests share validate_context; each lambda binds the phrase
+    # that the output is expected to contain.
+    "context_begin":    lambda t: validate_context(t, "Project Aurora"),
+    "context_middle":   lambda t: validate_context(t, "2.4 million"),
+    "context_end":      lambda t: validate_context(t, "Nexora Systems"),
+}
+
+
+def run_validator(test_name, raw_output):
+    """
+    Dispatch raw_output to the deterministic validator for test_name.
+
+    Returns (score, skip_judge, notes). skip_judge is True only for the
+    definitive scores 0 and 10. A test without a validator yields
+    (None, False, "no validator").
+    """
+    validator = VALIDATOR_MAP.get(test_name)
+    if validator is None:
+        return None, False, "no validator"
+
+    score, notes = validator(raw_output)
+    return score, score in (0, 10), notes