RC: (add) python script files

This commit is contained in:
2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions

100
config.py Normal file
View File

@@ -0,0 +1,100 @@
"""
benchmark_v4/config.py
======================
All configuration in one place. Edit this file to change models,
weights, judge, and runtime settings.
"""
# ============================================
# MODELS
# ============================================
# Ollama model tags, grouped by baseline/new and by direct/thinking.
MODELS_BASELINE_DIRECT = [
    "granite4.1:8b",
    "qwen2.5-coder:14b",
]
MODELS_BASELINE_THINKING = [
    "nemotron-3-nano:4b",
    "gemma4:e4b",
]
MODELS_NEW_DIRECT = []
MODELS_NEW_THINKING = []

# ============================================
# JUDGE + EMBEDDINGS
# ============================================
JUDGE_MODEL = "qwen2.5:14b"
EMBED_MODEL = "nomic-embed-text"
OLLAMA_URL = "http://localhost:11434"

# ============================================
# RUNTIME
# ============================================
COOLDOWN_SECONDS = 20  # between tests (thermal normalization)
GPU_POLL_EVERY = 3  # poll nvidia-smi every N tests (0 = every test)

# ============================================
# TEST WEIGHTS (must sum to 1.0)
# ============================================
TEST_WEIGHTS = {
    # Agent / tool reliability — 25%
    "tool_calling": 0.13,
    "multi_step_agent": 0.12,
    # Coding / infrastructure — 25%
    "coding": 0.10,
    "yaml_generation": 0.08,
    "artifact_mermaid": 0.04,
    "json_schema": 0.03,
    # RAG / context fidelity — 20%
    "rag": 0.07,
    "context_begin": 0.04,
    "context_middle": 0.05,
    "context_end": 0.04,
    # Structured outputs — 15%
    "structured": 0.08,
    "compression": 0.07,
    # Hallucination resistance — 10%
    "hallucination": 0.10,
    # Pure reasoning — 5%
    "reasoning": 0.03,
    "agent": 0.01,
    "math": 0.01,
}

# Validate with an explicit raise rather than `assert`: assertions are
# stripped under `python -O`, which would silently disable this check.
_weights_total = sum(TEST_WEIGHTS.values())
if abs(_weights_total - 1.0) >= 0.001:
    raise ValueError(f"Weights must sum to 1.0 (got {_weights_total:.4f})")

# Category groupings for category-level scores
CATEGORIES = {
    "agent_tool": ["tool_calling", "multi_step_agent"],
    "coding": ["coding", "yaml_generation", "artifact_mermaid", "json_schema"],
    "rag_context": ["rag", "context_begin", "context_middle", "context_end"],
    "structured": ["structured", "compression"],
    "hallucination": ["hallucination"],
    "reasoning": ["reasoning", "agent", "math"],
}

# Compliance groups — pass if semantic_score >= 8
COMPLIANCE_GROUPS = {
    "json_valid": ["structured", "json_schema"],
    "yaml_valid": ["yaml_generation"],
    "tool_format": ["tool_calling", "multi_step_agent"],
    "hallucination_free": ["hallucination"],
}

# Context files
CONTEXT_FILE = "./rag_samples/context_test.md"
RAG_FILE = "./rag_samples/note1.md"

# Database
DB_FILE = "benchmark_v4.db"

178
judge.py Normal file
View File

@@ -0,0 +1,178 @@
"""
benchmark_v4/judge.py
=====================
Layer 2: Semantic judge (rubric-based).
Layer 3: Embedding similarity via Ollama nomic-embed-text.
Judge is only called when validator gives a partial score.
Embedding similarity used for RAG test.
"""
import re
import json
import math
import requests
from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
# ============================================
# JUDGE PROMPT TEMPLATE
# ============================================
# Prompt sent to the judge model. Filled in with str.format() using the
# fields test_name, rubric, ground_truth, prompt and output. The doubled
# braces on the last line are format-escapes that render as a literal JSON
# object showing the judge the expected reply shape.
JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.
ABSOLUTE RULES:
1. Judge ONLY what the prompt asked for. Nothing else.
2. NEVER penalise for missing information not requested.
3. NEVER penalise for being concise — brevity is correct.
4. NEVER invent requirements. Only the prompt counts.
5. Minimal correct answers score 8-10.
6. Extra unnecessary content scores lower, not higher.
SCORING:
10 = perfect | 8 = correct, trivial issue | 6 = mostly correct
4 = partial | 2 = major error | 0 = wrong or hallucination
TEST: {test_name}
WHAT TO JUDGE: {rubric}
GROUND TRUTH: {ground_truth}
PROMPT (what was asked):
{prompt}
OUTPUT (what model answered):
{output}
Return ONLY this JSON on one line, nothing else:
{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
# ============================================
# JUDGE WARMUP
# ============================================
def warmup_judge():
    """Ask Ollama for a single token from the judge model so it is loaded.

    Any failure is reported on stdout and otherwise ignored.
    """
    print(f" Warming up judge: {JUDGE_MODEL}")
    endpoint = f"{OLLAMA_URL}/api/generate"
    # num_predict=1 keeps the generation as short as possible.
    payload = {
        "model": JUDGE_MODEL,
        "prompt": "hi",
        "stream": False,
        "options": {"num_predict": 1},
    }
    try:
        requests.post(endpoint, json=payload, timeout=120)
    except Exception as err:
        print(f" Judge warmup error: {err}")
# ============================================
# JUDGE CALL
# ============================================
def _clamp_judge_score(value):
    """Coerce *value* to int and clamp it onto the 0-10 scoring scale."""
    return max(0, min(10, int(value)))


def _parse_judge_response(raw):
    """Extract ``(score, reason)`` from raw judge text, or None if no score is found."""
    # 1) A flat JSON object containing "semantic_score".
    m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
    if m:
        try:
            parsed = json.loads(m.group())
            score = _clamp_judge_score(parsed.get("semantic_score", 5))
            reason = str(parsed.get("reason", ""))[:80]
            return score, reason
        except (json.JSONDecodeError, ValueError, TypeError, OverflowError):
            # Malformed JSON or a non-numeric score (null, list, NaN...):
            # fall through to the regex-based extraction below.
            pass
    # 2) Key/value pairs picked out individually from broken JSON.
    sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
    rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
    if sm:
        reason = rm.group(1) if rm else "extracted"
        return _clamp_judge_score(sm.group(1)), reason
    # 3) Last resort: the first number following the words "semantic_score".
    last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
    if last:
        return _clamp_judge_score(last.group(1)), "score extracted"
    return None


def call_judge(test_name, prompt, output):
    """
    Call LLM judge with strict rubric.

    Args:
        test_name: Test key used to look up the rubric and ground truth.
        prompt: The prompt given to the model under test (first 500 chars used).
        output: The model's answer (first 1500 chars used). None is treated
            as an empty answer.

    Returns:
        (semantic_score 0-10, reason str).
        Falls back to midpoint (5) on failure to avoid corrupting results.
    """
    rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        test_name=test_name,
        rubric=rubric,
        ground_truth=ground_truth,
        # Coerce defensively: a missing model output must not crash the run.
        prompt=str(prompt or "")[:500],
        output=str(output or "")[:1500],
    )
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
            timeout=180
        )
        raw = response.json().get("response", "").strip()
        result = _parse_judge_response(raw)
        if result is not None:
            return result
        print(f" Judge unparseable: {raw[:80]}")
        return 5, "judge unparseable — midpoint"
    except requests.exceptions.Timeout:
        return 5, "judge timeout — midpoint"
    except Exception as e:
        # Best-effort: keep the benchmark going, but surface the cause.
        print(f" Judge error: {e}")
        return 5, "judge error — midpoint"
# ============================================
# EMBEDDING SIMILARITY
# ============================================
def get_embedding(text):
    """Return the embedding vector for *text* from Ollama's /api/embed.

    Only the first 2000 characters are sent. Any failure yields an empty list.
    """
    try:
        endpoint = f"{OLLAMA_URL}/api/embed"
        payload = {"model": EMBED_MODEL, "input": text[:2000]}
        reply = requests.post(endpoint, json=payload, timeout=30)
        vectors = reply.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        return []
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Yields 0.0 when either vector is empty, the lengths differ, or either
    vector has zero magnitude.
    """
    if not v1 or not v2:
        return 0.0
    if len(v1) != len(v2):
        return 0.0
    inner = sum(x * y for x, y in zip(v1, v2))
    norm_left = math.sqrt(sum(x**2 for x in v1))
    norm_right = math.sqrt(sum(y**2 for y in v2))
    denominator = norm_left * norm_right
    if not denominator:
        return 0.0
    return inner / denominator
def embedding_score(text, reference):
    """
    Map the cosine similarity between *text* and *reference* onto 0-10.

    Only the first 1000 characters of *text* are embedded. The mapping is
    stepped rather than linear; empty inputs score 0.
    """
    if not (text and reference):
        return 0
    similarity = cosine_similarity(
        get_embedding(text[:1000]),
        get_embedding(reference),
    )
    # (minimum similarity, awarded score), checked from strictest to loosest.
    bands = (
        (0.92, 10),
        (0.85, 8),
        (0.78, 6),
        (0.70, 4),
        (0.60, 2),
    )
    for floor, points in bands:
        if similarity >= floor:
            return points
    return 0

299
main.py Normal file
View File

@@ -0,0 +1,299 @@
"""
benchmark_v4/main.py
====================
Entry point. CLI argument parsing and orchestration.
Usage:
python3 main.py # run baseline and new models (default --mode all)
python3 main.py --test-all # auto-discover and test all ollama models
python3 main.py --mode baseline # baseline only
python3 main.py --mode new # new models only
python3 main.py --model granite4.1:8b # single model
python3 main.py --runs 3 # variance analysis
python3 main.py --no-cooldown # fast run (no thermal wait)
python3 main.py --report # show reports of latest run
python3 main.py --report --report-best # show best run per model
python3 main.py --export # export CSV from DB
"""
import argparse
import sys
import subprocess
import requests
from config import (
MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
)
from storage import init_db, load_latest_runs, export_summary_csv
from prompts import build_all_prompts
from runner import run_benchmark
from reporting import (
print_weights, print_comparison,
print_full_ranking, print_category_breakdown,
print_compliance_table, run_report
)
try:
import yaml
YAML_AVAILABLE = True
except ImportError:
YAML_AVAILABLE = False
try:
from rapidfuzz import fuzz
FUZZY_AVAILABLE = True
except ImportError:
FUZZY_AVAILABLE = False
# ============================================
# THINKING MODEL DETECTION
# ============================================
def detect_thinking_model(model_name):
    """
    Report whether Ollama lists 'thinking' among a model's capabilities.

    Issues one POST to /api/show and inspects the 'capabilities' array of
    the reply. Any error (network, bad JSON, ...) is treated as "not a
    thinking model" and returns False.
    """
    try:
        reply = requests.post(
            f"{OLLAMA_URL}/api/show",
            json={"name": model_name},
            timeout=10,
        )
        capabilities = reply.json().get("capabilities", [])
    except Exception:
        return False
    return "thinking" in capabilities
# ============================================
# MAIN
# ============================================
def _build_arg_parser():
    """Build the command-line parser for the benchmark entry point."""
    parser = argparse.ArgumentParser(
        description="LLM Benchmark V4 — Modular, SQLite-backed",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python3 main.py # full baseline run
python3 main.py --test-all # auto-discover all ollama models
python3 main.py --model granite4.1:8b # single model
python3 main.py --mode new # new models only
python3 main.py --runs 3 # variance analysis (3 runs)
python3 main.py --no-cooldown # fast run, no thermal wait
python3 main.py --report # show latest run reports
python3 main.py --report --report-best # show best run per model
python3 main.py --export # export CSV from DB
"""
    )
    parser.add_argument(
        "--test-all", action="store_true", default=False,
        help="Auto-discover and benchmark all models in ollama list"
    )
    parser.add_argument(
        "--mode", choices=["baseline", "new", "all"],
        default="all",
        help="Which model group to run (default: all)"
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Run a single model by Ollama tag"
    )
    parser.add_argument(
        "--thinking", action="store_true", default=False,
        help="Override: mark single --model as thinking type"
    )
    parser.add_argument(
        "--runs", type=int, default=1,
        help="Number of runs per model for variance analysis (default: 1)"
    )
    parser.add_argument(
        "--no-cooldown", action="store_true", default=False,
        help="Skip cooldown between tests (faster but no thermal normalization)"
    )
    parser.add_argument(
        "--report", action="store_true", default=False,
        help="Show ranking reports from DB without running any models"
    )
    parser.add_argument(
        "--report-best", action="store_true", default=False,
        help="Show best run per model instead of latest (use with --report)"
    )
    parser.add_argument(
        "--export", action="store_true", default=False,
        help="Export latest results to benchmark_summary.csv and exit"
    )
    return parser


def _discover_ollama_models():
    """Return model tags printed by `ollama list`, minus judge/embedding models.

    Returns an empty list (after printing the cause) when the `ollama`
    binary cannot be run or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True, text=True
        )
    except OSError as exc:
        # Typically FileNotFoundError: ollama is not installed / not on PATH.
        print(f"Could not run 'ollama list': {exc}")
        return []
    if result.returncode != 0:
        print(f"'ollama list' failed (exit {result.returncode}): {result.stderr.strip()}")
        return []
    # Substring match, so any tag of the judge/embedding model is excluded.
    skip = (EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text")
    discovered = []
    # First line of the output is the column header.
    for line in result.stdout.strip().split('\n')[1:]:
        parts = line.split()
        if not parts:
            continue
        model_name = parts[0]
        if not any(s in model_name for s in skip):
            discovered.append(model_name)
    return discovered


def main():
    """Parse CLI arguments and run the requested benchmark or report mode.

    Modes, in order of precedence:
      * --report / --report-best: print reports from the DB, export CSV, exit.
      * --export: export CSV, exit.
      * --test-all: benchmark every model found by `ollama list`.
      * --model: benchmark one model.
      * otherwise: benchmark the configured groups selected by --mode.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    if args.runs < 1:
        parser.error("--runs must be >= 1")

    # Init database
    init_db()

    # ── Report / export only modes ─────────────────────────────────
    # Must come before benchmark logic
    if args.report or args.report_best:
        print_full_ranking(best=args.report_best)
        print_category_breakdown()
        print_compliance_table()
        export_summary_csv()
        return
    if args.export:
        export_summary_csv()
        return

    # ── Setup ──────────────────────────────────────────────────────
    existing_baseline = load_latest_runs(is_baseline=True)
    all_prompts = build_all_prompts()
    print("\nLLM Benchmark V4")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Embed: {EMBED_MODEL}")
    print(f"DB: {DB_FILE}")
    print(f"Runs: {args.runs}")
    print(f"Fuzzy: {FUZZY_AVAILABLE} | YAML: {YAML_AVAILABLE}")
    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
    print(f"Previous baseline runs: {len(existing_baseline)}")
    print_weights()

    all_new_run_ids = []
    known_baseline = set(MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING)

    def _run(models, label, baseline):
        ids = run_benchmark(
            models=models,
            label=label,
            is_baseline=baseline,
            all_prompts=all_prompts,
            num_runs=args.runs,
            no_cooldown=args.no_cooldown,
        )
        all_new_run_ids.extend(ids)

    # ── Auto-discover all Ollama models ────────────────────────────
    if args.test_all:
        discovered = _discover_ollama_models()
        if not discovered:
            print("No models found in ollama list.")
            return

        # Auto-detect thinking capability for each model
        print("\nDetecting model capabilities...")
        model_info = {}
        for m in discovered:
            is_thinking = detect_thinking_model(m)
            is_baseline = m in known_baseline
            model_info[m] = {
                "thinking": is_thinking,
                "is_baseline": is_baseline,
                "label": "thinking" if is_thinking else "direct",
            }
            tag = "🧠" if is_thinking else ""
            # Visible marker for baseline models (matches the report legend).
            base = "★" if is_baseline else " "
            print(f" {tag}{base} {m}")
        print()

        # Run baseline models first, then new
        baseline_models = [m for m in discovered if model_info[m]["is_baseline"]]
        new_models = [m for m in discovered if not model_info[m]["is_baseline"]]
        if baseline_models:
            print("=" * 50)
            print(" KNOWN BASELINE MODELS")
            print("=" * 50)
            for m in baseline_models:
                _run([m], model_info[m]["label"], True)
        if new_models:
            print("=" * 50)
            print(" NEW / UNKNOWN MODELS")
            print("=" * 50)
            for m in new_models:
                _run([m], model_info[m]["label"], False)
        print_comparison(all_new_run_ids, existing_baseline)
        run_report()
        return

    if args.model:
        # ── Single model mode ──────────────────────────────────────
        # Auto-detect thinking unless --thinking flag explicitly set
        if args.thinking:
            label = "thinking"
        else:
            label = "thinking" if detect_thinking_model(args.model) else "direct"
        is_baseline = args.model in known_baseline
        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
        _run([args.model], label, is_baseline)
    else:
        # ── Configured groups, selected by --mode ──────────────────
        # Order matters: baseline groups run before new groups.
        groups = []
        if args.mode in ("baseline", "all"):
            groups.append((" BASELINE — DIRECT", MODELS_BASELINE_DIRECT, "direct", True))
            groups.append((" BASELINE — THINKING", MODELS_BASELINE_THINKING, "thinking", True))
        if args.mode in ("new", "all"):
            groups.append((" NEW — DIRECT", MODELS_NEW_DIRECT, "direct", False))
            groups.append((" NEW — THINKING", MODELS_NEW_THINKING, "thinking", False))
        for title, models, label, baseline in groups:
            if not models:
                continue
            print("\n" + "=" * 50)
            print(title)
            print("=" * 50)
            _run(models, label, baseline)

    # ── Final reports ──────────────────────────────────────────────
    print_comparison(all_new_run_ids, existing_baseline)
    run_report()
if __name__ == "__main__":
main()

388
prompts.py Normal file
View File

@@ -0,0 +1,388 @@
"""
benchmark_v4/prompts.py
=======================
All prompts and ground truths in one place.
BENCHMARK PURPOSE (explicit):
This benchmark measures operational reliability for agentic and
automated pipelines. It rewards: format obedience, structured output
correctness, tool call precision, and hallucination resistance.
It intentionally penalises verbosity, creative deviation, and
formatting non-compliance. It is NOT a general intelligence benchmark.
Changes in this version:
- Agent prompt rewritten with explicit constraints (better differentiation)
- Structured JSON upgraded to nested array (stronger discriminator)
- ALL_TESTS is now explicit ordered list (not dict.keys())
- Ground truths reformulated as criteria (avoid judge anchoring)
- JSON Schema validator criteria made explicit
"""
import os
from config import CONTEXT_FILE, RAG_FILE
# ============================================
# STATIC PROMPTS
# ============================================
# Fixed prompt text keyed by test name. The context_* and rag prompts are
# not listed here; build_all_prompts() adds them from files on disk.
PROMPTS = {
    "reasoning": """Be concise. Maximum 150 words.
Solve step by step:
A company has 120 employees.
30% are remote. Of those, 25% are contractors.
How many full-time remote employees?""",
    "coding": """Be concise. Maximum 150 words.
Write a Python function that returns the Longest Increasing Subsequence.
Explain time complexity.""",
    # UPGRADED: explicit constraints force meaningful differentiation
    "agent": """Maximum 120 words. You must:
1. Identify 2-3 specific GPU candidates by name
2. Compare their VRAM capacity and memory bandwidth
3. Acknowledge 2026 pricing uncertainty explicitly
4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
5. Exclude any GPU with less than 16GB VRAM
Task: Find the best GPU under 500€ for running local LLMs in 2026.""",
    "math": """Be concise. Maximum 100 words.
Solve step by step:
If 3 machines take 5 minutes to make 3 widgets,
how long would 100 machines take to make 100 widgets?""",
    "hallucination": """Be concise. Maximum 100 words.
Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
If you are not 100% certain, say so clearly.
Do NOT describe or summarise it if you have any doubt.""",
    # UPGRADED: nested structure, typed fields — much stronger discriminator
    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.
{
"recommendations": [
{
"gpu": "",
"price_eur": 0,
"vram_gb": 0,
"pros": [],
"cons": []
}
]
}
Question: Best GPU under 500€ for local LLMs.
Return exactly 2 recommendations.""",
    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)
Return ONLY the single tool call needed:
"What is the best local LLM for 16GB VRAM?"
Example format: web_search("your query here")
No explanation. No other text.""",
    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
Preserve key statistics. No extra text before or after the bullets.
AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",
    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.
Create a Kubernetes Deployment:
name is my-app
image is nginx:1.25
replicas is 2
containerPort is 80
memory limit is 256Mi
cpu limit is 250m
readinessProbe uses httpGet on path /healthz port 80""",
    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
No explanation before or after.
Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",
    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)
Show exactly 3 chained tool calls then a final answer for:
"Top 3 most downloaded Python packages this month"
Format:
1. web_search("...")
2. scrape_page("...")
3. summarize("...")
Final: [answer]""",
    "json_schema": """Return ONLY valid JSON Schema. No explanation.
Schema for:
- apiVersion: string, required
- kind: string, required, enum: [Deployment, Service, ConfigMap]
- metadata: object, required, properties: name (string, required), namespace (string, required)
- spec: object, required, additionalProperties: true""",
}
# ============================================
# EXPLICIT TEST ORDERING
# Never use dict.keys() — order must be stable
# for CSV consistency and longitudinal comparisons.
# ============================================
# Explicit, ordered list of test names. The percentages in the group
# comments mirror the category weights noted in config.TEST_WEIGHTS.
ALL_TESTS = [
    # Reasoning (5%)
    "reasoning",
    "math",
    "agent",
    # Coding / Infrastructure (25%)
    "coding",
    "yaml_generation",
    "artifact_mermaid",
    "json_schema",
    # Structured outputs (15%)
    "structured",
    "compression",
    # Agent / Tool (25%)
    "tool_calling",
    "multi_step_agent",
    # Hallucination (10%)
    "hallucination",
    # RAG / Context (20%)
    "rag",
    "context_begin",
    "context_middle",
    "context_end",
]
# ============================================
# GROUND TRUTHS — criteria-based, not canonical
# Avoid embedding exact phrasing to prevent
# judge anchoring and over-literal scoring.
# ============================================
# Per-test reference criteria, keyed by test name. judge.call_judge inserts
# the matching entry into the judge prompt as GROUND TRUTH and falls back to
# a generic sentence when a test has no entry.
GROUND_TRUTHS = {
    "reasoning": (
        "Correct answer is 27 full-time remote employees. "
        "Verify: calculation uses 30% of 120 = 36 remote, "
        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
        "Award full marks if logic is correct even if phrased differently."
    ),
    "coding": (
        "A working Python function that implements LIS. "
        "Should mention O(n²) for basic DP approach. "
        "Bonus if O(n log n) with binary search is mentioned. "
        "Do not penalise for code style choices."
    ),
    "agent": (
        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "
        "Must acknowledge pricing uncertainty for 2026. "
        "Must name a benchmarking source. "
        "Score 0 if recommends GPUs under 16GB VRAM."
    ),
    "math": (
        "Correct answer is 5 minutes. "
        "Key insight: each machine independently makes 1 widget in 5 minutes, "
        "so 100 machines make 100 widgets in the same 5 minutes. "
        "Award marks if the rate-independence reasoning is clearly stated."
    ),
    "hallucination": (
        "The book does not exist. "
        "Full marks: model refuses or clearly states it cannot verify existence. "
        "Zero marks: model describes the book's plot, themes, or content as if real. "
        "Partial marks: model hedges without clear refusal."
    ),
    "structured": (
        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
        "pros (array of strings), cons (array of strings). "
        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
        "Do not score on quality of GPU choices."
    ),
    "tool_calling": (
        "Must return exactly one function call in the format: name(\"query\"). "
        "No explanation before or after. "
        "Correct function names: web_search, scrape_page, or calculate. "
        "Score 0 if any text accompanies the call."
    ),
    "compression": (
        "Must have exactly 10 bullet points starting with '- '. "
        "All 7 industries must appear: healthcare, finance, transport, "
        "manufacturing, education, energy, agriculture. "
        "Key statistics must be preserved where mentioned in source."
    ),
    "yaml_generation": (
        "Must be parseable YAML. "
        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
        "readinessProbe httpGet /healthz port 80. "
        "Do not penalise for additional valid YAML fields not specified."
    ),
    "artifact_mermaid": (
        "Must be a valid Mermaid code block with opening and closing fences. "
        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
        "Stages should appear in the correct pipeline order."
    ),
    "multi_step_agent": (
        "Must show 3 distinct tool calls using different functions. "
        "Preferred sequence: web_search → scrape_page → summarize. "
        "Must end with 'Final: [answer]'. "
        "Score based on: correct tool names, distinct calls, final answer present."
    ),
    "json_schema": (
        "Must be valid JSON Schema (parseable JSON). "
        "Must define: apiVersion as string required, "
        "kind as string required with enum [Deployment, Service, ConfigMap], "
        "metadata as object required with name and namespace as string properties, "
        "spec as object required with additionalProperties allowed. "
        "Award marks proportionally to how many of these are correctly specified."
    ),
    # The three context answers match the sample document written by
    # ensure_context_file().
    "context_begin": "The project name is Project Aurora.",
    "context_middle": "The budget allocated to Phase 2 is $2.4 million.",
    "context_end": "The selected vendor is Nexora Systems (Vendor B).",
    "rag": (
        "A structured summary that covers the main topics in the provided notes. "
        "Should be under 200 words. "
        "Should preserve key facts without inventing new information. "
        "Do not penalise for including accurate details from the source."
    ),
}
# ============================================
# JUDGE RUBRICS (per test — what to evaluate)
# Criteria-based, not answer-anchored.
# ============================================
# Test-specific judging instructions. Only some tests have an entry;
# judge.call_judge uses DEFAULT_RUBRIC for every other test name.
JUDGE_RUBRICS = {
    "reasoning": (
        "Check: Is the final number 27? Are the three calculation steps "
        "(120×0.30, 36×0.25, 36-9) present and correct? Is it within 150 words?"
    ),
    "agent": (
        "Check each requirement: "
        "(1) At least 2 named GPU models? "
        "(2) VRAM and bandwidth compared? "
        "(3) 2026 pricing uncertainty acknowledged? "
        "(4) Benchmarking source named? "
        "(5) No GPU under 16GB VRAM recommended? "
        "Score 2 points per requirement met (max 10). "
        "Score 0 if any GPU under 16GB is recommended."
    ),
    "math": (
        "Check: Is the answer 5 minutes? "
        "Does the explanation correctly state that each machine's rate "
        "is independent of quantity? Is it within 100 words?"
    ),
    "rag": (
        "Check: Does it cover the main topics from the notes? "
        "Is it under 200 words? "
        "Does it avoid inventing facts not in the source? "
        "Is it clearly structured?"
    ),
}
# Generic rubric for tests without a dedicated JUDGE_RUBRICS entry.
DEFAULT_RUBRIC = (
    "Check whether the output correctly fulfils all requirements stated "
    "in the original prompt. Score based on correctness and completeness, "
    "not on style or verbosity beyond what the prompt requires."
)
# ============================================
# DYNAMIC PROMPT BUILDERS
# ============================================
def ensure_context_file():
    """Create the sample context document at CONTEXT_FILE if it is missing.

    The parent directory of CONTEXT_FILE is created first, so the path can
    be changed in config without the write failing on a missing directory.
    An existing file is left untouched.
    """
    # Derive the directory from the configured path rather than hard-coding
    # "./rag_samples"; a bare filename has no directory component to create.
    target_dir = os.path.dirname(CONTEXT_FILE)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    if os.path.exists(CONTEXT_FILE):
        return
    content = """# Project Aurora — Strategic Initiative Report
## Executive Summary
Project Aurora is a digital transformation initiative launched January 2024.
Proposed by CTO Maria Chen. Budget: $8.7M over three years.
## Phase 2 — Cloud Migration
Phase 2 budget allocation: $2.4 million.
## Vendor Recommendation
Vendor A (CloudScale) — $1.8M, limited EU.
Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
Vendor C (PrimeHost) — $1.4M, no SOC2.
Vendor D (Stratos) — $2.8M, over budget.
Final recommendation: proceed with Vendor B (Nexora Systems).
"""
    with open(CONTEXT_FILE, "w") as f:
        f.write(content)
    print(f" Created: {CONTEXT_FILE}")
def ensure_rag_file():
    """Create the sample RAG notes at RAG_FILE if the file is missing.

    The parent directory of RAG_FILE is created first, so the path can be
    changed in config without the write failing on a missing directory.
    An existing file is left untouched.
    """
    # Derive the directory from the configured path rather than hard-coding
    # "./rag_samples"; a bare filename has no directory component to create.
    target_dir = os.path.dirname(RAG_FILE)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    if os.path.exists(RAG_FILE):
        return
    content = """# Homelab Infrastructure Notes
## K8s Cluster
- 4 nodes, Longhorn storage, Traefik ingress
- FluxCD for GitOps, prune: false on llm namespace
- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker
## Ollama VM
- hostname: chat.h0melab.uk, IP: 10.0.20.57
- GPU: RTX 5060 Ti 16GB, port 11434
- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b
## Services
- Gitea at gitea.int, SSH port 3333
- Netdata + VictoriaMetrics for monitoring
- Signal bot with Whisper for voice transcription
- wiki-processor auto-generates Obsidian wiki
"""
    with open(RAG_FILE, "w") as f:
        f.write(content)
    print(f" Created: {RAG_FILE}")
def build_all_prompts():
    """Assemble the full prompt dict: static prompts plus file-backed ones.

    Starts from a copy of PROMPTS, then adds the three context_* prompts
    (built from CONTEXT_FILE) and the rag prompt (built from RAG_FILE).
    Both sample files are created first when missing.
    """
    ensure_context_file()
    ensure_rag_file()
    prompts = dict(PROMPTS)

    # Context prompts: same document preamble, three different questions.
    if os.path.exists(CONTEXT_FILE):
        with open(CONTEXT_FILE) as handle:
            document = handle.read()
        preamble = (
            "Answer in ONE sentence only. "
            "Use ONLY information from the document below. "
            "Do not add explanation or context.\n\n"
            f"DOCUMENT:\n{document}\n\n"
        )
        prompts.update({
            "context_begin": preamble + "QUESTION: What is the name of the project?",
            "context_middle": preamble + "QUESTION: What budget was allocated to Phase 2?",
            "context_end": preamble + "QUESTION: Which vendor was selected and what is their company name?",
        })

    # RAG prompt: summarise the notes file, or a placeholder if it is absent.
    if not os.path.exists(RAG_FILE):
        prompts["rag"] = "Maximum 200 words. Summarize: No RAG file found."
        return prompts
    with open(RAG_FILE) as handle:
        notes = handle.read()
    prompts["rag"] = (
        "Maximum 200 words. Summarize and structure the following notes. "
        "Preserve all specific facts (IPs, model names, service names). "
        "Do not add information not present in the notes.\n\n"
        + notes
    )
    return prompts

170
reporting.py Normal file
View File

@@ -0,0 +1,170 @@
"""
benchmark_v4/reporting.py
=========================
All output formatting — terminal reports and CSV export.
Completely separate from scoring and storage logic.
"""
from storage import load_latest_runs, load_all_runs, export_summary_csv
from config import MODELS_BASELINE_THINKING, MODELS_NEW_THINKING
def _tag(model, all_thinking):
return "🧠" if model in all_thinking else ""
def _base(row):
return "" if row.get("is_baseline") else " "
def print_weights():
    """Print each category with the summed weight of its tests as a percentage."""
    from config import TEST_WEIGHTS, CATEGORIES
    print("\n TEST WEIGHTS:")
    # Display names; categories without an entry are shown by their key.
    display_names = {
        "agent_tool": "Agent/Tool reliability (25%)",
        "coding": "Coding/Infrastructure (25%)",
        "rag_context": "RAG/Context fidelity (20%)",
        "structured": "Structured outputs (15%)",
        "hallucination": "Hallucination resistance (10%)",
        "reasoning": "Pure reasoning (5%)",
    }
    for category in CATEGORIES:
        members = CATEGORIES[category]
        share = sum(TEST_WEIGHTS.get(name, 0) for name in members)
        title = display_names.get(category, category)
        print(f" {title:<42} {share*100:.0f}%")
def print_comparison(new_run_ids, existing_baseline_rows):
    """Compare current run against existing baseline.

    Args:
        new_run_ids: ids of rows in the ``runs`` table created by this run.
        existing_baseline_rows: dict rows of earlier baseline runs.

    Prints the earlier baseline rows (best weighted average first), then
    each new run with its difference to the best earlier baseline score.
    """
    from storage import get_connection

    def _num(value):
        # NULL / missing columns count as 0 instead of crashing float().
        return float(value or 0)

    def _shown(value):
        # None cannot take a format spec such as ">4"; show "?" instead.
        return "?" if value is None else value

    print("\n" + "=" * 68)
    print(" 📊 RESULTS vs BASELINE")
    print("=" * 68)
    if existing_baseline_rows:
        best = max(_num(r.get("weighted_avg")) for r in existing_baseline_rows)
        print(f"\n EXISTING BASELINE (best w_avg: {best:.2f}):")
        for r in sorted(existing_baseline_rows, key=lambda x: -_num(x.get("weighted_avg"))):
            print(
                f" {r['model']:<44} "
                f"w={_num(r.get('weighted_avg')):>5.2f} "
                f"σ={_shown(r.get('stdev_all')):>4} "
                f"fail={_shown(r.get('failure_rate_pct'))}% "
                f"[{str(r.get('run_date',''))[:10]}]"
            )
    else:
        best = 0
    # Load current runs
    if new_run_ids:
        placeholders = ",".join("?" * len(new_run_ids))
        with get_connection() as conn:
            new_rows = [dict(r) for r in conn.execute(
                f"SELECT * FROM runs WHERE id IN ({placeholders})",
                new_run_ids
            ).fetchall()]
        print("\n THIS RUN:")
        for r in sorted(new_rows, key=lambda x: -_num(x.get("weighted_avg"))):
            diff = _num(r.get("weighted_avg")) - best
            # The magnitude is printed via abs(diff), so the arrow is the
            # only indication of direction; it must not be empty.
            arrow = "↑" if diff > 0.05 else "↓" if diff < -0.05 else "="
            tag = "BASE" if r.get("is_baseline") else "NEW "
            print(
                f" [{tag}] {r['model']:<40} "
                f"w={_num(r.get('weighted_avg')):>5.2f} {arrow}{abs(diff):.2f} "
                f"σ={_shown(r.get('stdev_all')):>4} "
                f"fail={_shown(r.get('failure_rate_pct'))}%"
            )
def print_full_ranking(best=False):
    """Print complete ranking of all models.

    Args:
        best: When true, rank each model's best run (load_best_runs);
            otherwise its latest run (load_latest_runs).
    """
    from storage import load_best_runs, load_latest_runs

    def _shown(value):
        # None cannot take a format spec such as ">4"; show "?" instead.
        return "?" if value is None else value

    print("\n" + "=" * 68)
    title = "BEST RUN" if best else "LATEST RUN"
    print(f" 🏆 FULL RANKING ({title} per model, weighted semantic avg)")
    print("=" * 68)
    all_thinking = MODELS_BASELINE_THINKING + MODELS_NEW_THINKING
    rows = load_best_runs() if best else load_latest_runs()
    for i, r in enumerate(rows, 1):
        tag = _tag(r["model"], all_thinking)
        # "★" is the marker the legend below documents for baseline rows.
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {i:>2}. {tag}{base} {r['model']:<42} "
            f"w={float(r.get('weighted_avg') or 0):>5.2f} "
            f"σ={_shown(r.get('stdev_all')):>4} "
            f"fail={_shown(r.get('failure_rate_pct')):>4}% "
            f"tok/s={_shown(r.get('avg_tok_s')):>5} "
            f"🌡={_shown(r.get('avg_gpu_temp')):>2}°C "
            f"[{str(r.get('run_date',''))[:10]}]"
        )
    print("\n ★=baseline w=weighted avg σ=stdev(low better) fail=failure rate 🌡=avg temps")
def print_category_breakdown():
    """Print the per-category scores for the latest run of each model.

    Prints only the banner when ``load_latest_runs()`` returns no rows.
    """
    def cell(row, key):
        # NULL columns arrive as None, which rejects width format specs.
        value = row.get(key)
        return "?" if value is None else value

    print("\n" + "=" * 68)
    print(" 📂 CATEGORY BREAKDOWN (latest run per model)")
    print("=" * 68)
    rows = load_latest_runs()
    if not rows:
        return
    header = f" {'Model':<40} {'agent':>6} {'code':>6} {'rag':>6} {'struct':>7} {'hall':>5} {'reason':>7}"
    print(f"\n{header}")
    print(" " + "-" * 64)
    for r in rows:
        # One-character marker in both branches so baseline rows are visibly
        # flagged and stay aligned with the others.
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {base}{r['model']:<41} "
            f"{cell(r, 'cat_agent_tool'):>6} "
            f"{cell(r, 'cat_coding'):>6} "
            f"{cell(r, 'cat_rag_context'):>6} "
            f"{cell(r, 'cat_structured'):>7} "
            f"{cell(r, 'cat_hallucination'):>5} "
            f"{cell(r, 'cat_reasoning'):>7}"
        )
def print_compliance_table():
    """Print the compliance percentages for the latest run of each model.

    Prints only the banner when ``load_latest_runs()`` returns no rows.
    """
    def fmt(v):
        # A missing compliance value (None) is shown as n/a instead of "None%".
        return f"{v}%" if v is not None else " n/a"

    print("\n" + "=" * 68)
    print(" ✅ COMPLIANCE RATES (latest run per model)")
    print("=" * 68)
    rows = load_latest_runs()
    if not rows:
        return
    header = f" {'Model':<44} {'JSON':>6} {'YAML':>6} {'Tool':>6} {'Hall':>6}"
    print(f"\n{header}")
    print(" " + "-" * 64)
    for r in rows:
        # Marker (1 char) + model (43) matches the 44-wide header column, so
        # the marker must be one character wide in both branches.
        base = "★" if r.get("is_baseline") else " "
        print(
            f" {base}{r['model']:<43} "
            f"{fmt(r.get('compliance_json')):>6} "
            f"{fmt(r.get('compliance_yaml')):>6} "
            f"{fmt(r.get('compliance_tool')):>6} "
            f"{fmt(r.get('compliance_hall')):>6}"
        )
def run_report():
    """Emit the full report: ranking, category table, compliance table, CSV export."""
    # Sections are produced strictly in this order.
    sections = (
        print_full_ranking,
        print_category_breakdown,
        print_compliance_table,
        export_summary_csv,
    )
    for emit in sections:
        emit()

276
runner.py Normal file
View File

@@ -0,0 +1,276 @@
"""
benchmark_v4/runner.py
======================
Executes models via Ollama CLI and orchestrates the benchmark loop.
Handles: warmup, GPU polling, cooldown, multi-run variance.
"""
import subprocess
import time
import re
import statistics
from datetime import datetime
from config import (
COOLDOWN_SECONDS, GPU_POLL_EVERY,
TEST_WEIGHTS, CATEGORIES,
)
from prompts import ALL_TESTS
from validators import normalize_text
from judge import warmup_judge
from scoring import (
score_test, compute_weighted, compute_category_scores,
compute_compliance, compute_variance_stats
)
from storage import insert_run, insert_details, insert_variance
# ============================================
# GPU MONITORING
# ============================================
# Last successful reading; -1 means "never read".
_gpu_cache = {"temp": -1, "mem": -1, "util": -1, "clock": -1}
# Number of get_gpu() calls so far, used to throttle nvidia-smi invocations.
_gpu_poll_count = 0


def get_gpu(force=False):
    """Return GPU stats, polling nvidia-smi at most every GPU_POLL_EVERY calls.

    Args:
        force: When true, always invoke nvidia-smi regardless of the throttle.

    Returns:
        Dict with integer ``temp``, ``mem``, ``util`` and ``clock`` entries.
        If nvidia-smi is unavailable or its output cannot be parsed, the last
        successful reading is returned (initially all -1).
    """
    global _gpu_cache, _gpu_poll_count
    _gpu_poll_count += 1
    # Poll on calls 1, 1+N, 1+2N, ... so the very first test already gets a
    # real reading instead of the -1 placeholders.
    if (not force and GPU_POLL_EVERY > 0
            and (_gpu_poll_count - 1) % GPU_POLL_EVERY != 0):
        return _gpu_cache
    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=temperature.gpu,memory.used,utilization.gpu,clocks.sm",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        # nvidia-smi prints one CSV line per GPU; report the first device.
        rows = result.stdout.strip().splitlines()
        first = rows[0] if rows else ""
        temp, mem, util, clock = (int(field.strip()) for field in first.split(","))
        _gpu_cache = {
            "temp": temp, "mem": mem,
            "util": util, "clock": clock
        }
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort: missing binary, timeout, or unparsable fields (wrong
        # count, non-numeric values) leave the previous reading untouched.
        pass
    return _gpu_cache
# ============================================
# PARSE OLLAMA VERBOSE
# ============================================
def parse_generation_speed(output):
    """
    Extract the generation (eval) rate from Ollama verbose output.

    Every "<decimal> tokens/s" figure is collected and the final one is
    taken as the generation rate. Returns a float, or None when the
    output contains no such figure.
    """
    rates = re.findall(r'(\d+\.\d+)\s+tokens/s', output)
    if not rates:
        return None
    return float(rates[-1])
# ============================================
# RUN SINGLE MODEL + PROMPT
# ============================================
def run_model(model, prompt):
    """Execute one prompt against a model via the Ollama CLI.

    Args:
        model: Model name passed to ``ollama run``.
        prompt: Prompt text passed as a single CLI argument.

    Returns:
        Dict with ``output`` (stdout and stderr joined by a newline),
        ``time`` (wall-clock seconds, 2 decimals), ``tok_s`` (parsed
        generation rate or None) and the ``gpu_*`` readings from get_gpu().
    """
    start = time.time()
    result = subprocess.run(
        ["ollama", "run", model, prompt, "--verbose"],
        # Decode explicitly as UTF-8: the output carries non-ASCII glyphs
        # (spinner characters) that the locale codec cannot always decode;
        # undecodable bytes are replaced rather than aborting the run.
        capture_output=True, text=True, encoding="utf-8", errors="replace"
    )
    elapsed = round(time.time() - start, 2)
    gpu = get_gpu()
    output = result.stdout + "\n" + result.stderr
    return {
        "output": output,
        "time": elapsed,
        "tok_s": parse_generation_speed(output),
        "gpu_temp": gpu["temp"],
        "gpu_mem": gpu["mem"],
        "gpu_util": gpu["util"],
        "gpu_clock": gpu["clock"]
    }
# ============================================
# BENCHMARK A GROUP OF MODELS
# ============================================
def run_benchmark(
    models,
    label,
    is_baseline,
    all_prompts,
    num_runs=1,
    no_cooldown=False
):
    """
    Run the benchmark for a list of models and persist the results.

    Args:
        models: Model names to benchmark, in order.
        label: Group label stored in the ``type`` column and shown in logs.
        is_baseline: Whether these runs are stored as baseline rows.
        all_prompts: Mapping of test name to prompt; tests with a missing
            or blank prompt are skipped.
        num_runs: How many times the full test list is repeated per model.
        no_cooldown: When true, skips the COOLDOWN_SECONDS sleep after each
            test. The fixed 5 s warmup pause and 30 s post-model pause
            still apply.

    Returns:
        List of run_ids (one per model).
    """
    run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    run_ids = []
    for model in models:
        # Accumulate across runs
        sem_by_test = {t: [] for t in ALL_TESTS}
        fmt_by_test = {t: [] for t in ALL_TESTS}
        tok_s_all = []
        temp_all = []
        detail_rows = []
        print(f"\n[{label}] Model: {model} ({num_runs} run{'s' if num_runs > 1 else ''})")
        # Warmup: a throwaway call made before any timed test
        subprocess.run(
            ["ollama", "run", model, "hello"],
            capture_output=True, text=True
        )
        time.sleep(5)
        warmup_judge()
        for run_num in range(1, num_runs + 1):
            if num_runs > 1:
                print(f"\n ── Run {run_num}/{num_runs} ──")
            for test_name in ALL_TESTS:
                prompt = all_prompts.get(test_name, "")
                if not prompt or not prompt.strip():
                    continue
                result = run_model(model, prompt)
                scores = score_test(test_name, prompt, result["output"])
                sem = scores["semantic_score"]
                fmt = scores["format_score"]
                sem_by_test[test_name].append(sem)
                fmt_by_test[test_name].append(fmt)
                if result["tok_s"]:
                    tok_s_all.append(result["tok_s"])
                # -1 means no GPU reading was available; keep it out of the mean
                if result["gpu_temp"] > 0:
                    temp_all.append(result["gpu_temp"])
                flag = "J" if scores["used_judge"] else "V"
                print(
                    f" [{run_num}] {test_name:<22} [{flag}] "
                    f"sem={sem:>2}/10 fmt={fmt:>2}/10 "
                    f"comb={scores['combined_score']:>5.2f} "
                    f"{scores['notes'][:52]}"
                )
                detail_rows.append({
                    "run_date": run_date,
                    "run_num": run_num,
                    "model": model,
                    "type": label,
                    "is_baseline": 1 if is_baseline else 0,
                    "test": test_name,
                    "weight": TEST_WEIGHTS.get(test_name, 0),
                    "time_s": result["time"],
                    "tok_s": result["tok_s"],
                    "gpu_temp": result["gpu_temp"],
                    "gpu_mem": result["gpu_mem"],
                    "gpu_util": result["gpu_util"],
                    "gpu_clock": result["gpu_clock"],
                    "output_length": len(result["output"]),
                    "semantic_score": sem,
                    "format_score": fmt,
                    "combined_score": scores["combined_score"],
                    "used_judge": 1 if scores["used_judge"] else 0,
                    "notes": scores["notes"][:120],
                })
                if not no_cooldown:
                    time.sleep(COOLDOWN_SECONDS)
        # Aggregate
        avg_sem = {t: round(statistics.mean(v), 2) for t, v in sem_by_test.items() if v}
        avg_fmt = {t: round(statistics.mean(v), 2) for t, v in fmt_by_test.items() if v}
        w_total, w_avg = compute_weighted(avg_sem)
        cat_scores = compute_category_scores(avg_sem)
        compliance = compute_compliance(sem_by_test)
        var_stats = compute_variance_stats(sem_by_test)
        # fmt_by_test always has a key per test, so emptiness must be checked
        # on the flattened scores; mean() of an empty list would raise.
        fmt_all = [s for v in fmt_by_test.values() for s in v]
        fmt_avg = round(statistics.mean(fmt_all), 2) if fmt_all else 0
        avg_tok = round(statistics.mean(tok_s_all), 1) if tok_s_all else 0
        avg_tmp = round(statistics.mean(temp_all), 1) if temp_all else 0
        print(f"\n ─── {model} ───")
        print(f" Weighted avg: {w_avg} (total={w_total})")
        print(f" Format avg: {fmt_avg}/10")
        print(f" Variance: mean={var_stats['mean']} σ={var_stats['stdev']} failures={var_stats['failure_rate']}%")
        print(f" Compliance: JSON={compliance.get('json_valid')}% YAML={compliance.get('yaml_valid')}% "
              f"tool={compliance.get('tool_format')}% hall={compliance.get('hallucination_free')}%")
        print(f" Categories: agent={cat_scores.get('agent_tool')} coding={cat_scores.get('coding')} "
              f"rag={cat_scores.get('rag_context')} struct={cat_scores.get('structured')} "
              f"hall={cat_scores.get('hallucination')} reason={cat_scores.get('reasoning')}")
        print(f" tok/s={avg_tok} temp={avg_tmp}°C")
        # Save to DB
        run_row = {
            "run_date": run_date,
            "model": model,
            "type": label,
            "is_baseline": 1 if is_baseline else 0,
            "num_runs": num_runs,
            "weighted_total": w_total,
            "weighted_avg": w_avg,
            "avg_format": fmt_avg,
            "mean_all": var_stats["mean"],
            "stdev_all": var_stats["stdev"],
            "min_score": var_stats["min"],
            "max_score": var_stats["max"],
            "failure_rate_pct": var_stats["failure_rate"],
            "compliance_json": compliance.get("json_valid"),
            "compliance_yaml": compliance.get("yaml_valid"),
            "compliance_tool": compliance.get("tool_format"),
            "compliance_hall": compliance.get("hallucination_free"),
            "cat_agent_tool": cat_scores.get("agent_tool"),
            "cat_coding": cat_scores.get("coding"),
            "cat_rag_context": cat_scores.get("rag_context"),
            "cat_structured": cat_scores.get("structured"),
            "cat_hallucination": cat_scores.get("hallucination"),
            "cat_reasoning": cat_scores.get("reasoning"),
            "avg_tok_s": avg_tok,
            "avg_gpu_temp": avg_tmp,
            "tests_run": len(avg_sem) * num_runs,
        }
        run_id = insert_run(run_row)
        insert_details(run_id, detail_rows)
        # Variance rows (only if multiple runs)
        if num_runs > 1:
            var_rows = []
            for test_name, scores_list in sem_by_test.items():
                # stdev needs at least two samples
                if len(scores_list) > 1:
                    var_rows.append({
                        "run_date": run_date,
                        "model": model,
                        "test": test_name,
                        "num_runs": num_runs,
                        "mean": round(statistics.mean(scores_list), 2),
                        "stdev": round(statistics.stdev(scores_list), 2),
                        "min_score": min(scores_list),
                        "max_score": max(scores_list),
                        "failure_rate_pct": round(
                            sum(1 for s in scores_list if s <= 2) / len(scores_list) * 100, 1
                        ),
                        "scores_raw": str(scores_list),
                    })
            if var_rows:
                insert_variance(var_rows)
        run_ids.append(run_id)
        # NOTE(review): this pause ignores no_cooldown — confirm that is intended.
        print(f"\nCooldown after {model}...\n")
        time.sleep(30)
    return run_ids

193
scoring.py Normal file
View File

@@ -0,0 +1,193 @@
"""
benchmark_v4/scoring.py
=======================
Combines validator, judge, and embedding into final scores.
Computes: format_score, semantic_score, combined_score.
Computes: category scores, weighted total, compliance, variance.
"""
import re
import statistics
from config import TEST_WEIGHTS, CATEGORIES, COMPLIANCE_GROUPS
from validators import normalize_text, run_validator
from judge import call_judge, embedding_score
from prompts import GROUND_TRUTHS
# ============================================
# FORMAT SCORE
# ============================================
def compute_format_score(output, prompt):
    """
    Rate how well the output obeys formatting rules, independent of content.

    Starts from 10 and subtracts penalties for: ANSI escapes in the raw
    output, exceeding a "Maximum N words" limit stated in the prompt by
    more than 30%, and fenced code blocks when the prompt says
    "No markdown" / "No explanation". Returns an int in 0-10.
    """
    text = normalize_text(output)
    penalty = 0

    # Raw (un-normalized) output still carries any escape codes the model emitted.
    if re.search(r'\x1b\[', output):
        penalty += 2

    limit_m = re.search(r'Maximum (\d+) words?', prompt, re.IGNORECASE)
    if limit_m:
        limit = int(limit_m.group(1))
        words = len(text.split())
        # 30% tolerance before any deduction; deduction is capped at 3.
        if words > limit * 1.3:
            penalty += min(3, int((words - limit) / limit * 5))

    prompt_lower = prompt.lower()
    forbids_markdown = "no markdown" in prompt_lower or "no explanation" in prompt_lower
    # More than two pieces after splitting means at least two fence markers.
    if forbids_markdown and "```" in text and len(text.split("```")) > 2:
        penalty += 2

    return max(0, 10 - penalty)
# ============================================
# COMBINED SCORE
# ============================================
def score_test(test_name, prompt, raw_output):
    """
    Main scoring pipeline:
    1. Run deterministic validator
    2. If the validator is not definitive, blend with the judge (80/20)
    3. For RAG without a validator score, blend judge with embedding similarity
    4. Compute format score separately
    5. Combined = semantic * 0.8 + format * 0.2

    Args:
        test_name: Name of the test being scored.
        prompt: Prompt that was sent to the model.
        raw_output: Unprocessed model output.

    Returns:
        Dict with ``semantic_score`` (int), ``format_score``,
        ``combined_score`` (2 decimals), ``used_judge`` (bool) and ``notes``.
    """
    # Normalize for quality assessment
    clean = normalize_text(raw_output)
    # Format score (always computed, separate dimension)
    fmt_score = compute_format_score(raw_output, prompt)
    # Validator
    val_score, skip_judge, val_notes = run_validator(test_name, clean)
    if val_score is not None and skip_judge:
        # Validator marked its result as definitive
        semantic = val_score
        used_judge = False
        notes = val_notes
    elif val_score is not None:
        # High-confidence tests: trust validator when score >= 8, skip judge
        high_confidence = {"compression", "artifact_mermaid", "tool_calling",
                           "yaml_generation", "multi_step_agent"}
        if test_name in high_confidence and val_score >= 8:
            semantic = val_score
            used_judge = False
            notes = val_notes
        else:
            # Partial validator score — blend with judge (80/20)
            j_score, j_reason = call_judge(test_name, prompt, clean)
            semantic = round(val_score * 0.8 + j_score * 0.2)
            used_judge = True
            # Space keeps the judge score from fusing with the reason text.
            notes = f"val={val_score} j={j_score} {j_reason[:55]}"
    elif test_name == "rag":
        ref = GROUND_TRUTHS.get("rag", "")
        e_sim = embedding_score(clean, ref)
        j_score, j_reason = call_judge(test_name, prompt, clean)
        # Weight judge more — embedding unreliable for technical content
        if e_sim == 0:
            semantic = j_score  # embedding failed, use judge only
        else:
            semantic = round(e_sim * 0.3 + j_score * 0.7)
        used_judge = True
        notes = f"embed={e_sim} j={j_score} {j_reason[:50]}"
    else:
        # Pure judge
        j_score, j_reason = call_judge(test_name, prompt, clean)
        semantic = j_score
        used_judge = True
        notes = j_reason[:80]
    # Combined: 80% semantic, 20% format
    combined = round(semantic * 0.8 + fmt_score * 0.2, 2)
    return {
        "semantic_score": int(semantic),
        "format_score": fmt_score,
        "combined_score": combined,
        "used_judge": used_judge,
        "notes": notes,
    }
# ============================================
# WEIGHTED + CATEGORY SCORES
# ============================================
def compute_weighted(semantic_scores):
    """
    Fold per-test semantic scores into a weighted total and average.

    Each score is scaled as (score / 10) * weight * 7, using the weight
    from TEST_WEIGHTS (0 for unknown tests). The average divides the total
    by the sum of the weights that were actually used.

    Returns (weighted_total, weighted_avg), both rounded to 2 decimals,
    or (0, 0) when the applicable weights sum to zero.
    """
    running_total = 0.0
    used_weight = 0.0
    for name, value in semantic_scores.items():
        weight = TEST_WEIGHTS.get(name, 0)
        running_total += (value / 10) * weight * 7
        used_weight += weight
    if not used_weight:
        return 0, 0
    weighted_total = round(running_total, 2)
    weighted_avg = round(running_total / used_weight, 2)
    return weighted_total, weighted_avg
def compute_category_scores(semantic_scores):
    """
    Average the semantic scores of the tests belonging to each category.

    Tests absent from ``semantic_scores`` are ignored; a category with no
    scored tests gets 0. Returns {category_name: avg_score} with averages
    rounded to 2 decimals.
    """
    def _category_mean(test_names):
        present = [semantic_scores[name] for name in test_names
                   if name in semantic_scores]
        if not present:
            return 0
        return round(sum(present) / len(present), 2)

    return {category: _category_mean(test_names)
            for category, test_names in CATEGORIES.items()}
def compute_compliance(semantic_scores_by_run):
    """
    Compliance = percentage of individual scores that are >= 8.

    Input: {test_name: [score_run1, score_run2, ...]}
    For every group in COMPLIANCE_GROUPS the scores of its tests are
    pooled; the result is the pass percentage (1 decimal), or None when
    the group has no scores at all.
    Returns: {group_name: percentage_or_None}
    """
    result = {}
    for group, test_names in COMPLIANCE_GROUPS.items():
        pooled = [
            score
            for name in test_names
            if name in semantic_scores_by_run
            for score in semantic_scores_by_run[name]
        ]
        if not pooled:
            result[group] = None
            continue
        passing = sum(1 for score in pooled if score >= 8)
        result[group] = round(passing / len(pooled) * 100, 1)
    return result
def compute_variance_stats(scores_by_test):
    """
    Summarize the spread of every score across all tests and runs.

    Input: {test_name: [score_run1, score_run2, ...]}
    Returns: {mean, stdev, min, max, failure_rate}. ``failure_rate`` is
    the percentage of scores <= 2. ``stdev`` is 0 for a single score, and
    every field is 0 when there are no scores.
    """
    pooled = []
    for per_test in scores_by_test.values():
        pooled.extend(per_test)

    if not pooled:
        return {"mean": 0, "stdev": 0, "min": 0, "max": 0, "failure_rate": 0}

    # Sample standard deviation is undefined for one data point.
    spread = round(statistics.stdev(pooled), 2) if len(pooled) > 1 else 0
    failures = sum(1 for score in pooled if score <= 2)
    return {
        "mean": round(statistics.mean(pooled), 2),
        "stdev": spread,
        "min": min(pooled),
        "max": max(pooled),
        "failure_rate": round(failures / len(pooled) * 100, 1),
    }

279
storage.py Normal file
View File

@@ -0,0 +1,279 @@
"""
benchmark_v4/storage.py
=======================
SQLite persistence for benchmark results.
Three tables:
- runs: one row per model per benchmark run
- details: one row per test per model per run
- variance: one row per test per model (multi-run stats)
Query examples:
SELECT model, weighted_avg, stdev_all
FROM runs
WHERE is_baseline = 1
ORDER BY weighted_avg DESC;
SELECT model, test, semantic_score
FROM details
WHERE run_id = (SELECT MAX(id) FROM runs WHERE model = 'granite4.1:8b');
"""
import json
import sqlite3
from contextlib import closing
from datetime import datetime

from config import DB_FILE
# ============================================
# SCHEMA
# ============================================
# DDL executed by init_db(): creates the runs, details and variance tables
# plus their lookup indexes. Every statement uses IF NOT EXISTS, so running
# the script against an existing database does not recreate anything.
SCHEMA = """
CREATE TABLE IF NOT EXISTS runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_date TEXT NOT NULL,
model TEXT NOT NULL,
type TEXT NOT NULL,
is_baseline INTEGER NOT NULL DEFAULT 0,
num_runs INTEGER NOT NULL DEFAULT 1,
-- Weighted scores
weighted_total REAL,
weighted_avg REAL,
-- Format
avg_format REAL,
-- Variance
mean_all REAL,
stdev_all REAL,
min_score REAL,
max_score REAL,
failure_rate_pct REAL,
-- Compliance (%)
compliance_json REAL,
compliance_yaml REAL,
compliance_tool REAL,
compliance_hall REAL,
-- Category scores
cat_agent_tool REAL,
cat_coding REAL,
cat_rag_context REAL,
cat_structured REAL,
cat_hallucination REAL,
cat_reasoning REAL,
-- Performance
avg_tok_s REAL,
avg_gpu_temp REAL,
tests_run INTEGER
);
CREATE TABLE IF NOT EXISTS details (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL REFERENCES runs(id),
run_date TEXT NOT NULL,
run_num INTEGER NOT NULL DEFAULT 1,
model TEXT NOT NULL,
type TEXT NOT NULL,
is_baseline INTEGER NOT NULL DEFAULT 0,
test TEXT NOT NULL,
weight REAL,
time_s REAL,
tok_s REAL,
gpu_temp INTEGER,
gpu_mem INTEGER,
gpu_util INTEGER,
gpu_clock INTEGER,
output_length INTEGER,
semantic_score INTEGER,
format_score INTEGER,
combined_score REAL,
used_judge INTEGER,
notes TEXT
);
CREATE TABLE IF NOT EXISTS variance (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_date TEXT NOT NULL,
model TEXT NOT NULL,
test TEXT NOT NULL,
num_runs INTEGER NOT NULL,
mean REAL,
stdev REAL,
min_score INTEGER,
max_score INTEGER,
failure_rate_pct REAL,
scores_raw TEXT
);
CREATE INDEX IF NOT EXISTS idx_runs_model ON runs(model);
CREATE INDEX IF NOT EXISTS idx_details_run ON details(run_id);
CREATE INDEX IF NOT EXISTS idx_details_model ON details(model);
CREATE INDEX IF NOT EXISTS idx_details_test ON details(test);
"""
# ============================================
# CONNECTION
# ============================================
def get_connection():
    """Open a connection to DB_FILE with Row access, WAL journaling and FK checks on."""
    connection = sqlite3.connect(DB_FILE)
    # Rows can then be converted with dict(row) by callers.
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    return connection
def init_db():
    """Create tables and indexes if they don't exist."""
    # "with conn" only commits or rolls back; closing() is what actually
    # releases the connection when the block ends.
    with closing(get_connection()) as conn, conn:
        conn.executescript(SCHEMA)
# ============================================
# WRITE
# ============================================
def insert_run(run_data):
    """Insert a run summary row.

    Args:
        run_data: Mapping that supplies every named parameter in the
            INSERT statement below.

    Returns:
        The id of the inserted ``runs`` row.
    """
    sql = """
        INSERT INTO runs (
            run_date, model, type, is_baseline, num_runs,
            weighted_total, weighted_avg, avg_format,
            mean_all, stdev_all, min_score, max_score, failure_rate_pct,
            compliance_json, compliance_yaml, compliance_tool, compliance_hall,
            cat_agent_tool, cat_coding, cat_rag_context,
            cat_structured, cat_hallucination, cat_reasoning,
            avg_tok_s, avg_gpu_temp, tests_run
        ) VALUES (
            :run_date, :model, :type, :is_baseline, :num_runs,
            :weighted_total, :weighted_avg, :avg_format,
            :mean_all, :stdev_all, :min_score, :max_score, :failure_rate_pct,
            :compliance_json, :compliance_yaml, :compliance_tool, :compliance_hall,
            :cat_agent_tool, :cat_coding, :cat_rag_context,
            :cat_structured, :cat_hallucination, :cat_reasoning,
            :avg_tok_s, :avg_gpu_temp, :tests_run
        )
    """
    # Inner "conn" context commits the insert; closing() then releases the
    # connection, which the sqlite3 context manager alone never does.
    with closing(get_connection()) as conn, conn:
        cursor = conn.execute(sql, run_data)
        return cursor.lastrowid
def insert_details(run_id, detail_rows):
    """Insert the per-test detail rows for a run.

    Args:
        run_id: Id of the parent ``runs`` row, added to every detail row.
        detail_rows: Iterable of mappings supplying the remaining named
            parameters of the INSERT statement; they are not mutated.
    """
    sql = """
        INSERT INTO details (
            run_id, run_date, run_num, model, type, is_baseline,
            test, weight, time_s, tok_s,
            gpu_temp, gpu_mem, gpu_util, gpu_clock, output_length,
            semantic_score, format_score, combined_score, used_judge, notes
        ) VALUES (
            :run_id, :run_date, :run_num, :model, :type, :is_baseline,
            :test, :weight, :time_s, :tok_s,
            :gpu_temp, :gpu_mem, :gpu_util, :gpu_clock, :output_length,
            :semantic_score, :format_score, :combined_score, :used_judge, :notes
        )
    """
    rows = [{**r, "run_id": run_id} for r in detail_rows]
    # Commit via the inner context, then close the connection explicitly.
    with closing(get_connection()) as conn, conn:
        conn.executemany(sql, rows)
def insert_variance(variance_rows):
    """Insert per-test variance rows.

    Args:
        variance_rows: Iterable of mappings supplying every named
            parameter of the INSERT statement below.
    """
    sql = """
        INSERT INTO variance (
            run_date, model, test, num_runs,
            mean, stdev, min_score, max_score, failure_rate_pct, scores_raw
        ) VALUES (
            :run_date, :model, :test, :num_runs,
            :mean, :stdev, :min_score, :max_score, :failure_rate_pct, :scores_raw
        )
    """
    # Commit via the inner context, then close the connection explicitly.
    with closing(get_connection()) as conn, conn:
        conn.executemany(sql, variance_rows)
# ============================================
# READ
# ============================================
def load_best_runs():
    """Load the best-scoring run per model, highest weighted_avg first.

    Exactly one row is returned per model: when several runs tie on the
    best weighted_avg, the one with the highest id wins. Models whose runs
    all have a NULL weighted_avg are omitted.

    Returns:
        List of dicts, one per model.
    """
    # Selecting by id through a correlated subquery avoids the duplicate
    # rows that a join on MAX(weighted_avg) yields for tied runs.
    sql = """
        SELECT r.*
        FROM runs r
        WHERE r.id = (
            SELECT r2.id
            FROM runs r2
            WHERE r2.model = r.model
              AND r2.weighted_avg IS NOT NULL
            ORDER BY r2.weighted_avg DESC, r2.id DESC
            LIMIT 1
        )
        ORDER BY r.weighted_avg DESC
    """
    # closing() releases the connection; sqlite3's own context manager does not.
    with closing(get_connection()) as conn:
        rows = conn.execute(sql).fetchall()
    return [dict(r) for r in rows]
def load_latest_runs(is_baseline=None):
    """Load the latest run per model, highest weighted_avg first.

    "Latest" is the row whose run_date equals the model's maximum run_date.

    Args:
        is_baseline: None returns all models; otherwise only latest rows
            whose is_baseline flag matches the truthiness of this value.

    Returns:
        List of dicts, one per matching row.
    """
    sql = """
        SELECT r.*
        FROM runs r
        INNER JOIN (
            SELECT model, MAX(run_date) AS latest
            FROM runs
            GROUP BY model
        ) latest ON r.model = latest.model AND r.run_date = latest.latest
    """
    params = []
    if is_baseline is not None:
        sql += " WHERE r.is_baseline = ?"
        params.append(1 if is_baseline else 0)
    sql += " ORDER BY r.weighted_avg DESC"
    # closing() releases the connection; sqlite3's own context manager does not.
    with closing(get_connection()) as conn:
        rows = conn.execute(sql, params).fetchall()
    return [dict(r) for r in rows]
def load_all_runs():
    """Load all run summaries as dicts, newest run_date first."""
    # closing() releases the connection; sqlite3's own context manager does not.
    with closing(get_connection()) as conn:
        rows = conn.execute(
            "SELECT * FROM runs ORDER BY run_date DESC"
        ).fetchall()
    return [dict(r) for r in rows]
def load_details_for_run(run_id):
    """Load all test detail rows for one run as dicts, ordered by test name.

    Args:
        run_id: Id of the ``runs`` row whose details are wanted.
    """
    # closing() releases the connection; sqlite3's own context manager does not.
    with closing(get_connection()) as conn:
        rows = conn.execute(
            "SELECT * FROM details WHERE run_id = ? ORDER BY test",
            (run_id,)
        ).fetchall()
    return [dict(r) for r in rows]
def export_summary_csv(filepath="benchmark_summary.csv"):
    """Write the latest run of every model to a CSV file.

    Column names are taken from the first row. Nothing is written when
    there are no runs; a message is printed instead.
    """
    import csv

    latest = load_latest_runs()
    if not latest:
        print("No runs to export.")
        return

    columns = latest[0].keys()
    with open(filepath, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(latest)
    print(f" Exported {len(latest)} rows to {filepath}")

467
validators.py Normal file
View File

@@ -0,0 +1,467 @@
"""
benchmark_v4/validators.py
==========================
Layer 1: Deterministic validators.
No LLM judge needed. Returns (score 0-10, notes str).
A score of 0 or 10 is definitive — judge is skipped.
Partial scores (1-9) trigger judge blending.
"""
import re
import json
try:
import yaml
YAML_AVAILABLE = True
except ImportError:
YAML_AVAILABLE = False
try:
from rapidfuzz import fuzz
FUZZY_AVAILABLE = True
except ImportError:
FUZZY_AVAILABLE = False
# ============================================
# TEXT NORMALIZATION
# ============================================
def normalize_text(text, mode="plain"):
    """
    Single entry point for cleaning raw model output.

    mode="plain" — removes ANSI escapes, control characters, spinner
                   glyphs, thinking sections and Ollama statistics lines
    mode="json"  — plain, then drops markdown fences and trims the result
    mode="yaml"  — same treatment as "json"
    """
    # ANSI escape sequences go first so later patterns see clean text.
    cleaned = re.sub(r'\x1b(?:[@-Z\\-_]|\[[0-9;?]*[A-Za-z])', '', text)
    # Remaining control characters (tab, newline and carriage return are kept).
    cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', cleaned)
    # Spinner / progress glyphs.
    cleaned = re.sub(r'[⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏]+', '', cleaned)
    # Unusual Unicode spaces become ordinary spaces.
    for odd_space in ('\u202f', '\u00a0', '\u2009'):
        cleaned = cleaned.replace(odd_space, ' ')
    # Thinking sections, in either of the two recognized notations.
    cleaned = re.sub(r'<think>.*?</think>', '', cleaned, flags=re.DOTALL)
    cleaned = re.sub(r'Thinking\.\.\..*?\.\.\.done thinking\.?', '', cleaned, flags=re.DOTALL)

    # Statistics lines are removed last, once everything else is clean.
    stat_markers = [
        "total duration:", "load duration:", "prompt eval",
        "eval count:", "eval duration:", "eval rate:", "tokens/s", "token(s)"
    ]
    kept = []
    for line in cleaned.split("\n"):
        lowered = line.lower()
        if not any(marker in lowered for marker in stat_markers):
            kept.append(line)
    cleaned = "\n".join(kept)

    if mode not in ("json", "yaml"):
        return cleaned

    # Structured modes: strip fence markers, then drop leftover noise lines.
    cleaned = re.sub(r'^```(?:json|yaml|)?\s*', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'```\s*$', '', cleaned, flags=re.MULTILINE)
    survivors = []
    for line in cleaned.split('\n'):
        stripped = line.strip()
        if stripped.startswith('[?') or stripped.startswith('```'):
            continue
        if re.search(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', line) or '\x1b' in line:
            continue
        survivors.append(line)
    return '\n'.join(survivors).strip()
# ============================================
# JSON EXTRACTION
# ============================================
def extract_json_object(text):
    """
    Pull the most relevant JSON object out of noisy model output.

    Copes with an echoed prompt or template, large whitespace blocks and
    several JSON objects in one response. Returns a dict, or None when no
    JSON object can be decoded.
    """
    # Strip fences and noise, then flatten each newline plus the whitespace
    # that follows it into one space so values broken across lines still parse.
    flat = re.sub(r'\n\s*', ' ', normalize_text(text, mode="json"))

    # If the keyword appears, start at the last '{' before its final
    # occurrence; this skips an echoed template earlier in the text.
    marker_at = flat.rfind('"recommendations"')
    cursor = 0
    if marker_at != -1:
        cursor = max(flat.rfind('{', 0, marker_at), 0)

    decoder = json.JSONDecoder()
    candidates = []
    # Decode every object that starts at or after the chosen position.
    while cursor < len(flat):
        brace = flat.find('{', cursor)
        if brace == -1:
            break
        try:
            obj, resume_at = decoder.raw_decode(flat, brace)
        except json.JSONDecodeError:
            cursor = brace + 1
            continue
        if isinstance(obj, dict):
            candidates.append(obj)
        cursor = resume_at

    if not candidates:
        return None

    # Prefer the last object with a populated recommendations list (at
    # least one entry naming a gpu) over an empty template.
    for obj in reversed(candidates):
        recs = obj.get("recommendations")
        if (isinstance(recs, list) and recs
                and any(item.get("gpu") for item in recs if isinstance(item, dict))):
            return obj
    return candidates[-1]
# ============================================
# VALIDATORS
# ============================================
def validate_tool_calling(text):
    """Expect one bare tool call and nothing else; returns (score, notes)."""
    text = normalize_text(text)
    non_blank = sum(1 for line in text.split('\n') if line.strip())
    if non_blank > 3:
        return 0, "multiple lines — explanation added"

    # Checked in order: a known tool name first, then any quoted-argument call.
    checks = (
        (r'(web_search|scrape_page|calculate)\s*\(["\'].*["\']\)',
         10, "valid tool call syntax"),
        (r'\w+\s*\(["\'].*["\']\)',
         5, "function call but wrong name"),
    )
    for pattern, score, note in checks:
        if re.search(pattern, text):
            return score, note
    return 0, "no valid function call found"
def validate_yaml(text):
    """Score how closely the output matches a valid YAML Deployment.

    Returns:
        (score, notes). 5 when PyYAML is not installed, 0 when the text is
        not parseable YAML, 3 when it parses to something other than a
        mapping; otherwise points accumulate for ``kind: Deployment``,
        ``spec``, ``spec.replicas == 2`` and ``apiVersion``, and a total of
        8 or more is promoted to 10.
    """
    if not YAML_AVAILABLE:
        return 5, "pyyaml not installed"
    # yaml mode already removes fences and terminal noise lines and trims
    # the result, so no further filtering is needed here.
    text = normalize_text(text, mode="yaml")
    try:
        parsed = yaml.safe_load(text)
    except yaml.YAMLError as e:
        return 0, f"invalid YAML: {str(e)[:60]}"
    if not isinstance(parsed, dict):
        return 3, "parsed but not a dict"
    score = 2
    if parsed.get('kind') == 'Deployment':
        score += 2
    if 'spec' in parsed:
        score += 2
        spec = parsed['spec']
        # "spec:" with no body parses to None and a scalar to str; only a
        # mapping can carry replicas.
        if isinstance(spec, dict) and spec.get('replicas') == 2:
            score += 1
    if 'apiVersion' in parsed:
        score += 1
    if score >= 8:
        score = 10
    return min(score, 10), f"valid YAML score={score}"
def validate_json_output(text):
    """
    Nested structured JSON with recommendations array.
    Expected: {"recommendations": [{"gpu":"","price_eur":0,"vram_gb":0,"pros":[],"cons":[]}]}
    Scores based on: valid JSON, correct structure, field types, 2 recommendations.

    Returns:
        (score, notes). Field and type checks look only at the first
        recommendation; a first entry that is not an object earns no field
        or type points.
    """
    parsed = extract_json_object(text)
    if parsed is None:
        return 0, "no valid JSON object found"
    # Check top-level structure
    if "recommendations" not in parsed:
        # Fallback: old flat format still gets partial credit
        old_fields = ["gpu", "price", "reason"]
        present = [k for k in old_fields if k in parsed and str(parsed[k]).strip()]
        if present:
            return 4, "flat JSON found (old format), missing nested structure"
        return 0, "no recommendations array found"
    recs = parsed["recommendations"]
    if not isinstance(recs, list) or len(recs) == 0:
        return 2, "recommendations present but empty or not a list"
    required_fields = {"gpu", "price_eur", "vram_gb", "pros", "cons"}
    score = 4  # base for having recommendations array
    # Check count (the list is non-empty at this point)
    score += 2 if len(recs) >= 2 else 1
    # Check field completeness on first recommendation; a non-object entry
    # (e.g. a plain string) is treated as having no fields.
    first = recs[0] if isinstance(recs[0], dict) else {}
    present = required_fields & set(first)
    score += int((len(present) / len(required_fields)) * 3)
    # Check type correctness
    type_ok = (
        isinstance(first.get("price_eur"), (int, float)) and
        isinstance(first.get("vram_gb"), (int, float)) and
        isinstance(first.get("pros"), list) and
        isinstance(first.get("cons"), list)
    )
    if type_ok:
        score += 1
    score = min(score, 10)
    # sorted() keeps the notes text stable between runs.
    return score, f"nested JSON: {len(recs)} recs, fields={sorted(present)}, types_ok={type_ok}"
def validate_json_schema(text):
    """Score a JSON Schema on the presence of the expected properties.

    Points: apiVersion 2, kind 3 (1 without an enum listing Deployment,
    Service and ConfigMap), metadata 2, spec 2, non-empty ``required`` 1.

    Returns:
        (score, notes); score 0 when no JSON object is found.
    """
    parsed = extract_json_object(text)
    if parsed is None:
        return 0, "no valid JSON Schema found"
    props = parsed.get('properties', {})
    # A malformed schema may carry a list or scalar here; treat as empty.
    if not isinstance(props, dict):
        props = {}
    score = 0
    if 'apiVersion' in props:
        score += 2
    if 'kind' in props:
        k = props['kind']
        enum = k.get('enum') if isinstance(k, dict) else None
        # Membership tests work even when the enum holds unhashable values.
        has_enum = isinstance(enum, list) and all(
            name in enum for name in ('Deployment', 'Service', 'ConfigMap')
        )
        score += 3 if has_enum else 1
    if 'metadata' in props:
        score += 2
    if 'spec' in props:
        score += 2
    if parsed.get('required'):
        score += 1
    return min(score, 10), f"JSON Schema score={score}/10"
def validate_mermaid(text):
    """Expect a fenced Mermaid diagram naming all 8 pipeline stages; returns (score, notes)."""
    text = normalize_text(text)
    lowered = text.lower()

    # Either an explicit mermaid fence, or any fence plus a diagram keyword.
    fenced = '```mermaid' in lowered
    if not fenced:
        fenced = '```' in text and ('graph' in lowered or 'flowchart' in lowered)
    if not fenced:
        return 2, "no mermaid fence found"

    stages = (
        "code push", "lint", "unit test", "build",
        "integration test", "deploy staging", "smoke test", "deploy production",
    )
    hits = [stage for stage in stages if stage in lowered]
    found = len(hits)
    return int((found / len(stages)) * 10), f"{found}/{len(stages)} stages found"
def validate_compression(text):
    """Expect exactly 10 "- " bullets mentioning all 7 industries; returns (score, notes)."""
    text = normalize_text(text)
    count = sum(1 for line in text.strip().split('\n')
                if line.strip().startswith('- '))

    lowered = text.lower()
    # Stems, so e.g. "financ" covers both "finance" and "financial".
    stems = ("healthcare", "financ", "transport", "manufactur",
             "education", "energy", "agricultur")
    industry_count = len([stem for stem in stems if stem in lowered])

    # The base score shrinks as the bullet count drifts away from 10.
    off_by = abs(count - 10)
    if off_by == 0:
        score = 5 + industry_count
    elif off_by <= 2:
        score = 3 + industry_count
    elif off_by == 3:
        score = 2 + industry_count
    else:
        score = max(0, 2 - off_by)
    return min(score, 10), f"{count} bullets, {industry_count}/7 industries"
def validate_multi_step(text):
    """Expect 3 distinct tool calls followed by a final answer; returns (score, notes)."""
    text = normalize_text(text)

    # Tool names taken from complete calls with a quoted argument.
    called = re.findall(
        r'(web_search|scrape_page|summarize)\s*\(["\'][^"\']*["\']\)', text
    )
    distinct = len(set(called))
    # Tool names followed by an opening parenthesis, complete call or not.
    opened = set(re.findall(r'(web_search|scrape_page|summarize)\s*\(', text))
    has_final = re.search(r'final\s*[:.]', text, re.IGNORECASE) is not None

    if has_final and distinct >= 3 and len(opened) >= 2:
        return 10, f"{distinct} distinct calls, final answer present"
    if has_final and distinct >= 2:
        return 7, f"only {distinct} distinct tool calls"
    if distinct >= 1:
        return 4, "tool calls found but incomplete"
    return 0, "no valid tool calls found"
def validate_hallucination(text):
    """
    The book does not exist.
    Reward uncertainty. Penalize confident invented specifics.

    Returns:
        (score, notes): 0 when only hallucination phrases are present, 10
        when only refusal phrases are present, and 5 when both or neither
        occur, so the result is not treated as definitive.
    """
    text_lower = text.lower()
    # Good refusal signals
    refuse_signals = [
        "does not exist", "doesn't exist", "not exist",
        "cannot verify", "can't verify", "cannot confirm",
        "not aware", "no record", "not familiar",
        "unable to find", "could not find", "not in my",
        "not certain", "uncertain", "cannot find",
        "don't have", "do not have", "not sure",
        "may not exist", "might not exist", "no evidence",
        "i cannot", "i can't", "i don't know",
        "fictional", "fabricated", "no such book",
        "not able to verify", "cannot be verified",
        "not a real", "i'm not aware", "i am not aware",
    ]
    # Hard hallucination signals — confident invented content
    hallucination_signals = [
        "the book explores", "kessler argues", "the main ideas",
        "the novel", "in the book", "the author presents",
        "the story follows", "themes include", "the book examines",
        "kessler's work", "the book describes",
        "this book", "the plot", "the protagonist",
    ]
    hallucinated = next((s for s in hallucination_signals if s in text_lower), None)
    refused = next((s for s in refuse_signals if s in text_lower), None)
    # Phrases like "this book" also occur in honest refusals ("I'm not aware
    # of this book"), so a response carrying both kinds of signal gets a
    # partial score rather than a definitive 0.
    if hallucinated and refused:
        return 5, f"mixed signals: refusal '{refused}' with '{hallucinated}'"
    if hallucinated:
        return 0, f"hallucinated: '{hallucinated}' found"
    if refused:
        return 10, f"correctly refused: '{refused}'"
    return 5, "ambiguous — no clear refusal or hallucination"
def validate_coding(text):
    """
    Score a response expected to contain a Python LIS function with complexity.

    Checks the normalized text for four markers: a ``def``, a ``return``,
    a mention of the longest-increasing-subsequence problem, and a mention
    of complexity.

    Returns (score, notes): 10 when all four markers are present, otherwise
    a weighted partial score (def=3, return=1, lis=2, complexity=2).
    """
    text = normalize_text(text)
    lowered = text.lower()
    has_def = 'def ' in text
    has_return = 'return' in text
    # Match "lis" only as a standalone run of letters (e.g. "LIS", "lis(",
    # "lis_length"); a plain substring test also fires on "list".
    has_lis = (
        'subsequence' in lowered
        or 'longest' in lowered
        or re.search(r'(?<![a-z])lis(?![a-z])', lowered) is not None
    )
    # The keyword list previously contained an empty string, which is a
    # substring of every text and made this check always true. "n\u00b2"
    # (n squared) is assumed to be the intended entry.
    has_complexity = any(
        w in lowered for w in ['o(n', 'n log n', 'n\u00b2', 'n^2', 'complexity']
    )
    if has_def and has_return and has_lis and has_complexity:
        return 10, "function correct with complexity"
    score = (3 if has_def else 0) + (1 if has_return else 0) + \
        (2 if has_lis else 0) + (2 if has_complexity else 0)
    return min(score, 9), f"def={has_def} lis={has_lis} complexity={has_complexity}"
def validate_context(text, expected_phrase):
    """
    Score how well the response contains the expected phrase.

    Comparison is case-insensitive on the normalized text. An exact
    substring hit scores 10. Otherwise, when fuzzy matching is available,
    the better of the partial and token-set ratios is mapped onto a score;
    without it, the score is the share of expected words longer than three
    characters that appear in the text.

    Returns (score, notes).
    """
    haystack = normalize_text(text).lower()
    needle = expected_phrase.lower()

    # Exact match
    if needle in haystack:
        return 10, "exact match"

    if FUZZY_AVAILABLE:
        best = max(
            fuzz.partial_ratio(needle, haystack),
            fuzz.token_set_ratio(needle, haystack),
        )
        # (minimum ratio, score, notes), checked from strictest to loosest.
        tiers = (
            (90, 10, f"fuzzy match {best}%"),
            (80, 9, f"fuzzy match {best}%"),
            (70, 7, f"partial match {best}%"),
            (55, 5, f"weak match {best}%"),
        )
        for floor, points, notes in tiers:
            if best >= floor:
                return points, notes
        return max(0, int(best / 12)), f"poor match {best}%"

    # Fallback token matching
    significant = [word for word in needle.split() if len(word) > 3]
    if not significant:
        return 5, "no key words to match"
    hits = sum(word in haystack for word in significant)
    return int((hits / len(significant)) * 10), f"{hits}/{len(significant)} tokens"
def validate_agent(text):
    """
    Flag responses that mention a GPU or VRAM size from the sub-16GB list.

    Matching is case-insensitive substring search on the normalized text.

    Returns (score, notes): 2 with the first listed marker that matched,
    or 7 when none matched, leaving quality to be assessed by the judge.
    """
    lowered = normalize_text(text).lower()
    # Order matters: the first marker found is the one reported.
    sub_16gb = (
        "rtx 4070 ti", "rtx 4070", "rtx 3060", "rtx 3070",
        "rtx 4060", "rx 6700", "rx 7700", "rx 6600",
        "12gb", "10gb", "8gb vram",
    )
    offender = next((marker for marker in sub_16gb if marker in lowered), None)
    if offender is not None:
        return 2, f"sub-16GB GPU found: '{offender}'"
    # No bad GPU — let judge evaluate quality
    return 7, "no sub-16GB GPU — judge for quality"
# ============================================
# DISPATCHER
# ============================================
# Test name -> deterministic validator. Each validator takes the raw model
# output and returns (score, notes). The three context tests share
# validate_context, each bound to the phrase it expects.
VALIDATOR_MAP = dict(
    tool_calling=validate_tool_calling,
    yaml_generation=validate_yaml,
    structured=validate_json_output,
    json_schema=validate_json_schema,
    artifact_mermaid=validate_mermaid,
    compression=validate_compression,
    multi_step_agent=validate_multi_step,
    hallucination=validate_hallucination,
    coding=validate_coding,
    agent=validate_agent,
    context_begin=lambda output: validate_context(output, "Project Aurora"),
    context_middle=lambda output: validate_context(output, "2.4 million"),
    context_end=lambda output: validate_context(output, "Nexora Systems"),
)
def run_validator(test_name, raw_output):
    """
    Apply the deterministic validator registered for test_name.

    Returns (score, skip_judge, notes). skip_judge is True only when the
    score is 0 or 10, i.e. the validator's verdict is definitive.
    Tests without a registered validator yield (None, False, "no validator").
    """
    validator = VALIDATOR_MAP.get(test_name)
    if validator is None:
        return None, False, "no validator"

    score, notes = validator(raw_output)
    return score, score in (0, 10), notes