# Source: llm-benchmark/main.py (Python, 299 lines, 10 KiB)

"""
benchmark_v4/main.py
====================
Entry point. CLI argument parsing and orchestration.
Usage:
python3 main.py # run all configured models (baseline + new)
python3 main.py --test-all # auto-discover and test all ollama models
python3 main.py --mode baseline # baseline only
python3 main.py --mode new # new models only
python3 main.py --model granite4.1:8b # single model
python3 main.py --runs 3 # variance analysis
python3 main.py --no-cooldown # fast run (no thermal wait)
python3 main.py --report # show reports of latest run
python3 main.py --report --report-best # show best run per model
python3 main.py --export # export CSV from DB
"""
import argparse
import sys
import subprocess
import requests
from config import (
MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
)
from storage import init_db, load_latest_runs, export_summary_csv
from prompts import build_all_prompts
from runner import run_benchmark
from reporting import (
print_weights, print_comparison,
print_full_ranking, print_category_breakdown,
print_compliance_table, run_report
)
# Optional dependencies: each import is attempted once and only its
# availability is recorded, so a missing package does not abort start-up.
# In the code visible in this file the two flags are only printed in the
# start-up banner of main().
try:
    import yaml
    YAML_AVAILABLE = True
except ImportError:
    # PyYAML is not installed.
    YAML_AVAILABLE = False
try:
    from rapidfuzz import fuzz
    FUZZY_AVAILABLE = True
except ImportError:
    # rapidfuzz is not installed.
    FUZZY_AVAILABLE = False
# ============================================
# THINKING MODEL DETECTION
# ============================================
def detect_thinking_model(model_name):
    """
    Detect if a model supports thinking mode via the Ollama capabilities API.

    Sends a single POST to ``{OLLAMA_URL}/api/show`` (10 second timeout) and
    checks whether the string 'thinking' appears in the ``capabilities``
    array of the JSON reply. Fast — single API call, no generation needed.

    Args:
        model_name: Ollama model tag to look up (e.g. "granite4.1:8b").

    Returns:
        True if 'thinking' is listed in the model's capabilities.
        False otherwise, including when the request fails or the reply is
        not usable; a request/decoding failure is reported on stderr so the
        fallback to "direct" is visible instead of silent.
    """
    try:
        # NOTE(review): the request body uses the "name" key; newer Ollama
        # API docs appear to call this field "model" — confirm against the
        # server version in use.
        r = requests.post(
            f"{OLLAMA_URL}/api/show",
            json={"name": model_name},
            timeout=10,
        )
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Connection problems, timeouts and undecodable bodies are expected
        # failure modes for a best-effort probe; anything else is a bug and
        # is allowed to propagate.
        print(
            f"Warning: could not query capabilities for {model_name!r} "
            f"({exc}); assuming no thinking support",
            file=sys.stderr,
        )
        return False

    # An error reply or an unexpected JSON shape simply means "not thinking".
    if not isinstance(payload, dict):
        return False
    caps = payload.get("capabilities")
    return isinstance(caps, list) and "thinking" in caps
# ============================================
# MAIN
# ============================================
def _build_parser():
    """Build the argparse parser that defines the benchmark CLI flags."""
    parser = argparse.ArgumentParser(
        description="LLM Benchmark V4 — Modular, SQLite-backed",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\nExamples:\n"
            # With no flags --mode defaults to "all": baseline AND new groups.
            "python3 main.py # run all configured models (baseline + new)\n"
            "python3 main.py --test-all # auto-discover all ollama models\n"
            "python3 main.py --model granite4.1:8b # single model\n"
            "python3 main.py --mode new # new models only\n"
            "python3 main.py --runs 3 # variance analysis (3 runs)\n"
            "python3 main.py --no-cooldown # fast run, no thermal wait\n"
            "python3 main.py --report # show latest run reports\n"
            "python3 main.py --report --report-best # show best run per model\n"
            "python3 main.py --export # export CSV from DB\n"
        ),
    )
    parser.add_argument(
        "--test-all", action="store_true",
        help="Auto-discover and benchmark all models in ollama list",
    )
    parser.add_argument(
        "--mode", choices=["baseline", "new", "all"],
        default="all",
        help="Which model group to run (default: all)",
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Run a single model by Ollama tag",
    )
    parser.add_argument(
        "--thinking", action="store_true",
        help="Override: mark single --model as thinking type",
    )
    parser.add_argument(
        "--runs", type=int, default=1,
        help="Number of runs per model for variance analysis (default: 1)",
    )
    parser.add_argument(
        "--no-cooldown", action="store_true",
        help="Skip cooldown between tests (faster but no thermal normalization)",
    )
    parser.add_argument(
        "--report", action="store_true",
        help="Show ranking reports from DB without running any models",
    )
    parser.add_argument(
        "--report-best", action="store_true",
        help="Show best run per model instead of latest (use with --report)",
    )
    parser.add_argument(
        "--export", action="store_true",
        help="Export latest results to benchmark_summary.csv and exit",
    )
    return parser


def _discover_ollama_models():
    """
    Return the model tags printed by `ollama list`, minus judge/embed models.

    The first output line is treated as a header and skipped; the first
    whitespace-separated column of every other line is taken as the tag.
    Returns an empty list (after a message on stderr) when the command
    cannot be started or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True, text=True,
        )
    except OSError as exc:
        # Typically FileNotFoundError: the ollama binary is not on PATH.
        print(f"Could not run 'ollama list': {exc}", file=sys.stderr)
        return []
    if result.returncode != 0:
        print(
            f"'ollama list' exited with status {result.returncode}: "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )
        return []

    # Substring match, so a configured name without a tag still excludes
    # "name:tag". Empty entries are dropped: "" is a substring of every
    # name and would otherwise exclude all models.
    skip = [s for s in (EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text") if s]

    discovered = []
    for line in result.stdout.strip().split('\n')[1:]:
        parts = line.split()
        if parts and not any(s in parts[0] for s in skip):
            discovered.append(parts[0])
    return discovered


def main():
    """
    CLI entry point: parse arguments, then report, export, or run benchmarks.

    Order of precedence after argument parsing and init_db():
      1. --report / --report-best: print the ranking, category and
         compliance reports, export the summary CSV, and return.
      2. --export: export the summary CSV and return.
      3. --test-all: benchmark every model found by `ollama list`, known
         baseline models first, each labelled by detected capability.
      4. --model: benchmark that single model.
      5. Otherwise: run the configured groups selected by --mode.
    Benchmark paths (3-5) finish with print_comparison() and run_report().

    Exits with status 2 (argparse error) when --runs is less than 1.
    """
    parser = _build_parser()
    args = parser.parse_args()
    # Reject a non-positive run count before touching the database.
    if args.runs < 1:
        parser.error("--runs must be at least 1")

    # Init database
    init_db()

    # ── Report / export only modes ─────────────────────────────────
    # Must come before benchmark logic
    if args.report or args.report_best:
        print_full_ranking(best=args.report_best)
        print_category_breakdown()
        print_compliance_table()
        export_summary_csv()
        return
    if args.export:
        export_summary_csv()
        return

    # ── Setup ──────────────────────────────────────────────────────
    existing_baseline = load_latest_runs(is_baseline=True)
    all_prompts = build_all_prompts()
    print("\nLLM Benchmark V4")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Embed: {EMBED_MODEL}")
    print(f"DB: {DB_FILE}")
    print(f"Runs: {args.runs}")
    print(f"Fuzzy: {FUZZY_AVAILABLE} | YAML: {YAML_AVAILABLE}")
    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
    print(f"Previous baseline runs: {len(existing_baseline)}")
    print_weights()

    # Run ids produced in this invocation, fed to print_comparison() below.
    all_new_run_ids = []

    def _run(models, label, baseline):
        """Benchmark `models` and remember the run ids that come back."""
        ids = run_benchmark(
            models=models,
            label=label,
            is_baseline=baseline,
            all_prompts=all_prompts,
            num_runs=args.runs,
            no_cooldown=args.no_cooldown,
        )
        all_new_run_ids.extend(ids)

    def _run_group(title, models, label, baseline):
        """Print a section banner and run a configured group, if non-empty."""
        if not models:
            return
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)
        _run(models, label, baseline)

    if args.test_all:
        # ── Auto-discover all Ollama models ────────────────────────
        discovered = _discover_ollama_models()
        if not discovered:
            print("No models found in ollama list.")
            return

        # Auto-detect thinking capability for each model
        print("\nDetecting model capabilities...")
        baseline_names = MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING
        model_info = {}
        for m in discovered:
            is_thinking = detect_thinking_model(m)
            is_baseline = m in baseline_names
            model_info[m] = {
                "thinking": is_thinking,
                "is_baseline": is_baseline,
                "label": "thinking" if is_thinking else "direct",
            }
            tag = "🧠" if is_thinking else ""
            # NOTE(review): the baseline marker is an empty string, so
            # baseline and new models differ only by one space here —
            # possibly a marker glyph was lost; confirm intended output.
            base = "" if is_baseline else " "
            print(f" {tag}{base} {m}")
        print()

        # Run baseline models first, then new
        baseline_models = [m for m in discovered if model_info[m]["is_baseline"]]
        new_models = [m for m in discovered if not model_info[m]["is_baseline"]]
        if baseline_models:
            print("=" * 50)
            print(" KNOWN BASELINE MODELS")
            print("=" * 50)
            for m in baseline_models:
                _run([m], model_info[m]["label"], True)
        if new_models:
            print("=" * 50)
            print(" NEW / UNKNOWN MODELS")
            print("=" * 50)
            for m in new_models:
                _run([m], model_info[m]["label"], False)
    elif args.model:
        # ── Single model mode ──────────────────────────────────────
        # Auto-detect thinking unless --thinking flag explicitly set
        if args.thinking:
            label = "thinking"
        else:
            label = "thinking" if detect_thinking_model(args.model) else "direct"
        is_baseline = args.model in (
            MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING
        )
        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
        _run([args.model], label, is_baseline)
    else:
        # ── Configured groups: baseline first, then new ────────────
        if args.mode in ("baseline", "all"):
            _run_group(" BASELINE — DIRECT", MODELS_BASELINE_DIRECT, "direct", True)
            _run_group(" BASELINE — THINKING", MODELS_BASELINE_THINKING, "thinking", True)
        if args.mode in ("new", "all"):
            _run_group(" NEW — DIRECT", MODELS_NEW_DIRECT, "direct", False)
            _run_group(" NEW — THINKING", MODELS_NEW_THINKING, "thinking", False)

    # ── Final reports ──────────────────────────────────────────────
    print_comparison(all_new_run_ids, existing_baseline)
    run_report()
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()