299 lines
10 KiB
Python
299 lines
10 KiB
Python
"""
benchmark_v4/main.py
====================
Entry point. CLI argument parsing and orchestration.

Usage:
    python3 main.py                          # run baseline and new models (default --mode all)
    python3 main.py --test-all               # auto-discover and test all ollama models
    python3 main.py --mode baseline          # baseline only
    python3 main.py --mode new               # new models only
    python3 main.py --model granite4.1:8b    # single model
    python3 main.py --runs 3                 # variance analysis
    python3 main.py --no-cooldown            # fast run (no thermal wait)
    python3 main.py --report                 # show reports of latest run
    python3 main.py --report --report-best   # show best run per model
    python3 main.py --export                 # export CSV from DB
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import subprocess
|
|
import requests
|
|
|
|
from config import (
|
|
MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
|
|
MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
|
|
JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
|
|
)
|
|
from storage import init_db, load_latest_runs, export_summary_csv
|
|
from prompts import build_all_prompts
|
|
from runner import run_benchmark
|
|
from reporting import (
|
|
print_weights, print_comparison,
|
|
print_full_ranking, print_category_breakdown,
|
|
print_compliance_table, run_report
|
|
)
|
|
|
|
# PyYAML is optional: record whether it can be imported so the startup
# banner can report it.
try:
    import yaml
except ImportError:
    YAML_AVAILABLE = False
else:
    YAML_AVAILABLE = True
|
|
|
|
# rapidfuzz is optional: record whether it can be imported so the startup
# banner can report it.
try:
    from rapidfuzz import fuzz
except ImportError:
    FUZZY_AVAILABLE = False
else:
    FUZZY_AVAILABLE = True
|
|
|
|
|
|
# ============================================
|
|
# THINKING MODEL DETECTION
|
|
# ============================================
|
|
|
|
def detect_thinking_model(model_name):
    """
    Detect if a model supports thinking mode via the Ollama capabilities API.

    Posts to ``{OLLAMA_URL}/api/show`` and checks for ``"thinking"`` in the
    ``capabilities`` array of the JSON response.
    Fast — single API call, no generation needed.

    Args:
        model_name: Ollama model tag to look up (e.g. ``"granite4.1:8b"``).

    Returns:
        True if the response lists ``"thinking"`` among its capabilities.
        False otherwise, including when the request fails, the server
        answers with an HTTP error, or the body is not the expected JSON.
        Detection stays best-effort: failures never raise, but they are
        reported on stderr so a model is not silently treated as "direct".
    """
    try:
        r = requests.post(
            f"{OLLAMA_URL}/api/show",
            json={"name": model_name},
            timeout=10,
        )
        r.raise_for_status()
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding errors on requests versions where
        # the decode error is not a RequestException subclass.
        print(
            f"  [warn] capability detection failed for {model_name}: {exc}",
            file=sys.stderr,
        )
        return False

    # A missing or null "capabilities" field means "no thinking support".
    caps = payload.get("capabilities") if isinstance(payload, dict) else None
    return isinstance(caps, (list, tuple)) and "thinking" in caps
|
|
|
|
|
|
# ============================================
|
|
# MAIN
|
|
# ============================================
|
|
|
|
def _build_parser():
    """Build the argparse parser describing every CLI flag of the benchmark."""
    parser = argparse.ArgumentParser(
        description="LLM Benchmark V4 — Modular, SQLite-backed",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 main.py                          # full run (baseline + new models)
  python3 main.py --test-all               # auto-discover all ollama models
  python3 main.py --model granite4.1:8b    # single model
  python3 main.py --mode new               # new models only
  python3 main.py --runs 3                 # variance analysis (3 runs)
  python3 main.py --no-cooldown            # fast run, no thermal wait
  python3 main.py --report                 # show latest run reports
  python3 main.py --report --report-best   # show best run per model
  python3 main.py --export                 # export CSV from DB
""",
    )

    parser.add_argument(
        "--test-all", action="store_true", default=False,
        help="Auto-discover and benchmark all models in ollama list",
    )
    parser.add_argument(
        "--mode", choices=["baseline", "new", "all"],
        default="all",
        help="Which model group to run (default: all)",
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Run a single model by Ollama tag",
    )
    parser.add_argument(
        "--thinking", action="store_true", default=False,
        help="Override: mark single --model as thinking type",
    )
    parser.add_argument(
        "--runs", type=int, default=1,
        help="Number of runs per model for variance analysis (default: 1)",
    )
    parser.add_argument(
        "--no-cooldown", action="store_true", default=False,
        help="Skip cooldown between tests (faster but no thermal normalization)",
    )
    parser.add_argument(
        "--report", action="store_true", default=False,
        help="Show ranking reports from DB without running any models",
    )
    parser.add_argument(
        "--report-best", action="store_true", default=False,
        help="Show best run per model instead of latest (use with --report)",
    )
    parser.add_argument(
        "--export", action="store_true", default=False,
        help="Export latest results to benchmark_summary.csv and exit",
    )
    return parser


def _print_banner(title, leading_blank=True):
    """Print *title* framed by two 50-character ``=`` rules."""
    rule = "=" * 50
    print(("\n" + rule) if leading_blank else rule)
    print(title)
    print(rule)


def _discover_ollama_models():
    """Return the model tags printed by ``ollama list``, minus judge/embed models.

    The first output line is treated as the table header and skipped; the
    first whitespace-separated field of every other line is the model tag.
    A tag is dropped when it contains EMBED_MODEL, JUDGE_MODEL or
    "nomic-embed-text" as a substring. Returns an empty list when the
    command cannot be run or exits with an error (reported on stderr).
    """
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True, text=True,
        )
    except OSError as exc:
        # e.g. the ollama binary is not installed or not executable.
        print(f"Could not run 'ollama list': {exc}", file=sys.stderr)
        return []

    if result.returncode != 0:
        print(
            f"'ollama list' exited with code {result.returncode}: "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )
        return []

    # Drop empty entries: "" is a substring of every name and would
    # otherwise filter out all models.
    skip = [s for s in (EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text") if s]

    discovered = []
    for line in result.stdout.strip().splitlines()[1:]:
        parts = line.split()
        if not parts:
            continue
        model_name = parts[0]
        if not any(s in model_name for s in skip):
            discovered.append(model_name)
    return discovered


def main():
    """CLI entry point: parse arguments, then report, export or run benchmarks.

    Flow:
      1. Parse arguments and reject a non-positive ``--runs``.
      2. Initialise the database.
      3. ``--report`` / ``--report-best``: print the ranking, category and
         compliance reports, export the summary CSV, and return.
         ``--export``: export the summary CSV and return.
      4. Otherwise print the run configuration and benchmark models:
         ``--test-all`` runs every model found by ``ollama list`` (known
         baseline models first), ``--model`` runs a single tag, and
         ``--mode`` selects the configured baseline and/or new groups.
      5. Print the comparison against the previously stored baseline runs
         and the final report.

    Exits with status 2 (argparse convention) on invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Validate before touching the database: zero or negative run counts
    # cannot produce a meaningful benchmark.
    if args.runs < 1:
        parser.error("--runs must be a positive integer")

    # Init database
    init_db()

    # ── Report / export only modes ──
    # Must come before benchmark logic
    if args.report or args.report_best:
        print_full_ranking(best=args.report_best)
        print_category_breakdown()
        print_compliance_table()
        export_summary_csv()
        return

    if args.export:
        export_summary_csv()
        return

    # ── Setup ──
    # Loaded before any new run so the comparison uses the previous state.
    existing_baseline = load_latest_runs(is_baseline=True)
    all_prompts = build_all_prompts()

    print("\nLLM Benchmark V4")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Embed: {EMBED_MODEL}")
    print(f"DB: {DB_FILE}")
    print(f"Runs: {args.runs}")
    print(f"Fuzzy: {FUZZY_AVAILABLE} | YAML: {YAML_AVAILABLE}")
    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
    print(f"Previous baseline runs: {len(existing_baseline)}")

    print_weights()

    all_new_run_ids = []

    def _run(models, label, baseline):
        # Benchmark *models* and remember the run ids for the final comparison.
        ids = run_benchmark(
            models=models,
            label=label,
            is_baseline=baseline,
            all_prompts=all_prompts,
            num_runs=args.runs,
            no_cooldown=args.no_cooldown,
        )
        all_new_run_ids.extend(ids)

    def _run_group(title, models, label, baseline):
        # Configured groups may be empty; print nothing for those.
        if not models:
            return
        _print_banner(title)
        _run(models, label, baseline)

    def _run_new_groups():
        _run_group(" NEW — DIRECT", MODELS_NEW_DIRECT, "direct", False)
        _run_group(" NEW — THINKING", MODELS_NEW_THINKING, "thinking", False)

    def _finish():
        # ── Final reports ──
        print_comparison(all_new_run_ids, existing_baseline)
        run_report()

    # ── Auto-discover all Ollama models ──
    if args.test_all:
        discovered = _discover_ollama_models()
        if not discovered:
            print("No models found in ollama list.")
            return

        # Auto-detect thinking capability for each model
        print("\nDetecting model capabilities...")
        known_baseline = list(MODELS_BASELINE_DIRECT) + list(MODELS_BASELINE_THINKING)
        labels = {}
        baseline_models = []
        new_models = []
        for m in discovered:
            is_thinking = detect_thinking_model(m)
            is_baseline = m in known_baseline
            labels[m] = "thinking" if is_thinking else "direct"
            (baseline_models if is_baseline else new_models).append(m)

            tag = "🧠" if is_thinking else "⚡"
            base = "✅" if is_baseline else " "
            print(f" {tag}{base} {m}")

        print()

        # Run baseline models first, then new
        if baseline_models:
            _print_banner(" KNOWN BASELINE MODELS", leading_blank=False)
            for m in baseline_models:
                _run([m], labels[m], True)

        if new_models:
            _print_banner(" NEW / UNKNOWN MODELS", leading_blank=False)
            for m in new_models:
                _run([m], labels[m], False)

        _finish()
        return

    # ── Single model mode ──
    if args.model:
        # Auto-detect thinking unless --thinking flag explicitly set
        # (short-circuit: no API call when the flag is given).
        if args.thinking or detect_thinking_model(args.model):
            label = "thinking"
        else:
            label = "direct"

        is_baseline = args.model in (
            MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING
        )
        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
        _run([args.model], label, is_baseline)

    else:
        # ── Baseline models ──
        if args.mode in ("baseline", "all"):
            _run_group(" BASELINE — DIRECT", MODELS_BASELINE_DIRECT, "direct", True)
            _run_group(" BASELINE — THINKING", MODELS_BASELINE_THINKING, "thinking", True)

        # ── New models ──
        if args.mode in ("new", "all"):
            _run_new_groups()

    _finish()
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()