"""
benchmark_v4/main.py
====================
Entry point. CLI argument parsing and orchestration.

Usage:
    python3 main.py                          # run all configured models (baseline + new)
    python3 main.py --test-all               # auto-discover and test all ollama models
    python3 main.py --mode baseline          # baseline only
    python3 main.py --mode new               # new models only
    python3 main.py --model granite4.1:8b    # single model
    python3 main.py --runs 3                 # variance analysis
    python3 main.py --no-cooldown            # fast run (no thermal wait)
    python3 main.py --report                 # show reports of latest run
    python3 main.py --report --report-best   # show best run per model
    python3 main.py --export                 # export CSV from DB
"""

import argparse
import sys
import subprocess
import requests

from config import (
    MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
    MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
    JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
)
from storage import init_db, load_latest_runs, export_summary_csv
from prompts import build_all_prompts
from runner import run_benchmark
from reporting import (
    print_weights, print_comparison, print_full_ranking,
    print_category_breakdown, print_compliance_table, run_report
)

# Optional dependencies: only their availability is reported in the run header.
try:
    import yaml
    YAML_AVAILABLE = True
except ImportError:
    YAML_AVAILABLE = False

try:
    from rapidfuzz import fuzz
    FUZZY_AVAILABLE = True
except ImportError:
    FUZZY_AVAILABLE = False


# ============================================
# THINKING MODEL DETECTION
# ============================================

def detect_thinking_model(model_name):
    """
    Detect if a model supports thinking mode via the Ollama capabilities API.

    Posts the model name to ``{OLLAMA_URL}/api/show`` and checks whether
    'thinking' appears in the returned ``capabilities`` value. A single
    API call with a 10 second timeout; no generation is performed.

    Args:
        model_name: Ollama model tag to query.

    Returns:
        True if 'thinking' is listed in the model's capabilities,
        False otherwise -- including when the request or the parsing of
        its response fails for any reason.
    """
    try:
        r = requests.post(
            f"{OLLAMA_URL}/api/show",
            json={"name": model_name},
            timeout=10
        )
        caps = r.json().get("capabilities", [])
        return "thinking" in caps
    except Exception:
        # Deliberate best-effort: any failure (network error, bad JSON,
        # unexpected payload shape) is treated as "not a thinking model".
        return False


# ============================================
# CLI HELPERS
# ============================================

def _positive_int(value):
    """argparse type: parse ``value`` as an int and require it to be >= 1."""
    try:
        number = int(value)
    except ValueError:
        # Keep the wording argparse itself uses for a plain ``type=int``.
        raise argparse.ArgumentTypeError(
            f"invalid int value: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {number}")
    return number


def _build_parser():
    """Create the argument parser describing every CLI flag of this script."""
    parser = argparse.ArgumentParser(
        description="LLM Benchmark V4 — Modular, SQLite-backed",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 main.py                        # all configured models (baseline + new)
  python3 main.py --test-all             # auto-discover all ollama models
  python3 main.py --model granite4.1:8b  # single model
  python3 main.py --mode new             # new models only
  python3 main.py --runs 3               # variance analysis (3 runs)
  python3 main.py --no-cooldown          # fast run, no thermal wait
  python3 main.py --report               # show latest run reports
  python3 main.py --report --report-best # show best run per model
  python3 main.py --export               # export CSV from DB
        """
    )
    parser.add_argument(
        "--test-all", action="store_true", default=False,
        help="Auto-discover and benchmark all models in ollama list"
    )
    parser.add_argument(
        "--mode", choices=["baseline", "new", "all"], default="all",
        help="Which model group to run (default: all)"
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Run a single model by Ollama tag"
    )
    parser.add_argument(
        "--thinking", action="store_true", default=False,
        help="Override: mark single --model as thinking type"
    )
    parser.add_argument(
        "--runs", type=_positive_int, default=1,
        help="Number of runs per model for variance analysis (default: 1)"
    )
    parser.add_argument(
        "--no-cooldown", action="store_true", default=False,
        help="Skip cooldown between tests (faster but no thermal normalization)"
    )
    parser.add_argument(
        "--report", action="store_true", default=False,
        help="Show ranking reports from DB without running any models"
    )
    parser.add_argument(
        "--report-best", action="store_true", default=False,
        help="Show best run per model instead of latest (use with --report)"
    )
    parser.add_argument(
        "--export", action="store_true", default=False,
        help="Export latest results to benchmark_summary.csv and exit"
    )
    return parser


def _parse_ollama_list(listing, skip_markers):
    """
    Extract model names from the text output of ``ollama list``.

    The first line is treated as a header and dropped; the first
    whitespace-separated field of every remaining non-blank line is taken
    as the model name. Names containing any of ``skip_markers`` as a
    substring are excluded.
    """
    names = []
    for line in listing.strip().split("\n")[1:]:
        fields = line.split()
        if not fields:
            continue
        name = fields[0]
        if not any(marker in name for marker in skip_markers):
            names.append(name)
    return names


def _discover_ollama_models():
    """
    Run ``ollama list`` and return the model names worth benchmarking.

    The embedding model, the judge model and 'nomic-embed-text' are
    filtered out (substring match).

    Raises:
        OSError: the ``ollama`` executable could not be started.
        RuntimeError: ``ollama list`` exited with a non-zero status.
    """
    result = subprocess.run(
        ["ollama", "list"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(
            result.stderr.strip() or f"exit status {result.returncode}"
        )
    # Drop unset markers: an empty string is a substring of every name and
    # would filter out all models; None would make the `in` test raise.
    skip_markers = [
        marker
        for marker in (EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text")
        if marker
    ]
    return _parse_ollama_list(result.stdout, skip_markers)


# ============================================
# MAIN
# ============================================

def main():
    """
    Parse the command line and run the requested action.

    Order of precedence:
      1. ``--report`` / ``--report-best``: print reports, export CSV, return.
      2. ``--export``: export CSV, return.
      3. ``--test-all``: benchmark every model found by ``ollama list``.
      4. ``--model``: benchmark a single model.
      5. ``--mode``: benchmark the configured baseline and/or new groups.

    Every benchmarking path ends with the comparison against the
    previously stored baseline runs followed by the run report.
    """
    args = _build_parser().parse_args()

    # Init database
    init_db()

    # -- Report / export only modes (must come before benchmark logic) --
    if args.report or args.report_best:
        print_full_ranking(best=args.report_best)
        print_category_breakdown()
        print_compliance_table()
        export_summary_csv()
        return

    if args.export:
        export_summary_csv()
        return

    # -- Setup --
    existing_baseline = load_latest_runs(is_baseline=True)
    all_prompts = build_all_prompts()

    print("\nLLM Benchmark V4")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Embed: {EMBED_MODEL}")
    print(f"DB: {DB_FILE}")
    print(f"Runs: {args.runs}")
    print(f"Fuzzy: {FUZZY_AVAILABLE} | YAML: {YAML_AVAILABLE}")
    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
    print(f"Previous baseline runs: {len(existing_baseline)}")
    print_weights()

    all_new_run_ids = []
    # Built once; membership decides whether a model counts as baseline.
    known_baseline = (*MODELS_BASELINE_DIRECT, *MODELS_BASELINE_THINKING)

    def _run(models, label, baseline):
        """Benchmark ``models`` and remember the ids of the created runs."""
        ids = run_benchmark(
            models=models,
            label=label,
            is_baseline=baseline,
            all_prompts=all_prompts,
            num_runs=args.runs,
            no_cooldown=args.no_cooldown,
        )
        all_new_run_ids.extend(ids)

    def _run_group(title, models, label, baseline):
        """Print a banner and benchmark a configured group, if non-empty."""
        if not models:
            return
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)
        _run(models, label, baseline)

    def _run_new_groups():
        """Benchmark the configured 'new' direct and thinking groups."""
        _run_group(" NEW — DIRECT", MODELS_NEW_DIRECT, "direct", False)
        _run_group(" NEW — THINKING", MODELS_NEW_THINKING, "thinking", False)

    # -- Auto-discover all Ollama models --
    if args.test_all:
        try:
            discovered = _discover_ollama_models()
        except (OSError, RuntimeError) as err:
            # Exits with status 1 and the message on stderr.
            sys.exit(f"Could not list Ollama models: {err}")

        if not discovered:
            print("No models found in ollama list.")
            return

        # Auto-detect thinking capability for each model
        print("\nDetecting model capabilities...")
        model_info = {}
        for m in discovered:
            is_thinking = detect_thinking_model(m)
            is_baseline = m in known_baseline
            model_info[m] = {
                "thinking": is_thinking,
                "is_baseline": is_baseline,
                "label": "thinking" if is_thinking else "direct",
            }
            tag = "🧠" if is_thinking else "⚡"
            base = "★" if is_baseline else " "
            print(f" {tag}{base} {m}")
        print()

        # Run baseline models first, then new
        baseline_models = [m for m in discovered if model_info[m]["is_baseline"]]
        new_models = [m for m in discovered if not model_info[m]["is_baseline"]]

        if baseline_models:
            print("=" * 50)
            print(" KNOWN BASELINE MODELS")
            print("=" * 50)
            for m in baseline_models:
                _run([m], model_info[m]["label"], True)

        if new_models:
            print("=" * 50)
            print(" NEW / UNKNOWN MODELS")
            print("=" * 50)
            for m in new_models:
                _run([m], model_info[m]["label"], False)

        print_comparison(all_new_run_ids, existing_baseline)
        run_report()
        return

    # -- Single model mode --
    if args.model:
        # Auto-detect thinking unless --thinking flag explicitly set
        if args.thinking or detect_thinking_model(args.model):
            label = "thinking"
        else:
            label = "direct"
        is_baseline = args.model in known_baseline
        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
        _run([args.model], label, is_baseline)

    # -- Baseline models (followed by new models when mode is "all") --
    elif args.mode in ["baseline", "all"]:
        _run_group(" BASELINE — DIRECT", MODELS_BASELINE_DIRECT, "direct", True)
        _run_group(
            " BASELINE — THINKING", MODELS_BASELINE_THINKING, "thinking", True
        )
        if args.mode == "all":
            _run_new_groups()

    # -- New models only --
    elif args.mode == "new":
        _run_new_groups()

    # -- Final reports --
    print_comparison(all_new_run_ids, existing_baseline)
    run_report()


if __name__ == "__main__":
    main()