RC: (add) python script files
This commit is contained in:
299
main.py
Normal file
299
main.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
benchmark_v4/main.py
|
||||
====================
|
||||
Entry point. CLI argument parsing and orchestration.
|
||||
|
||||
Usage:
|
||||
python3 main.py # run all configured models (baseline + new)
|
||||
python3 main.py --test-all # auto-discover and test all ollama models
|
||||
python3 main.py --mode baseline # baseline only
|
||||
python3 main.py --mode new # new models only
|
||||
python3 main.py --model granite4.1:8b # single model
|
||||
python3 main.py --runs 3 # variance analysis
|
||||
python3 main.py --no-cooldown # fast run (no thermal wait)
|
||||
python3 main.py --report # show reports of latest run
|
||||
python3 main.py --report --report-best # show best run per model
|
||||
python3 main.py --export # export CSV from DB
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import subprocess
|
||||
import requests
|
||||
|
||||
from config import (
|
||||
MODELS_BASELINE_DIRECT, MODELS_BASELINE_THINKING,
|
||||
MODELS_NEW_DIRECT, MODELS_NEW_THINKING,
|
||||
JUDGE_MODEL, EMBED_MODEL, DB_FILE, OLLAMA_URL,
|
||||
)
|
||||
from storage import init_db, load_latest_runs, export_summary_csv
|
||||
from prompts import build_all_prompts
|
||||
from runner import run_benchmark
|
||||
from reporting import (
|
||||
print_weights, print_comparison,
|
||||
print_full_ranking, print_category_breakdown,
|
||||
print_compliance_table, run_report
|
||||
)
|
||||
|
||||
try:
|
||||
import yaml
|
||||
YAML_AVAILABLE = True
|
||||
except ImportError:
|
||||
YAML_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
FUZZY_AVAILABLE = True
|
||||
except ImportError:
|
||||
FUZZY_AVAILABLE = False
|
||||
|
||||
|
||||
# ============================================
|
||||
# THINKING MODEL DETECTION
|
||||
# ============================================
|
||||
|
||||
def detect_thinking_model(model_name):
    """
    Report whether Ollama lists 'thinking' among a model's capabilities.

    Issues a single POST to ``{OLLAMA_URL}/api/show`` for ``model_name``
    (10 second timeout) and looks for the string 'thinking' in the
    'capabilities' entry of the JSON reply.

    Returns:
        True when 'thinking' is present in the reported capabilities.
        False when it is absent, when the entry is missing, or when any
        exception is raised along the way (detection is best-effort).
    """
    payload = {"name": model_name}
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/show", json=payload, timeout=10
        )
        capabilities = response.json().get("capabilities", [])
        return "thinking" in capabilities
    except Exception:
        # Best-effort probe: any failure is treated as "not a thinking model".
        return False
|
||||
|
||||
|
||||
# ============================================
|
||||
# MAIN
|
||||
# ============================================
|
||||
|
||||
def _parse_ollama_list(stdout, skip):
    """Extract model tags from the text printed by ``ollama list``.

    The first line is treated as a header and ignored; the first
    whitespace-separated token of every other non-empty line is taken as
    the model tag.

    Args:
        stdout: Captured standard output of ``ollama list``.
        skip: Substrings; a tag containing any of them is dropped.

    Returns:
        List of model tags in the order they were printed.
    """
    # Discard empty/None entries: "" is a substring of every tag and would
    # filter out all models, and None would raise TypeError in the `in` test.
    needles = [s for s in skip if s]
    models = []
    for line in stdout.strip().splitlines()[1:]:
        parts = line.split()
        if not parts:
            continue
        tag = parts[0]
        if not any(needle in tag for needle in needles):
            models.append(tag)
    return models


def _discover_ollama_models():
    """Run ``ollama list`` and return the benchmarkable model tags.

    The embed model, the judge model and "nomic-embed-text" are excluded.

    Returns:
        A (possibly empty) list of model tags, or None when ``ollama list``
        could not be started or exited with a non-zero status. In that case
        a diagnostic has already been printed to stderr.
    """
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True, text=True,
        )
    except OSError as exc:
        # Typically FileNotFoundError: the ollama binary is not on PATH.
        print(f"Could not run 'ollama list': {exc}", file=sys.stderr)
        return None

    if result.returncode != 0:
        print(
            f"'ollama list' failed (exit {result.returncode}): "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )
        return None

    return _parse_ollama_list(
        result.stdout, [EMBED_MODEL, JUDGE_MODEL, "nomic-embed-text"]
    )


def main():
    """Parse CLI arguments and orchestrate a benchmark or report run.

    Order of precedence after argument parsing and ``init_db()``:

    1. ``--report`` / ``--report-best``: print the rankings, export the
       summary CSV and return without running any model.
    2. ``--export``: export the summary CSV and return.
    3. ``--test-all``: benchmark every model reported by ``ollama list``
       (known baseline models first), then print the comparison and report.
    4. ``--model``: benchmark one model; its label is "thinking" when
       ``--thinking`` is given or the capability probe says so.
    5. Otherwise ``--mode`` selects the configured baseline and/or new
       model groups.

    Exits with status 2 (via ``parser.error``) when ``--runs`` is below 1.
    """
    parser = argparse.ArgumentParser(
        description="LLM Benchmark V4 — Modular, SQLite-backed",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 main.py                          # full run (baseline + new models)
  python3 main.py --test-all               # auto-discover all ollama models
  python3 main.py --model granite4.1:8b    # single model
  python3 main.py --mode new               # new models only
  python3 main.py --runs 3                 # variance analysis (3 runs)
  python3 main.py --no-cooldown            # fast run, no thermal wait
  python3 main.py --report                 # show latest run reports
  python3 main.py --report --report-best   # show best run per model
  python3 main.py --export                 # export CSV from DB
"""
    )

    parser.add_argument(
        "--test-all", action="store_true", default=False,
        help="Auto-discover and benchmark all models in ollama list"
    )
    parser.add_argument(
        "--mode", choices=["baseline", "new", "all"],
        default="all",
        help="Which model group to run (default: all)"
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Run a single model by Ollama tag"
    )
    parser.add_argument(
        "--thinking", action="store_true", default=False,
        help="Override: mark single --model as thinking type"
    )
    parser.add_argument(
        "--runs", type=int, default=1,
        help="Number of runs per model for variance analysis (default: 1)"
    )
    parser.add_argument(
        "--no-cooldown", action="store_true", default=False,
        help="Skip cooldown between tests (faster but no thermal normalization)"
    )
    parser.add_argument(
        "--report", action="store_true", default=False,
        help="Show ranking reports from DB without running any models"
    )
    parser.add_argument(
        "--report-best", action="store_true", default=False,
        help="Show best run per model instead of latest (use with --report)"
    )
    parser.add_argument(
        "--export", action="store_true", default=False,
        help="Export latest results to benchmark_summary.csv and exit"
    )

    args = parser.parse_args()

    # Reject meaningless run counts before touching the database.
    if args.runs < 1:
        parser.error("--runs must be at least 1")

    # Init database
    init_db()

    # ── Report / export only modes ─────────────────────────────────
    # Must come before benchmark logic
    if args.report or args.report_best:
        print_full_ranking(best=args.report_best)
        print_category_breakdown()
        print_compliance_table()
        export_summary_csv()
        return

    if args.export:
        export_summary_csv()
        return

    # ── Setup ──────────────────────────────────────────────────────
    existing_baseline = load_latest_runs(is_baseline=True)
    all_prompts = build_all_prompts()

    print("\nLLM Benchmark V4")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Embed: {EMBED_MODEL}")
    print(f"DB: {DB_FILE}")
    print(f"Runs: {args.runs}")
    print(f"Fuzzy: {FUZZY_AVAILABLE} | YAML: {YAML_AVAILABLE}")
    print(f"Cooldown:{'disabled' if args.no_cooldown else 'enabled'}")
    print(f"Previous baseline runs: {len(existing_baseline)}")

    print_weights()

    # Run ids produced during this invocation, fed to print_comparison().
    all_new_run_ids = []

    def _run(models, label, baseline):
        """Benchmark `models` and record the resulting run ids."""
        ids = run_benchmark(
            models=models,
            label=label,
            is_baseline=baseline,
            all_prompts=all_prompts,
            num_runs=args.runs,
            no_cooldown=args.no_cooldown,
        )
        all_new_run_ids.extend(ids)

    def _run_group(title, models, label, baseline):
        """Print a banner and benchmark a configured group; no-op if empty."""
        if not models:
            return
        print("\n" + "=" * 50)
        print(f" {title}")
        print("=" * 50)
        _run(models, label, baseline)

    # ── Auto-discover all Ollama models ────────────────────────────
    if args.test_all:
        discovered = _discover_ollama_models()
        if discovered is None:
            # ollama could not be queried; the reason was already printed.
            return
        if not discovered:
            print("No models found in ollama list.")
            return

        # Built once instead of on every loop iteration.
        known_baseline = MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING

        # Auto-detect thinking capability for each model
        print("\nDetecting model capabilities...")
        model_info = {}
        for m in discovered:
            is_thinking = detect_thinking_model(m)
            is_baseline = m in known_baseline
            model_info[m] = {
                "thinking": is_thinking,
                "is_baseline": is_baseline,
                "label": "thinking" if is_thinking else "direct",
            }
            tag = "🧠" if is_thinking else "⚡"
            base = "★" if is_baseline else " "
            print(f" {tag}{base} {m}")

        print()

        # Run baseline models first, then new
        baseline_models = [m for m in discovered if model_info[m]["is_baseline"]]
        new_models = [m for m in discovered if not model_info[m]["is_baseline"]]

        if baseline_models:
            print("=" * 50)
            print(" KNOWN BASELINE MODELS")
            print("=" * 50)
            # One call per model because each model carries its own label.
            for m in baseline_models:
                _run([m], model_info[m]["label"], True)

        if new_models:
            print("=" * 50)
            print(" NEW / UNKNOWN MODELS")
            print("=" * 50)
            for m in new_models:
                _run([m], model_info[m]["label"], False)

        print_comparison(all_new_run_ids, existing_baseline)
        run_report()
        return

    # ── Single model mode ──────────────────────────────────────────
    if args.model:
        # Auto-detect thinking unless --thinking flag explicitly set
        if args.thinking or detect_thinking_model(args.model):
            label = "thinking"
        else:
            label = "direct"

        is_baseline = args.model in (
            MODELS_BASELINE_DIRECT + MODELS_BASELINE_THINKING
        )
        print(f"\nSingle model: {args.model} [{label}] baseline={is_baseline}")
        _run([args.model], label, is_baseline)

    # ── Configured model groups ────────────────────────────────────
    else:
        if args.mode in ("baseline", "all"):
            _run_group("BASELINE — DIRECT", MODELS_BASELINE_DIRECT, "direct", True)
            _run_group("BASELINE — THINKING", MODELS_BASELINE_THINKING, "thinking", True)

        if args.mode in ("new", "all"):
            _run_group("NEW — DIRECT", MODELS_NEW_DIRECT, "direct", False)
            _run_group("NEW — THINKING", MODELS_NEW_THINKING, "thinking", False)

    # ── Final reports ──────────────────────────────────────────────
    print_comparison(all_new_run_ids, existing_baseline)
    run_report()
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user