""" benchmark_v4/config.py ====================== All configuration in one place. Edit this file to change models, weights, judge, and runtime settings. """ # ============================================ # MODELS # ============================================ MODELS_BASELINE_DIRECT = [ "granite4.1:8b", "qwen2.5-coder:14b", ] MODELS_BASELINE_THINKING = [ "nemotron-3-nano:4b", "gemma4:e4b", ] MODELS_NEW_DIRECT = [] MODELS_NEW_THINKING = [] # ============================================ # JUDGE + EMBEDDINGS # ============================================ JUDGE_MODEL = "qwen2.5:14b" EMBED_MODEL = "nomic-embed-text" OLLAMA_URL = "http://localhost:11434" # ============================================ # RUNTIME # ============================================ COOLDOWN_SECONDS = 20 # between tests (thermal normalization) GPU_POLL_EVERY = 3 # poll nvidia-smi every N tests (0 = every test) # ============================================ # TEST WEIGHTS (must sum to 1.0) # ============================================ TEST_WEIGHTS = { # Agent / tool reliability — 25% "tool_calling": 0.13, "multi_step_agent": 0.12, # Coding / infrastructure — 25% "coding": 0.10, "yaml_generation": 0.08, "artifact_mermaid": 0.04, "json_schema": 0.03, # RAG / context fidelity — 20% "rag": 0.07, "context_begin": 0.04, "context_middle": 0.05, "context_end": 0.04, # Structured outputs — 15% "structured": 0.08, "compression": 0.07, # Hallucination resistance — 10% "hallucination": 0.10, # Pure reasoning — 5% "reasoning": 0.03, "agent": 0.01, "math": 0.01, } assert abs(sum(TEST_WEIGHTS.values()) - 1.0) < 0.001, "Weights must sum to 1.0" # Category groupings for category-level scores CATEGORIES = { "agent_tool": ["tool_calling", "multi_step_agent"], "coding": ["coding", "yaml_generation", "artifact_mermaid", "json_schema"], "rag_context": ["rag", "context_begin", "context_middle", "context_end"], "structured": ["structured", "compression"], "hallucination": ["hallucination"], "reasoning": ["reasoning", "agent", "math"], } # Compliance groups — pass if semantic_score >= 8 COMPLIANCE_GROUPS = { "json_valid": ["structured", "json_schema"], "yaml_valid": ["yaml_generation"], "tool_format": ["tool_calling", "multi_step_agent"], "hallucination_free": ["hallucination"], } # Context files CONTEXT_FILE = "./rag_samples/context_test.md" RAG_FILE = "./rag_samples/note1.md" # Database DB_FILE = "benchmark_v4.db"