"""
benchmark_v4/config.py
======================
All configuration in one place. Edit this file to change models,
weights, judge, and runtime settings.
"""

# ============================================
# MODELS
# ============================================

# Model identifiers are "name:tag" strings.
# NOTE(review): presumably Ollama model tags, given OLLAMA_URL further down
# in this file — confirm.
# NOTE(review): the DIRECT / THINKING split presumably selects how the runner
# drives each model; that logic lives in whatever consumes these lists — confirm.

# Baseline models, "direct" group.
MODELS_BASELINE_DIRECT = [
    "granite4.1:8b",
    "qwen2.5-coder:14b",
]

# Baseline models, "thinking" group.
MODELS_BASELINE_THINKING = [
    "nemotron-3-nano:4b",
    "gemma4:e4b",
]

# Newly added models, "direct" group (currently none).
MODELS_NEW_DIRECT = []

# Newly added models, "thinking" group (currently none).
MODELS_NEW_THINKING = []

# ============================================
# JUDGE + EMBEDDINGS
# ============================================

# Model tag used as the judge.
JUDGE_MODEL = "qwen2.5:14b"

# Model tag used for embeddings.
EMBED_MODEL = "nomic-embed-text"

# Base URL of the Ollama server (here: the local machine, port 11434).
OLLAMA_URL = "http://localhost:11434"

# ============================================
# RUNTIME
# ============================================

# Pause between consecutive tests, in seconds (thermal normalization).
COOLDOWN_SECONDS = 20

# Poll nvidia-smi once every N tests; 0 means poll on every test.
GPU_POLL_EVERY = 3

# ============================================
# TEST WEIGHTS (must sum to 1.0)
# ============================================

# Per-test weights, keyed by test name. The values are fractions of the total
# and must add up to 1.0; the percentage in each group comment is the sum of
# the weights listed under it.
TEST_WEIGHTS = {
    # Agent / tool reliability — 25%
    "tool_calling":     0.13,
    "multi_step_agent": 0.12,

    # Coding / infrastructure — 25%
    "coding":           0.10,
    "yaml_generation":  0.08,
    "artifact_mermaid": 0.04,
    "json_schema":      0.03,

    # RAG / context fidelity — 20%
    "rag":              0.07,
    "context_begin":    0.04,
    "context_middle":   0.05,
    "context_end":      0.04,

    # Structured outputs — 15%
    "structured":       0.08,
    "compression":      0.07,

    # Hallucination resistance — 10%
    "hallucination":    0.10,

    # Pure reasoning — 5%
    "reasoning":        0.03,
    "agent":            0.01,
    "math":             0.01,
}

# Import-time sanity check: the weights must sum to 1.0 (within 0.001 to
# absorb floating-point rounding).
#
# This is an explicit `raise` rather than an `assert` statement because
# asserts are removed when Python runs with -O / PYTHONOPTIMIZE, which would
# let a mis-edited weight table load silently. The exception type and message
# are kept identical to the previous assert so anything that relied on them
# still works. The condition is written as `not (... < tol)` instead of
# `... >= tol` so that a NaN total is rejected as well.
if not (abs(sum(TEST_WEIGHTS.values()) - 1.0) < 0.001):
    raise AssertionError("Weights must sum to 1.0")

# Category groupings used for category-level scores.
# Maps a category name to the list of test names that belong to it.
CATEGORIES = {
    "agent_tool": [
        "tool_calling",
        "multi_step_agent",
    ],
    "coding": [
        "coding",
        "yaml_generation",
        "artifact_mermaid",
        "json_schema",
    ],
    "rag_context": [
        "rag",
        "context_begin",
        "context_middle",
        "context_end",
    ],
    "structured": [
        "structured",
        "compression",
    ],
    "hallucination": [
        "hallucination",
    ],
    "reasoning": [
        "reasoning",
        "agent",
        "math",
    ],
}

# Compliance groups — pass if semantic_score >= 8.
# Maps a compliance group name to the list of test names counted under it.
COMPLIANCE_GROUPS = {
    "json_valid": [
        "structured",
        "json_schema",
    ],
    "yaml_valid": [
        "yaml_generation",
    ],
    "tool_format": [
        "tool_calling",
        "multi_step_agent",
    ],
    "hallucination_free": [
        "hallucination",
    ],
}

# Input documents for the context and RAG tests.
# NOTE(review): both paths are relative; presumably resolved against the
# working directory the benchmark is launched from — confirm.
CONTEXT_FILE = "./rag_samples/context_test.md"
RAG_FILE = "./rag_samples/note1.md"

# Database file name.
DB_FILE = "benchmark_v4.db"