RC: (add) python script files
This commit is contained in:
100
config.py
Normal file
100
config.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
benchmark_v4/config.py
======================
All configuration in one place. Edit this file to change models,
weights, judge, and runtime settings.
"""
|
||||
|
||||
# ============================================
# MODELS
# ============================================

# Each entry is a plain model-identifier string in "name:tag" form.
# NOTE(review): these look like Ollama model tags (the file also defines
# OLLAMA_URL) -- confirm against the code that consumes these lists.

# Baseline models in the "direct" group.
MODELS_BASELINE_DIRECT = [
    "granite4.1:8b",
    "qwen2.5-coder:14b",
]

# Baseline models in the "thinking" group.
MODELS_BASELINE_THINKING = [
    "nemotron-3-nano:4b",
    "gemma4:e4b",
]

# Slots for newly added models; both lists are currently empty.
MODELS_NEW_DIRECT = []
MODELS_NEW_THINKING = []
|
||||
|
||||
# ============================================
# JUDGE + EMBEDDINGS
# ============================================

# Model identifier used as the judge.
JUDGE_MODEL = "qwen2.5:14b"
# Model identifier used for embeddings.
EMBED_MODEL = "nomic-embed-text"
# Base URL of the Ollama server (local host, port 11434).
OLLAMA_URL = "http://localhost:11434"
|
||||
|
||||
# ============================================
# RUNTIME
# ============================================

COOLDOWN_SECONDS = 20  # between tests (thermal normalization)
GPU_POLL_EVERY = 3  # poll nvidia-smi every N tests (0 = every test)
|
||||
|
||||
# ============================================
# TEST WEIGHTS (must sum to 1.0)
# ============================================

# Maps each test name to its fractional weight. The percentage in each group
# comment is the sum of the weights listed under it.
TEST_WEIGHTS = {
    # Agent / tool reliability — 25%
    "tool_calling": 0.13,
    "multi_step_agent": 0.12,

    # Coding / infrastructure — 25%
    "coding": 0.10,
    "yaml_generation": 0.08,
    "artifact_mermaid": 0.04,
    "json_schema": 0.03,

    # RAG / context fidelity — 20%
    "rag": 0.07,
    "context_begin": 0.04,
    "context_middle": 0.05,
    "context_end": 0.04,

    # Structured outputs — 15%
    "structured": 0.08,
    "compression": 0.07,

    # Hallucination resistance — 10%
    "hallucination": 0.10,

    # Pure reasoning — 5%
    "reasoning": 0.03,
    "agent": 0.01,
    "math": 0.01,
}

# Fail fast at import time if the weights drift away from 1.0 (tolerance
# 0.001 absorbs float rounding). An explicit raise is used rather than an
# `assert` statement so the check is not stripped under `python -O`; the
# exception type and message are the same as the assert would produce.
if not abs(sum(TEST_WEIGHTS.values()) - 1.0) < 0.001:
    raise AssertionError("Weights must sum to 1.0")
|
||||
|
||||
# Category groupings for category-level scores.
# Maps a category name to the list of test names that belong to it.
CATEGORIES = {
    "agent_tool": ["tool_calling", "multi_step_agent"],
    "coding": ["coding", "yaml_generation", "artifact_mermaid", "json_schema"],
    "rag_context": ["rag", "context_begin", "context_middle", "context_end"],
    "structured": ["structured", "compression"],
    "hallucination": ["hallucination"],
    "reasoning": ["reasoning", "agent", "math"],
}
|
||||
|
||||
# Compliance groups — pass if semantic_score >= 8
# Maps a compliance-group name to the list of test names it covers.
COMPLIANCE_GROUPS = {
    "json_valid": ["structured", "json_schema"],
    "yaml_valid": ["yaml_generation"],
    "tool_format": ["tool_calling", "multi_step_agent"],
    "hallucination_free": ["hallucination"],
}
|
||||
|
||||
# Context files (relative paths, both under ./rag_samples/)
CONTEXT_FILE = "./rag_samples/context_test.md"
RAG_FILE = "./rag_samples/note1.md"

# Database
# NOTE(review): the name suggests an SQLite file, but the engine is not
# visible from this module -- confirm against the code that opens it.
DB_FILE = "benchmark_v4.db"
|
||||
Reference in New Issue
Block a user