RC: (add) python script files

This commit is contained in:
2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions

100
config.py Normal file
View File

@@ -0,0 +1,100 @@
"""
benchmark_v4/config.py
======================
All configuration in one place. Edit this file to change models,
weights, judge, and runtime settings.
"""
# ============================================
# MODELS
# ============================================
# Model tags are split along two axes: cohort (BASELINE vs. NEW) and
# mode (DIRECT vs. THINKING).
# NOTE(review): presumably "thinking" marks models that are run with a
# reasoning phase — confirm against the code that consumes these lists.

# Baseline cohort.
MODELS_BASELINE_DIRECT = ["granite4.1:8b", "qwen2.5-coder:14b"]
MODELS_BASELINE_THINKING = ["nemotron-3-nano:4b", "gemma4:e4b"]

# New cohort: both lists are empty for now; append model tags here.
MODELS_NEW_DIRECT = []
MODELS_NEW_THINKING = []
# ============================================
# JUDGE + EMBEDDINGS
# ============================================
# Model used as the judge.
JUDGE_MODEL = "qwen2.5:14b"
# Model used for embeddings.
EMBED_MODEL = "nomic-embed-text"
# Base URL of the Ollama server (local instance, port 11434).
OLLAMA_URL = "http://localhost:11434"

# ============================================
# RUNTIME
# ============================================
# Pause between tests, in seconds, for thermal normalization.
COOLDOWN_SECONDS = 20
# Poll nvidia-smi once every N tests; 0 means poll on every test.
GPU_POLL_EVERY = 3
# ============================================
# TEST WEIGHTS (must sum to 1.0)
# ============================================
# Maps each test name to its weight. The per-group percentages in the
# comments below are the subtotals of the weights listed under them.
TEST_WEIGHTS = {
    # Agent / tool reliability — 25%
    "tool_calling": 0.13,
    "multi_step_agent": 0.12,
    # Coding / infrastructure — 25%
    "coding": 0.10,
    "yaml_generation": 0.08,
    "artifact_mermaid": 0.04,
    "json_schema": 0.03,
    # RAG / context fidelity — 20%
    "rag": 0.07,
    "context_begin": 0.04,
    "context_middle": 0.05,
    "context_end": 0.04,
    # Structured outputs — 15%
    "structured": 0.08,
    "compression": 0.07,
    # Hallucination resistance — 10%
    "hallucination": 0.10,
    # Pure reasoning — 5%
    "reasoning": 0.03,
    "agent": 0.01,
    "math": 0.01,
}

# Sanity-check the table at import time. An explicit raise is used instead
# of `assert` because assert statements are removed when Python runs with
# -O, which would let a mis-edited weight table pass silently.
_weights_total = sum(TEST_WEIGHTS.values())
# The 0.001 tolerance absorbs floating-point rounding in the sum.
if not abs(_weights_total - 1.0) < 0.001:
    raise ValueError(f"Weights must sum to 1.0 (got {_weights_total:.4f})")
# Maps each category name to the tests that belong to it; used for
# category-level scores.
CATEGORIES = {
    "agent_tool": [
        "tool_calling",
        "multi_step_agent",
    ],
    "coding": [
        "coding",
        "yaml_generation",
        "artifact_mermaid",
        "json_schema",
    ],
    "rag_context": [
        "rag",
        "context_begin",
        "context_middle",
        "context_end",
    ],
    "structured": [
        "structured",
        "compression",
    ],
    "hallucination": [
        "hallucination",
    ],
    "reasoning": [
        "reasoning",
        "agent",
        "math",
    ],
}
# Compliance groups: each group name maps to the tests it covers.
# A test passes when its semantic_score is >= 8.
COMPLIANCE_GROUPS = {
    "json_valid": [
        "structured",
        "json_schema",
    ],
    "yaml_valid": [
        "yaml_generation",
    ],
    "tool_format": [
        "tool_calling",
        "multi_step_agent",
    ],
    "hallucination_free": [
        "hallucination",
    ],
}
# Input documents (paths are relative, under ./rag_samples/).
# NOTE(review): presumably resolved against the working directory the
# benchmark is launched from — confirm.
CONTEXT_FILE = "./rag_samples/context_test.md"
RAG_FILE = "./rag_samples/note1.md"

# Database file name.
DB_FILE = "benchmark_v4.db"