RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/config.py
+++ b/config.py
@@ -0,0 +1,100 @@
+"""
+benchmark_v4/config.py
+======================
+All configuration in one place. Edit this file to change models,
+weights, judge, and runtime settings.
+"""
+
+# ============================================
+# MODELS — model name strings in four lists: {BASELINE, NEW} x {DIRECT, THINKING}
+# ============================================
+
+MODELS_BASELINE_DIRECT = [  # NOTE(review): names look like Ollama model tags (cf. OLLAMA_URL) — confirm
+    "granite4.1:8b",
+    "qwen2.5-coder:14b",
+
+]
+
+MODELS_BASELINE_THINKING = [  # NOTE(review): DIRECT vs THINKING is not defined in this file — check the consumer
+    "nemotron-3-nano:4b",
+    "gemma4:e4b",
+]
+
+MODELS_NEW_DIRECT   = []  # empty in this revision
+MODELS_NEW_THINKING = []  # empty in this revision
+
+# ============================================
+# JUDGE + EMBEDDINGS — model names plus the server URL
+# ============================================
+
+JUDGE_MODEL = "qwen2.5:14b"  # judge model name; NOTE(review): presumably grades other models' answers — confirm
+EMBED_MODEL = "nomic-embed-text"  # embedding model name
+OLLAMA_URL  = "http://localhost:11434"  # Ollama server base URL (local host, port 11434)
+
+# ============================================
+# RUNTIME — pacing between tests and GPU polling cadence
+# ============================================
+
+COOLDOWN_SECONDS = 20   # seconds to wait between tests (thermal normalization)
+GPU_POLL_EVERY   = 3    # poll nvidia-smi every N tests (0 = every test)
+
+# ============================================
+# TEST WEIGHTS (must sum to 1.0; checked right after the dict)
+# ============================================
+
+TEST_WEIGHTS = {  # test name -> weight; the same names are listed in CATEGORIES and COMPLIANCE_GROUPS
+    # Agent / tool reliability — 25% (0.13 + 0.12)
+    "tool_calling":     0.13,
+    "multi_step_agent": 0.12,
+
+    # Coding / infrastructure — 25% (0.10 + 0.08 + 0.04 + 0.03)
+    "coding":           0.10,
+    "yaml_generation":  0.08,
+    "artifact_mermaid": 0.04,
+    "json_schema":      0.03,
+
+    # RAG / context fidelity — 20% (0.07 + 0.04 + 0.05 + 0.04)
+    "rag":              0.07,
+    "context_begin":    0.04,
+    "context_middle":   0.05,
+    "context_end":      0.04,
+
+    # Structured outputs — 15% (0.08 + 0.07)
+    "structured":       0.08,
+    "compression":      0.07,
+
+    # Hallucination resistance — 10%
+    "hallucination":    0.10,
+
+    # Pure reasoning — 5% (0.03 + 0.01 + 0.01)
+    "reasoning":        0.03,
+    "agent":            0.01,
+    "math":             0.01,
+}
+
+assert abs(sum(TEST_WEIGHTS.values()) - 1.0) < 0.001, "Weights must sum to 1.0"
+
+# Category groupings for category-level scores; every TEST_WEIGHTS key appears in exactly one list
+CATEGORIES = {  # category name -> test names; mirrors the comment groups inside TEST_WEIGHTS
+    "agent_tool":    ["tool_calling", "multi_step_agent"],
+    "coding":        ["coding", "yaml_generation", "artifact_mermaid", "json_schema"],
+    "rag_context":   ["rag", "context_begin", "context_middle", "context_end"],
+    "structured":    ["structured", "compression"],
+    "hallucination": ["hallucination"],
+    "reasoning":     ["reasoning", "agent", "math"],
+}
+
+# Compliance groups — pass if semantic_score >= 8 (NOTE(review): the threshold is applied elsewhere, not in this file)
+COMPLIANCE_GROUPS = {  # group name -> test names (a subset of the TEST_WEIGHTS keys)
+    "json_valid":         ["structured", "json_schema"],
+    "yaml_valid":         ["yaml_generation"],
+    "tool_format":        ["tool_calling", "multi_step_agent"],
+    "hallucination_free": ["hallucination"],
+}
+
+# Context files — relative paths; NOTE(review): presumably resolved against the working directory — confirm
+CONTEXT_FILE = "./rag_samples/context_test.md"  # NOTE(review): presumably input for the context_* tests — confirm
+RAG_FILE     = "./rag_samples/note1.md"  # NOTE(review): presumably input for the rag test — confirm
+
+# Database file name (relative, no directory component)
+DB_FILE = "benchmark_v4.db"