RC: (add) python script files

This commit is contained in:
2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions

178
judge.py Normal file
View File

@@ -0,0 +1,178 @@
"""
benchmark_v4/judge.py
=====================
Layer 2: Semantic judge (rubric-based).
Layer 3: Embedding similarity via Ollama nomic-embed-text.
Judge is only called when validator gives a partial score.
Embedding similarity used for RAG test.
"""
import re
import json
import math
import requests
from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
# ============================================
# JUDGE PROMPT TEMPLATE
# ============================================
# Prompt sent to the judge model. It is filled in with str.format() by
# call_judge(), which supplies the named fields {test_name}, {rubric},
# {ground_truth}, {prompt} and {output}.
# The doubled braces on the last line are str.format() escapes: they render
# as single literal braces so the judge sees a plain JSON example.
JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.
ABSOLUTE RULES:
1. Judge ONLY what the prompt asked for. Nothing else.
2. NEVER penalise for missing information not requested.
3. NEVER penalise for being concise — brevity is correct.
4. NEVER invent requirements. Only the prompt counts.
5. Minimal correct answers score 8-10.
6. Extra unnecessary content scores lower, not higher.
SCORING:
10 = perfect | 8 = correct, trivial issue | 6 = mostly correct
4 = partial | 2 = major error | 0 = wrong or hallucination
TEST: {test_name}
WHAT TO JUDGE: {rubric}
GROUND TRUTH: {ground_truth}
PROMPT (what was asked):
{prompt}
OUTPUT (what model answered):
{output}
Return ONLY this JSON on one line, nothing else:
{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
# ============================================
# JUDGE WARMUP
# ============================================
def warmup_judge():
    """Send a minimal request to the judge model so it is loaded up front.

    Posts a one-word prompt with ``num_predict`` set to 1 to the Ollama
    ``/api/generate`` endpoint (presumably enough to make the server load
    the model — confirm against the Ollama docs). The response body is
    ignored.

    This is best-effort: any failure is printed and swallowed so that a
    failed warmup never aborts the benchmark run. Returns ``None``.
    """
    print(f" Warming up judge: {JUDGE_MODEL}")
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": JUDGE_MODEL,
                "prompt": "hi",
                "stream": False,
                "options": {"num_predict": 1},
            },
            timeout=120,
        )
        # requests does not raise on HTTP 4xx/5xx by itself; without this an
        # unknown model or a server-side error would pass silently.
        response.raise_for_status()
    except Exception as e:
        print(f" Judge warmup error: {e}")
# ============================================
# JUDGE CALL
# ============================================
def _parse_judge_response(raw):
    """Pull ``(score, reason)`` out of the judge's raw reply, or return None.

    Three strategies are tried, strictest first:

    1. A flat JSON object containing ``"semantic_score"`` parsed with
       ``json.loads``.
    2. A regex for ``"semantic_score": <digits>`` (plus an optional
       ``"reason"`` string of at most 80 characters).
    3. Any digits following the text ``semantic_score`` (case-insensitive).

    The score is always clamped to the 0-10 range.
    """
    obj_match = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
    if obj_match:
        try:
            parsed = json.loads(obj_match.group())
            score = max(0, min(10, int(parsed.get("semantic_score", 5))))
            reason = str(parsed.get("reason", ""))[:80]
            return score, reason
        except (json.JSONDecodeError, ValueError, TypeError, OverflowError):
            # TypeError: score was null / a list; OverflowError: Infinity.
            # Either way, fall through to the regex strategies instead of
            # giving up on an otherwise recoverable reply.
            pass

    score_match = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
    if score_match:
        reason_match = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
        score = max(0, min(10, int(score_match.group(1))))
        reason = reason_match.group(1) if reason_match else "extracted"
        return score, reason

    loose_match = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
    if loose_match:
        return max(0, min(10, int(loose_match.group(1)))), "score extracted"

    return None


def call_judge(test_name, prompt, output):
    """Ask the LLM judge to score ``output`` against ``prompt``.

    Args:
        test_name: Key used to look up the rubric in ``JUDGE_RUBRICS``
            (falling back to ``DEFAULT_RUBRIC``) and the ground truth in
            ``GROUND_TRUTHS``.
        prompt: The prompt given to the model under test; only the first
            500 characters are sent to the judge. ``None`` is treated as
            an empty string.
        output: The model's answer; only the first 1500 characters are
            sent to the judge. ``None`` is treated as an empty string.

    Returns:
        A ``(semantic_score, reason)`` tuple where ``semantic_score`` is an
        int in 0-10. On timeout, request/parsing error, or an unparseable
        reply the midpoint score 5 is returned with an explanatory reason,
        so a judge failure does not skew results toward either extreme.
    """
    rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        test_name=test_name,
        rubric=rubric,
        ground_truth=ground_truth,
        # Truncate to keep the judge prompt bounded; tolerate None inputs.
        prompt=(prompt or "")[:500],
        output=(output or "")[:1500],
    )
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
            timeout=180,
        )
        raw = response.json().get("response", "").strip()
        result = _parse_judge_response(raw)
        if result is not None:
            return result
        print(f" Judge unparseable: {raw[:80]}")
        return 5, "judge unparseable — midpoint"
    except requests.exceptions.Timeout:
        return 5, "judge timeout — midpoint"
    except Exception as e:
        # Surface the cause instead of discarding it; the returned reason
        # string is kept unchanged for callers that record it.
        print(f" Judge error: {type(e).__name__}: {e}")
        return 5, "judge error — midpoint"
# ============================================
# EMBEDDING SIMILARITY
# ============================================
def get_embedding(text):
    """Fetch the embedding vector for ``text`` from Ollama's /api/embed.

    Only the first 2000 characters of ``text`` are sent, using the model
    named by ``EMBED_MODEL``. Returns the first entry of the response's
    ``"embeddings"`` list, or an empty list if that key is missing or if
    anything at all goes wrong (network error, bad JSON, empty result).
    """
    try:
        payload = {"model": EMBED_MODEL, "input": text[:2000]}
        reply = requests.post(
            f"{OLLAMA_URL}/api/embed",
            json=payload,
            timeout=30,
        )
        vectors = reply.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        # Best-effort: callers treat an empty vector as "no embedding".
        return []
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Yields 0.0 when either vector is empty/None, when their lengths differ,
    or when the product of their magnitudes is zero.
    """
    comparable = bool(v1) and bool(v2) and len(v1) == len(v2)
    if not comparable:
        return 0.0

    numerator = sum(x * y for x, y in zip(v1, v2))
    norm_left = math.sqrt(sum(x**2 for x in v1))
    norm_right = math.sqrt(sum(y**2 for y in v2))
    denominator = norm_left * norm_right

    if not denominator:
        # At least one vector is all zeros; similarity is undefined.
        return 0.0
    return numerator / denominator
def embedding_score(text, reference):
    """Map the embedding similarity of ``text`` and ``reference`` to 0-10.

    ``text`` is cut to its first 1000 characters before being embedded.
    The cosine similarity of the two embeddings is bucketed into steps
    (10, 8, 6, 4, 2, 0) rather than scaled linearly. Returns 0 right away
    if either argument is empty; a failed embedding lookup also ends up
    as 0 because the similarity of an empty vector is 0.0.
    """
    if not (text and reference):
        return 0

    similarity = cosine_similarity(
        get_embedding(text[:1000]),
        get_embedding(reference),
    )

    # (minimum similarity, awarded score), checked from strictest down.
    steps = (
        (0.92, 10),
        (0.85, 8),
        (0.78, 6),
        (0.70, 4),
        (0.60, 2),
    )
    for floor, points in steps:
        if similarity >= floor:
            return points
    return 0