179 lines
5.2 KiB
Python
179 lines
5.2 KiB
Python
"""
|
|
benchmark_v4/judge.py
|
|
=====================
|
|
Layer 2: Semantic judge (rubric-based).
|
|
Layer 3: Embedding similarity via Ollama nomic-embed-text.
|
|
|
|
Judge is only called when validator gives a partial score.
|
|
Embedding similarity used for RAG test.
|
|
"""
|
|
|
|
import re
|
|
import json
|
|
import math
|
|
import requests
|
|
from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
|
|
from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
|
|
|
|
|
|
# ============================================
|
|
# JUDGE PROMPT TEMPLATE
|
|
# ============================================
|
|
|
|
# Prompt sent to the judge model. It is filled with str.format() using the
# fields test_name, rubric, ground_truth, prompt and output. The doubled
# braces on the last line are format escapes that render as a literal JSON
# object showing the judge the exact reply shape expected.
JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.

ABSOLUTE RULES:
1. Judge ONLY what the prompt asked for. Nothing else.
2. NEVER penalise for missing information not requested.
3. NEVER penalise for being concise — brevity is correct.
4. NEVER invent requirements. Only the prompt counts.
5. Minimal correct answers score 8-10.
6. Extra unnecessary content scores lower, not higher.

SCORING:
10 = perfect | 8 = correct, trivial issue | 6 = mostly correct
4 = partial | 2 = major error | 0 = wrong or hallucination

TEST: {test_name}
WHAT TO JUDGE: {rubric}
GROUND TRUTH: {ground_truth}

PROMPT (what was asked):
{prompt}

OUTPUT (what model answered):
{output}

Return ONLY this JSON on one line, nothing else:
{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
|
|
|
|
|
|
# ============================================
|
|
# JUDGE WARMUP
|
|
# ============================================
|
|
|
|
def warmup_judge():
    """Send a minimal request so the judge model is loaded before real use.

    Posts a one-token generation request ("num_predict": 1) for JUDGE_MODEL
    to the ``/api/generate`` endpoint under OLLAMA_URL. The response is
    ignored. Any exception is printed instead of raised, so a failed warmup
    never aborts the caller.
    """
    print(f" Warming up judge: {JUDGE_MODEL}")
    try:
        warmup_payload = {
            "model": JUDGE_MODEL,
            "prompt": "hi",
            "stream": False,
            # Cap generation at a single token: only the model load matters.
            "options": {"num_predict": 1},
        }
        requests.post(
            f"{OLLAMA_URL}/api/generate",
            json=warmup_payload,
            timeout=120,
        )
    except Exception as e:
        print(f" Judge warmup error: {e}")
|
|
|
|
|
|
# ============================================
|
|
# JUDGE CALL
|
|
# ============================================
|
|
|
|
# Patterns used to pull a score out of the judge's reply, from strictest to
# loosest. Compiled once at import time instead of on every call.
_JUDGE_JSON_RE = re.compile(r'\{[^{}]*"semantic_score"[^{}]*\}')
_JUDGE_SCORE_RE = re.compile(r'"semantic_score"\s*:\s*(\d+)')
_JUDGE_REASON_RE = re.compile(r'"reason"\s*:\s*"([^"]{1,80})"')
_JUDGE_LOOSE_SCORE_RE = re.compile(r'semantic_score[^\d]*(\d+)', re.IGNORECASE)


def _clamp_score(value):
    """Convert *value* to an int and clamp it to the 0-10 scoring scale."""
    return max(0, min(10, int(value)))


def _parse_judge_response(raw):
    """Extract ``(score, reason)`` from the judge's raw reply text.

    Three strategies are tried in order: a flat JSON object containing
    "semantic_score", then separate regex matches for the quoted score and
    reason fields, then any digits following the text "semantic_score".
    Returns None when no strategy yields a score.
    """
    json_match = _JUDGE_JSON_RE.search(raw)
    if json_match:
        try:
            parsed = json.loads(json_match.group())
            score = _clamp_score(parsed.get("semantic_score", 5))
            return score, str(parsed.get("reason", ""))[:80]
        except (json.JSONDecodeError, ValueError, TypeError, OverflowError):
            # TypeError covers null/list scores and OverflowError covers
            # Infinity. They must fall through to the regex fallbacks
            # rather than abort the whole parse.
            pass

    score_match = _JUDGE_SCORE_RE.search(raw)
    if score_match:
        reason_match = _JUDGE_REASON_RE.search(raw)
        reason = reason_match.group(1) if reason_match else "extracted"
        return _clamp_score(score_match.group(1)), reason

    loose_match = _JUDGE_LOOSE_SCORE_RE.search(raw)
    if loose_match:
        return _clamp_score(loose_match.group(1)), "score extracted"

    return None


def call_judge(test_name, prompt, output):
    """
    Call the LLM judge with a strict rubric.

    The rubric and ground truth are looked up by ``test_name`` (with
    DEFAULT_RUBRIC and a generic ground-truth string as defaults). The prompt
    is truncated to 500 characters and the output to 1500 before being put
    into JUDGE_PROMPT_TEMPLATE. A ``None`` prompt or output is treated as an
    empty string.

    Returns (semantic_score 0-10, reason str).
    Falls back to midpoint (5) on timeout, request/response errors, or an
    unparseable reply, to avoid corrupting results. Errors are printed so
    the cause is not lost.
    """
    rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")

    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        test_name=test_name,
        rubric=rubric,
        ground_truth=ground_truth,
        prompt=(prompt or "")[:500],
        output=(output or "")[:1500],
    )

    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
            timeout=180,
        )
        raw = response.json().get("response", "").strip()

        result = _parse_judge_response(raw)
        if result is not None:
            return result

        print(f" Judge unparseable: {raw[:80]}")
        return 5, "judge unparseable — midpoint"

    except requests.exceptions.Timeout:
        return 5, "judge timeout — midpoint"
    except Exception as e:
        # Best-effort boundary: never raise into the benchmark loop, but
        # report what went wrong instead of discarding the exception.
        print(f" Judge call error: {e}")
        return 5, "judge error — midpoint"
|
|
|
|
|
|
# ============================================
|
|
# EMBEDDING SIMILARITY
|
|
# ============================================
|
|
|
|
def get_embedding(text):
    """Fetch an embedding vector for *text* from the Ollama embed endpoint.

    The text is truncated to 2000 characters and sent with EMBED_MODEL to
    ``/api/embed``. The first entry of the "embeddings" field in the JSON
    reply is returned. Any exception (request failure, bad JSON, missing
    entry) results in an empty list.
    """
    try:
        embed_request = {"model": EMBED_MODEL, "input": text[:2000]}
        reply = requests.post(
            f"{OLLAMA_URL}/api/embed",
            json=embed_request,
            timeout=30,
        )
        vectors = reply.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        return []
|
|
|
|
|
|
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0.0 when either vector is empty/falsy, when the lengths differ,
    or when the product of the two magnitudes is zero.
    """
    if not (v1 and v2):
        return 0.0
    if len(v1) != len(v2):
        return 0.0

    dot_product = sum(x * y for x, y in zip(v1, v2))
    norm_v1 = math.sqrt(sum(x ** 2 for x in v1))
    norm_v2 = math.sqrt(sum(y ** 2 for y in v2))
    norm_product = norm_v1 * norm_v2

    if not norm_product:
        return 0.0
    return dot_product / norm_product
|
|
|
|
|
|
def embedding_score(text, reference):
    """
    Map the embedding similarity of *text* and *reference* to a 0-10 score.

    Returns 0 immediately when either argument is empty/falsy. Otherwise
    *text* is cut to 1000 characters, both strings are embedded with
    get_embedding, and their cosine similarity is bucketed with a stepped
    (non-linear) mapping: >=0.92 -> 10, >=0.85 -> 8, >=0.78 -> 6,
    >=0.70 -> 4, >=0.60 -> 2, anything lower -> 0.
    """
    if not (text and reference):
        return 0

    text_vector = get_embedding(text[:1000])
    reference_vector = get_embedding(reference)
    similarity = cosine_similarity(text_vector, reference_vector)

    # (minimum similarity, score) pairs, highest threshold first.
    score_bands = (
        (0.92, 10),
        (0.85, 8),
        (0.78, 6),
        (0.70, 4),
        (0.60, 2),
    )
    for threshold, points in score_bands:
        if similarity >= threshold:
            return points
    return 0
|