RC: (add) python script files
This commit is contained in:
178
judge.py
Normal file
178
judge.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
benchmark_v4/judge.py
|
||||
=====================
|
||||
Layer 2: Semantic judge (rubric-based).
|
||||
Layer 3: Embedding similarity via Ollama nomic-embed-text.
|
||||
|
||||
Judge is only called when validator gives a partial score.
|
||||
Embedding similarity used for RAG test.
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
import math
|
||||
import requests
|
||||
from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
|
||||
from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
|
||||
|
||||
|
||||
# ============================================
|
||||
# JUDGE PROMPT TEMPLATE
|
||||
# ============================================
|
||||
|
||||
# Prompt sent to the judge model. It is filled with str.format() using the
# fields test_name, rubric, ground_truth, prompt and output; the doubled
# braces on the last line are format escapes that render as a literal JSON
# object example.
JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.

ABSOLUTE RULES:
1. Judge ONLY what the prompt asked for. Nothing else.
2. NEVER penalise for missing information not requested.
3. NEVER penalise for being concise — brevity is correct.
4. NEVER invent requirements. Only the prompt counts.
5. Minimal correct answers score 8-10.
6. Extra unnecessary content scores lower, not higher.

SCORING:
10 = perfect | 8 = correct, trivial issue | 6 = mostly correct
4 = partial | 2 = major error | 0 = wrong or hallucination

TEST: {test_name}
WHAT TO JUDGE: {rubric}
GROUND TRUTH: {ground_truth}

PROMPT (what was asked):
{prompt}

OUTPUT (what model answered):
{output}

Return ONLY this JSON on one line, nothing else:
{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
|
||||
|
||||
|
||||
# ============================================
|
||||
# JUDGE WARMUP
|
||||
# ============================================
|
||||
|
||||
def warmup_judge():
    """Ping the judge model with a one-token request so it is loaded up front.

    Best-effort: any failure is printed and otherwise ignored, and nothing
    is returned.
    """
    print(f" Warming up judge: {JUDGE_MODEL}")
    try:
        # num_predict=1 caps generation at a single token; the reply is unused.
        warmup_payload = {
            "model": JUDGE_MODEL,
            "prompt": "hi",
            "stream": False,
            "options": {"num_predict": 1},
        }
        requests.post(
            f"{OLLAMA_URL}/api/generate",
            json=warmup_payload,
            timeout=120,
        )
    except Exception as err:
        print(f" Judge warmup error: {err}")
|
||||
|
||||
|
||||
# ============================================
|
||||
# JUDGE CALL
|
||||
# ============================================
|
||||
|
||||
def call_judge(test_name, prompt, output):
    """
    Call the LLM judge with a strict rubric.

    The rubric and ground truth are looked up by ``test_name`` (falling back
    to DEFAULT_RUBRIC and a generic ground-truth note). ``prompt`` is
    truncated to 500 characters and ``output`` to 1500 before being placed
    in the judge prompt; ``None`` is treated as an empty string.

    Returns (semantic_score 0-10, reason str).
    Falls back to midpoint (5) on timeout, request/transport error, or an
    unparseable reply, to avoid corrupting results. Never raises for those
    failures.
    """
    rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")

    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        test_name=test_name,
        rubric=rubric,
        ground_truth=ground_truth,
        # `or ""` keeps a missing prompt/output from crashing the slice.
        prompt=(prompt or "")[:500],
        output=(output or "")[:1500],
    )

    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
            timeout=180,
        )
        raw = response.json().get("response", "").strip()

        result = _parse_judge_response(raw)
        if result is not None:
            return result

        print(f" Judge unparseable: {raw[:80]}")
        return 5, "judge unparseable — midpoint"

    except requests.exceptions.Timeout:
        return 5, "judge timeout — midpoint"
    except Exception as e:
        # Deliberate catch-all: a judge failure must not abort the benchmark.
        # Surface the cause so the midpoint score can be diagnosed later.
        print(f" Judge error: {e}")
        return 5, "judge error — midpoint"


def _clamp_score(value):
    """Convert *value* to int and clamp it into the 0-10 scoring range."""
    return max(0, min(10, int(value)))


def _parse_judge_response(raw):
    """Extract (score, reason) from the judge's raw reply; None if no score is found."""
    # 1. Preferred: a flat JSON object that contains "semantic_score".
    m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
    if m:
        try:
            parsed = json.loads(m.group())
            score = _clamp_score(parsed.get("semantic_score", 5))
            reason = str(parsed.get("reason", ""))[:80]
        except (ValueError, TypeError, OverflowError):
            # ValueError covers json.JSONDecodeError and non-numeric strings;
            # TypeError covers null/list/object scores; OverflowError covers
            # Infinity. Fall through to the regex fallbacks instead of failing.
            pass
        else:
            return score, reason

    # 2. Fallback: pull the quoted key and its numeric value out of broken JSON.
    sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
    rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
    if sm:
        reason = rm.group(1) if rm else "extracted"
        return _clamp_score(sm.group(1)), reason

    # 3. Last resort: first number that follows the words "semantic_score".
    last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
    if last:
        return _clamp_score(last.group(1)), "score extracted"

    return None
|
||||
|
||||
|
||||
# ============================================
|
||||
# EMBEDDING SIMILARITY
|
||||
# ============================================
|
||||
|
||||
def get_embedding(text):
    """Fetch an embedding vector for *text* from Ollama's /api/embed endpoint.

    The request uses EMBED_MODEL and sends at most the first 2000 characters
    of *text*. Returns the first entry of the reply's "embeddings" list, or
    an empty list if anything goes wrong (request, decoding, or indexing).
    """
    try:
        embed_request = {"model": EMBED_MODEL, "input": text[:2000]}
        reply = requests.post(
            f"{OLLAMA_URL}/api/embed",
            json=embed_request,
            timeout=30,
        )
        vectors = reply.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        return []
|
||||
|
||||
|
||||
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0.0 when either vector is empty, when the lengths differ, or
    when the product of the magnitudes is zero. Otherwise the result is
    clamped to [-1.0, 1.0], because floating-point rounding can push the
    raw quotient marginally outside that range.
    """
    if not v1 or not v2 or len(v1) != len(v2):
        return 0.0
    dot = sum(a * b for a, b in zip(v1, v2))
    mag = math.sqrt(sum(a**2 for a in v1)) * math.sqrt(sum(b**2 for b in v2))
    if not mag:
        return 0.0
    return max(-1.0, min(1.0, dot / mag))
|
||||
|
||||
|
||||
def embedding_score(text, reference):
    """
    Map the cosine similarity of two texts' embeddings to a 0-10 score.

    Returns 0 immediately when either argument is empty. Otherwise the
    first 1000 characters of ``text`` and the whole ``reference`` are
    embedded, and the similarity is bucketed with a stepped (non-linear)
    table for better discrimination.
    """
    if not (text and reference):
        return 0

    candidate_vec = get_embedding(text[:1000])
    reference_vec = get_embedding(reference)
    similarity = cosine_similarity(candidate_vec, reference_vec)

    # (minimum similarity, score) pairs, checked from highest to lowest.
    steps = (
        (0.92, 10),
        (0.85, 8),
        (0.78, 6),
        (0.70, 4),
        (0.60, 2),
    )
    for floor, points in steps:
        if similarity >= floor:
            return points
    return 0
|
||||
Reference in New Issue
Block a user