""" benchmark_v4/judge.py ===================== Layer 2: Semantic judge (rubric-based). Layer 3: Embedding similarity via Ollama nomic-embed-text. Judge is only called when validator gives a partial score. Embedding similarity used for RAG test. """ import re import json import math import requests from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC # ============================================ # JUDGE PROMPT TEMPLATE # ============================================ JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10. ABSOLUTE RULES: 1. Judge ONLY what the prompt asked for. Nothing else. 2. NEVER penalise for missing information not requested. 3. NEVER penalise for being concise — brevity is correct. 4. NEVER invent requirements. Only the prompt counts. 5. Minimal correct answers score 8-10. 6. Extra unnecessary content scores lower, not higher. SCORING: 10 = perfect | 8 = correct, trivial issue | 6 = mostly correct 4 = partial | 2 = major error | 0 = wrong or hallucination TEST: {test_name} WHAT TO JUDGE: {rubric} GROUND TRUTH: {ground_truth} PROMPT (what was asked): {prompt} OUTPUT (what model answered): {output} Return ONLY this JSON on one line, nothing else: {{"semantic_score": <0-10>, "reason": ""}}""" # ============================================ # JUDGE WARMUP # ============================================ def warmup_judge(): """Load judge model without generating output.""" print(f" Warming up judge: {JUDGE_MODEL}") try: requests.post( f"{OLLAMA_URL}/api/generate", json={ "model": JUDGE_MODEL, "prompt": "hi", "stream": False, "options": {"num_predict": 1} }, timeout=120 ) except Exception as e: print(f" Judge warmup error: {e}") # ============================================ # JUDGE CALL # ============================================ def call_judge(test_name, prompt, output): """ Call LLM judge with strict rubric. Returns (semantic_score 0-10, reason str). Falls back to midpoint (5) on failure to avoid corrupting results. """ rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC) ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.") judge_prompt = JUDGE_PROMPT_TEMPLATE.format( test_name=test_name, rubric=rubric, ground_truth=ground_truth, prompt=prompt[:500], output=output[:1500], ) try: response = requests.post( f"{OLLAMA_URL}/api/generate", json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False}, timeout=180 ) raw = response.json().get("response", "").strip() # Try clean JSON parse m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw) if m: try: parsed = json.loads(m.group()) score = max(0, min(10, int(parsed.get("semantic_score", 5)))) reason = str(parsed.get("reason", ""))[:80] return score, reason except (json.JSONDecodeError, ValueError): pass # Fallback: extract score number sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw) rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw) if sm: score = max(0, min(10, int(sm.group(1)))) reason = rm.group(1) if rm else "extracted" return score, reason # Last resort last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE) if last: return max(0, min(10, int(last.group(1)))), "score extracted" print(f" Judge unparseable: {raw[:80]}") return 5, "judge unparseable — midpoint" except requests.exceptions.Timeout: return 5, "judge timeout — midpoint" except Exception as e: return 5, f"judge error — midpoint" # ============================================ # EMBEDDING SIMILARITY # ============================================ def get_embedding(text): """Get embedding vector from nomic-embed-text via Ollama.""" try: r = requests.post( f"{OLLAMA_URL}/api/embed", json={"model": EMBED_MODEL, "input": text[:2000]}, timeout=30 ) return r.json().get("embeddings", [[]])[0] except Exception: return [] def cosine_similarity(v1, v2): if not v1 or not v2 or len(v1) != len(v2): return 0.0 dot = sum(a * b for a, b in zip(v1, v2)) mag = math.sqrt(sum(a**2 for a in v1)) * math.sqrt(sum(b**2 for b in v2)) return dot / mag if mag else 0.0 def embedding_score(text, reference): """ Score 0-10 based on cosine similarity. Uses stepped mapping for better discrimination. """ if not text or not reference: return 0 v1 = get_embedding(text[:1000]) v2 = get_embedding(reference) sim = cosine_similarity(v1, v2) # Stepped mapping — more discriminating than linear if sim >= 0.92: return 10 if sim >= 0.85: return 8 if sim >= 0.78: return 6 if sim >= 0.70: return 4 if sim >= 0.60: return 2 return 0