RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/judge.py
+++ b/judge.py
@@ -0,0 +1,178 @@
+"""
+benchmark_v4/judge.py
+=====================
+Layer 2: Semantic judge (rubric-based).
+Layer 3: Embedding similarity via Ollama nomic-embed-text.
+
+Judge is only called when validator gives a partial score.
+Embedding similarity used for RAG test.
+"""
+
+import re
+import json
+import math
+import requests
+from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
+from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC
+
+
+# ============================================
+# JUDGE PROMPT TEMPLATE
+# ============================================
+
+# Prompt sent to the judge model. It is filled in with str.format(), which
+# expects the named fields test_name, rubric, ground_truth, prompt and output.
+# The doubled braces around the JSON example on the last line are format
+# escapes: they render as single literal braces in the final prompt.
+JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.
+
+ABSOLUTE RULES:
+1. Judge ONLY what the prompt asked for. Nothing else.
+2. NEVER penalise for missing information not requested.
+3. NEVER penalise for being concise — brevity is correct.
+4. NEVER invent requirements. Only the prompt counts.
+5. Minimal correct answers score 8-10.
+6. Extra unnecessary content scores lower, not higher.
+
+SCORING:
+10 = perfect  |  8 = correct, trivial issue  |  6 = mostly correct
+4  = partial  |  2 = major error  |  0 = wrong or hallucination
+
+TEST: {test_name}
+WHAT TO JUDGE: {rubric}
+GROUND TRUTH: {ground_truth}
+
+PROMPT (what was asked):
+{prompt}
+
+OUTPUT (what model answered):
+{output}
+
+Return ONLY this JSON on one line, nothing else:
+{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""
+
+
+# ============================================
+# JUDGE WARMUP
+# ============================================
+
+def warmup_judge():
+    """
+    Send a one-token, non-streaming generate request for JUDGE_MODEL.
+
+    The reply is discarded; the request exists only so the first real judge
+    call does not pay the start-up cost (presumably model loading on the
+    Ollama side). Best effort: any failure is printed and otherwise ignored.
+    """
+    print(f"  Warming up judge: {JUDGE_MODEL}")
+    try:
+        endpoint = f"{OLLAMA_URL}/api/generate"
+        payload = {
+            "model": JUDGE_MODEL,
+            "prompt": "hi",
+            "stream": False,
+            # Cap generation at a single token; the output itself is not used.
+            "options": {"num_predict": 1},
+        }
+        requests.post(endpoint, json=payload, timeout=120)
+    except Exception as e:
+        print(f"  Judge warmup error: {e}")
+
+
+# ============================================
+# JUDGE CALL
+# ============================================
+
+def _clamp_score(value):
+    """Coerce *value* to int and clamp it into the 0-10 score range."""
+    return max(0, min(10, int(value)))
+
+
+def _parse_judge_response(raw):
+    """Extract (score, reason) from the judge's raw reply, or None if no score is found."""
+    # 1) Preferred: a flat JSON object that mentions "semantic_score".
+    m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
+    if m:
+        try:
+            parsed = json.loads(m.group())
+            score = _clamp_score(parsed.get("semantic_score", 5))
+            reason = str(parsed.get("reason", ""))[:80]
+            return score, reason
+        except (ValueError, TypeError, OverflowError):
+            # ValueError covers json.JSONDecodeError and non-numeric strings,
+            # TypeError covers null/list/dict scores, OverflowError covers
+            # Infinity. All of them fall through to the regex fallbacks
+            # instead of aborting the whole parse.
+            pass
+
+    # 2) Fallback: pick the score (and reason, if present) out of broken JSON.
+    sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
+    rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
+    if sm:
+        return _clamp_score(sm.group(1)), (rm.group(1) if rm else "extracted")
+
+    # 3) Last resort: first run of digits after the words "semantic_score".
+    last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
+    if last:
+        return _clamp_score(last.group(1)), "score extracted"
+
+    return None
+
+
+def call_judge(test_name, prompt, output):
+    """
+    Call the LLM judge with the rubric registered for ``test_name``.
+
+    Args:
+        test_name: Key used to look up the rubric and ground truth; unknown
+            names fall back to DEFAULT_RUBRIC and a generic ground truth.
+        prompt: The prompt given to the model under test (first 500
+            characters are shown to the judge). ``None`` is treated as "".
+        output: The model's answer (first 1500 characters are shown to the
+            judge). ``None`` is treated as "".
+
+    Returns:
+        (semantic_score, reason): an int clamped to 0-10 and a short string.
+        On timeout, transport error or an unparseable reply the score is the
+        midpoint (5) so a judge failure does not skew results either way.
+    """
+    rubric = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
+    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")
+
+    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
+        test_name=test_name,
+        rubric=rubric,
+        ground_truth=ground_truth,
+        # Guard against None so a missing prompt/output yields a judged empty
+        # answer rather than an uncaught TypeError from slicing.
+        prompt=(prompt or "")[:500],
+        output=(output or "")[:1500],
+    )
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_URL}/api/generate",
+            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
+            timeout=180
+        )
+        raw = response.json().get("response", "").strip()
+
+        result = _parse_judge_response(raw)
+        if result is not None:
+            return result
+
+        print(f"  Judge unparseable: {raw[:80]}")
+        return 5, "judge unparseable — midpoint"
+
+    except requests.exceptions.Timeout:
+        return 5, "judge timeout — midpoint"
+    except Exception as e:
+        # Surface the cause on the console; the returned reason string is
+        # kept unchanged for callers that store or compare it.
+        print(f"  Judge error: {e}")
+        return 5, "judge error — midpoint"
+
+
+# ============================================
+# EMBEDDING SIMILARITY
+# ============================================
+
+def get_embedding(text):
+    """
+    Request an embedding for ``text`` from EMBED_MODEL via Ollama's /api/embed.
+
+    Only the first 2000 characters of ``text`` are sent.
+
+    Returns:
+        The first vector in the response's "embeddings" list, or [] when the
+        field is missing/empty or the request fails. Failures are printed so
+        they can be told apart from a genuinely low similarity downstream.
+    """
+    try:
+        r = requests.post(
+            f"{OLLAMA_URL}/api/embed",
+            json={"model": EMBED_MODEL, "input": text[:2000]},
+            timeout=30
+        )
+        # A missing, null or empty "embeddings" field all collapse to [].
+        vectors = r.json().get("embeddings") or [[]]
+        return vectors[0]
+    except Exception as e:
+        print(f"  Embedding error: {e}")
+        return []
+
+
+def cosine_similarity(v1, v2):
+    """
+    Return the cosine similarity of two equal-length numeric vectors.
+
+    Yields 0.0 when either vector is empty, when the lengths differ, or when
+    the product of the two magnitudes is zero.
+    """
+    if v1 and v2 and len(v1) == len(v2):
+        numerator = sum(x * y for x, y in zip(v1, v2))
+        norm_1 = math.sqrt(sum(x**2 for x in v1))
+        norm_2 = math.sqrt(sum(y**2 for y in v2))
+        denominator = norm_1 * norm_2
+        if denominator:
+            return numerator / denominator
+    return 0.0
+
+
+def embedding_score(text, reference):
+    """
+    Map the cosine similarity between two texts onto a stepped 0-10 score.
+
+    Returns 0 immediately when either argument is falsy. Otherwise the first
+    1000 characters of ``text`` and the whole ``reference`` are embedded with
+    get_embedding() and the similarity is bucketed:
+    >= 0.92 -> 10, >= 0.85 -> 8, >= 0.78 -> 6, >= 0.70 -> 4, >= 0.60 -> 2,
+    anything lower -> 0.
+    """
+    if not (text and reference):
+        return 0
+
+    candidate_vec = get_embedding(text[:1000])
+    reference_vec = get_embedding(reference)
+    similarity = cosine_similarity(candidate_vec, reference_vec)
+
+    # Thresholds in descending order; the first one reached decides the score.
+    steps = ((0.92, 10), (0.85, 8), (0.78, 6), (0.70, 4), (0.60, 2))
+    for floor, points in steps:
+        if similarity >= floor:
+            return points
+    return 0