# llm-benchmark/judge.py

"""
benchmark_v4/judge.py
=====================
Layer 2: Semantic judge (rubric-based).
Layer 3: Embedding similarity via Ollama nomic-embed-text.

The judge is only called when the validator gives a partial score.
Embedding similarity is used for the RAG test.
"""

import re
import json
import math
import requests
from config import JUDGE_MODEL, EMBED_MODEL, OLLAMA_URL
from prompts import JUDGE_RUBRICS, GROUND_TRUTHS, DEFAULT_RUBRIC


# ============================================
# JUDGE PROMPT TEMPLATE
# ============================================

# Prompt sent to the judge model. It is filled in with str.format() using the
# named fields {test_name}, {rubric}, {ground_truth}, {prompt} and {output}.
# The doubled braces on the last line are str.format() escapes: they render
# as a literal single-brace JSON object showing the judge the reply shape
# (a "semantic_score" in 0-10 plus a short "reason").
JUDGE_PROMPT_TEMPLATE = """You are a strict benchmark judge. Score from 0-10.

ABSOLUTE RULES:
1. Judge ONLY what the prompt asked for. Nothing else.
2. NEVER penalise for missing information not requested.
3. NEVER penalise for being concise — brevity is correct.
4. NEVER invent requirements. Only the prompt counts.
5. Minimal correct answers score 8-10.
6. Extra unnecessary content scores lower, not higher.

SCORING:
10 = perfect  |  8 = correct, trivial issue  |  6 = mostly correct
4  = partial  |  2 = major error  |  0 = wrong or hallucination

TEST: {test_name}
WHAT TO JUDGE: {rubric}
GROUND TRUTH: {ground_truth}

PROMPT (what was asked):
{prompt}

OUTPUT (what model answered):
{output}

Return ONLY this JSON on one line, nothing else:
{{"semantic_score": <0-10>, "reason": "<max 12 words>"}}"""


# ============================================
# JUDGE WARMUP
# ============================================

def warmup_judge():
    """
    Send a tiny generate request so the judge model is loaded up front.

    The request asks for at most one token (num_predict=1) and its
    response is discarded. Any failure is reported on stdout and
    otherwise ignored; nothing is returned.
    """
    print(f"  Warming up judge: {JUDGE_MODEL}")
    payload = {
        "model": JUDGE_MODEL,
        "prompt": "hi",
        "stream": False,
        "options": {"num_predict": 1},
    }
    try:
        requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    except Exception as e:
        # Warmup is best-effort: report the problem and carry on.
        print(f"  Judge warmup error: {e}")


# ============================================
# JUDGE CALL
# ============================================

def _parse_judge_response(raw):
    """
    Extract (score, reason) from the judge's raw text reply.

    Strategies are tried from strictest to loosest:
      1. a flat JSON object containing "semantic_score";
      2. a regex over JSON-looking "semantic_score"/"reason" pairs;
      3. the first run of digits after the words "semantic_score".
    Scores are clamped to the 0-10 range.
    Returns None when no score can be recovered.
    """
    def clamp(value):
        return max(0, min(10, int(value)))

    # 1. Clean JSON parse of the first brace-free object mentioning the key.
    m = re.search(r'\{[^{}]*"semantic_score"[^{}]*\}', raw)
    if m:
        try:
            parsed = json.loads(m.group())
            score = clamp(parsed.get("semantic_score", 5))
            reason = str(parsed.get("reason", ""))[:80]
            return score, reason
        except (json.JSONDecodeError, ValueError, TypeError, OverflowError):
            # TypeError: score was null / a list / an object.
            # OverflowError: score was Infinity.
            # Either way, fall through to the looser strategies below
            # instead of aborting the whole judge call.
            pass

    # 2. Fallback: pull the number (and reason, if present) with regexes.
    sm = re.search(r'"semantic_score"\s*:\s*(\d+)', raw)
    rm = re.search(r'"reason"\s*:\s*"([^"]{1,80})"', raw)
    if sm:
        return clamp(sm.group(1)), (rm.group(1) if rm else "extracted")

    # 3. Last resort: any digits following the key name.
    last = re.search(r'semantic_score[^\d]*(\d+)', raw, re.IGNORECASE)
    if last:
        return clamp(last.group(1)), "score extracted"

    return None


def call_judge(test_name, prompt, output):
    """
    Call the LLM judge with the rubric registered for ``test_name``.

    Args:
        test_name: key used to look up the rubric and ground truth;
            unknown names fall back to DEFAULT_RUBRIC and a generic
            ground-truth note.
        prompt: the prompt given to the model under test (first 500
            characters are shown to the judge). None is treated as empty.
        output: the model's answer (first 1500 characters are shown to
            the judge). None is treated as empty.

    Returns:
        (semantic_score, reason): an int in 0-10 and a short string.
        On timeout, request/parse errors, or an unparseable reply the
        midpoint score 5 is returned so a judge failure does not skew
        results toward either extreme; the reason says which case hit.
    """
    rubric       = JUDGE_RUBRICS.get(test_name, DEFAULT_RUBRIC)
    ground_truth = GROUND_TRUTHS.get(test_name, "See prompt requirements.")

    # `or ""` keeps a missing prompt/output from raising before the
    # midpoint fallback below has a chance to apply.
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        test_name=test_name,
        rubric=rubric,
        ground_truth=ground_truth,
        prompt=(prompt or "")[:500],
        output=(output or "")[:1500],
    )

    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={"model": JUDGE_MODEL, "prompt": judge_prompt, "stream": False},
            timeout=180
        )
        raw = str(response.json().get("response") or "").strip()

        result = _parse_judge_response(raw)
        if result is not None:
            return result

        print(f"  Judge unparseable: {raw[:80]}")
        return 5, "judge unparseable — midpoint"

    except requests.exceptions.Timeout:
        return 5, "judge timeout — midpoint"
    except Exception as e:
        # Surface the cause instead of discarding it; the returned reason
        # string is unchanged so downstream consumers are unaffected.
        print(f"  Judge error: {e}")
        return 5, "judge error — midpoint"


# ============================================
# EMBEDDING SIMILARITY
# ============================================

def get_embedding(text):
    """
    Fetch an embedding vector for ``text`` from Ollama's /api/embed.

    Only the first 2000 characters of ``text`` are sent, using the model
    named by EMBED_MODEL. Returns the first vector in the reply's
    "embeddings" list, or an empty list if anything goes wrong
    (request failure, bad JSON, missing/empty result).
    """
    try:
        reply = requests.post(
            f"{OLLAMA_URL}/api/embed",
            json={"model": EMBED_MODEL, "input": text[:2000]},
            timeout=30,
        )
        vectors = reply.json().get("embeddings", [[]])
        return vectors[0]
    except Exception:
        # Best-effort: callers treat an empty vector as "no embedding".
        return []


def cosine_similarity(v1, v2):
    """
    Return the cosine similarity of two equal-length numeric vectors.

    Yields 0.0 when either vector is empty, when the lengths differ,
    or when either vector has zero magnitude.
    """
    if not (v1 and v2):
        return 0.0
    if len(v1) != len(v2):
        return 0.0

    dot_product = sum(x * y for x, y in zip(v1, v2))
    norm_first = math.sqrt(sum(x**2 for x in v1))
    norm_second = math.sqrt(sum(y**2 for y in v2))
    norm_product = norm_first * norm_second

    if not norm_product:
        return 0.0
    return dot_product / norm_product


def embedding_score(text, reference):
    """
    Map the embedding cosine similarity of ``text`` vs ``reference``
    onto a stepped 0-10 score.

    Returns 0 immediately if either argument is empty. Otherwise the
    first 1000 characters of ``text`` and the whole ``reference`` are
    embedded and compared; the similarity is bucketed as
    >=0.92 -> 10, >=0.85 -> 8, >=0.78 -> 6, >=0.70 -> 4, >=0.60 -> 2,
    anything lower -> 0.
    """
    if not (text and reference):
        return 0

    candidate_vec = get_embedding(text[:1000])
    reference_vec = get_embedding(reference)
    similarity = cosine_similarity(candidate_vec, reference_vec)

    # Stepped (not linear) mapping, checked from the highest floor down.
    score_steps = (
        (0.92, 10),
        (0.85, 8),
        (0.78, 6),
        (0.70, 4),
        (0.60, 2),
    )
    for floor, points in score_steps:
        if similarity >= floor:
            return points
    return 0