# llm-benchmark/prompts.py

"""
benchmark_v4/prompts.py
=======================
All prompts and ground truths in one place.

BENCHMARK PURPOSE (explicit):
  This benchmark measures operational reliability for agentic and
  automated pipelines. It rewards: format obedience, structured output
  correctness, tool call precision, and hallucination resistance.
  It intentionally penalises verbosity, creative deviation, and
  formatting non-compliance. It is NOT a general intelligence benchmark.

Changes in this version:
  - Agent prompt rewritten with explicit constraints (better differentiation)
  - Structured JSON upgraded to nested array (stronger discriminator)
  - ALL_TESTS is now an explicit ordered list (not dict.keys())
  - Ground truths reformulated as criteria (avoid judge anchoring)
  - JSON Schema validator criteria made explicit
"""

import os
from config import CONTEXT_FILE, RAG_FILE

# ============================================
# STATIC PROMPTS
# ============================================

# Static prompt text keyed by test name.
# build_all_prompts() copies this dict and adds the file-backed entries
# ("context_begin", "context_middle", "context_end", "rag"), so those four
# keys are intentionally absent here.
PROMPTS = {
    # Two-step percentage word problem; 150-word cap.
    "reasoning": """Be concise. Maximum 150 words.

Solve step by step:
A company has 120 employees.
30% are remote. Of those, 25% are contractors.
How many full-time remote employees?""",

    # Longest Increasing Subsequence implementation plus a complexity explanation.
    "coding": """Be concise. Maximum 150 words.

Write a Python function that returns the Longest Increasing Subsequence.
Explain time complexity.""",

    # UPGRADED: explicit constraints force meaningful differentiation
    # The five numbered requirements are mirrored one-to-one by
    # JUDGE_RUBRICS["agent"].
    "agent": """Maximum 120 words. You must:
1. Identify 2-3 specific GPU candidates by name
2. Compare their VRAM capacity and memory bandwidth
3. Acknowledge 2026 pricing uncertainty explicitly
4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
5. Exclude any GPU with less than 16GB VRAM

Task: Find the best GPU under 500€ for running local LLMs in 2026.""",

    # Rate problem (machines / widgets); 100-word cap.
    "math": """Be concise. Maximum 100 words.

Solve step by step:
If 3 machines take 5 minutes to make 3 widgets,
how long would 100 machines take to make 100 widgets?""",

    # Asks about a book title; GROUND_TRUTHS["hallucination"] states that the
    # book does not exist.
    "hallucination": """Be concise. Maximum 100 words.

Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
If you are not 100% certain, say so clearly.
Do NOT describe or summarise it if you have any doubt.""",

    # UPGRADED: nested structure, typed fields — much stronger discriminator
    # The template shows one array element; the prompt asks for exactly two.
    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.

{
  "recommendations": [
    {
      "gpu": "",
      "price_eur": 0,
      "vram_gb": 0,
      "pros": [],
      "cons": []
    }
  ]
}

Question: Best GPU under 500€ for local LLMs.
Return exactly 2 recommendations.""",

    # Expects a single call chosen from the three declared tools, with no
    # surrounding text.
    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)

Return ONLY the single tool call needed:
"What is the best local LLM for 16GB VRAM?"

Example format: web_search("your query here")
No explanation. No other text.""",

    # Fixed-count bullet summary of a passage that lists seven industries.
    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
Preserve key statistics. No extra text before or after the bullets.

AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",

    # Kubernetes Deployment manifest from a plain-language spec.
    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.

Create a Kubernetes Deployment:
name is my-app
image is nginx:1.25
replicas is 2
containerPort is 80
memory limit is 256Mi
cpu limit is 250m
readinessProbe uses httpGet on path /healthz port 80""",

    # Fenced Mermaid flowchart containing eight named pipeline stages.
    # Unlike the JSON/YAML prompts, this one REQUIRES the code fences.
    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
No explanation before or after.

Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",

    # Three chained tool calls followed by a "Final:" line.
    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)

Show exactly 3 chained tool calls then a final answer for:
"Top 3 most downloaded Python packages this month"

Format:
1. web_search("...")
2. scrape_page("...")
3. summarize("...")
Final: [answer]""",

    # JSON Schema for a four-field Kubernetes-style object
    # (apiVersion, kind, metadata, spec).
    "json_schema": """Return ONLY valid JSON Schema. No explanation.

Schema for:
- apiVersion: string, required
- kind: string, required, enum: [Deployment, Service, ConfigMap]
- metadata: object, required, properties: name (string, required), namespace (string, required)
- spec: object, required, additionalProperties: true""",
}

# ============================================
# EXPLICIT TEST ORDERING
# Never use dict.keys() — order must be stable
# for CSV consistency and longitudinal comparisons.
# ============================================

# Canonical run order, assembled one category group at a time.
# The result is a plain list of 16 test names; the four file-backed tests
# ("rag", "context_*") come last.
ALL_TESTS = (
    # Reasoning (5%)
    ["reasoning", "math", "agent"]
    # Coding / Infrastructure (25%)
    + ["coding", "yaml_generation", "artifact_mermaid", "json_schema"]
    # Structured outputs (15%)
    + ["structured", "compression"]
    # Agent / Tool (25%)
    + ["tool_calling", "multi_step_agent"]
    # Hallucination (10%)
    + ["hallucination"]
    # RAG / Context (20%)
    + ["rag", "context_begin", "context_middle", "context_end"]
)

# ============================================
# GROUND TRUTHS — criteria-based, not canonical
# Avoid embedding exact phrasing to prevent
# judge anchoring and over-literal scoring.
# ============================================

# Reference answers / scoring criteria keyed by test name.
# There is one entry for every name in ALL_TESTS (16 in total). Most entries
# are phrased as criteria; the three "context_*" entries are literal answers.
GROUND_TRUTHS = {
    # 120 * 0.30 = 36 remote; 36 * 0.25 = 9 contractors; 36 - 9 = 27.
    "reasoning": (
        "Correct answer is 27 full-time remote employees. "
        "Verify: calculation uses 30% of 120 = 36 remote, "
        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
        "Award full marks if logic is correct even if phrased differently."
    ),
    "coding": (
        "A working Python function that implements LIS. "
        "Should mention O(n²) for basic DP approach. "
        "Bonus if O(n log n) with binary search is mentioned. "
        "Do not penalise for code style choices."
    ),
    # Mirrors the numbered constraints in PROMPTS["agent"]; recommending a
    # GPU below 16GB is a hard fail.
    "agent": (
        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "
        "Must acknowledge pricing uncertainty for 2026. "
        "Must name a benchmarking source. "
        "Score 0 if recommends GPUs under 16GB VRAM."
    ),
    "math": (
        "Correct answer is 5 minutes. "
        "Key insight: each machine independently makes 1 widget in 5 minutes, "
        "so 100 machines make 100 widgets in the same 5 minutes. "
        "Award marks if the rate-independence reasoning is clearly stated."
    ),
    # Three-tier scale: refusal / hedging / fabricated description.
    "hallucination": (
        "The book does not exist. "
        "Full marks: model refuses or clearly states it cannot verify existence. "
        "Zero marks: model describes the book's plot, themes, or content as if real. "
        "Partial marks: model hedges without clear refusal."
    ),
    # Scores shape and types only, not the quality of the GPU picks.
    "structured": (
        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
        "pros (array of strings), cons (array of strings). "
        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
        "Do not score on quality of GPU choices."
    ),
    "tool_calling": (
        "Must return exactly one function call in the format: name(\"query\"). "
        "No explanation before or after. "
        "Correct function names: web_search, scrape_page, or calculate. "
        "Score 0 if any text accompanies the call."
    ),
    # The seven industries listed here are the ones named in
    # PROMPTS["compression"].
    "compression": (
        "Must have exactly 10 bullet points starting with '- '. "
        "All 7 industries must appear: healthcare, finance, transport, "
        "manufacturing, education, energy, agriculture. "
        "Key statistics must be preserved where mentioned in source."
    ),
    "yaml_generation": (
        "Must be parseable YAML. "
        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
        "readinessProbe httpGet /healthz port 80. "
        "Do not penalise for additional valid YAML fields not specified."
    ),
    "artifact_mermaid": (
        "Must be a valid Mermaid code block with opening and closing fences. "
        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
        "Stages should appear in the correct pipeline order."
    ),
    "multi_step_agent": (
        "Must show 3 distinct tool calls using different functions. "
        "Preferred sequence: web_search → scrape_page → summarize. "
        "Must end with 'Final: [answer]'. "
        "Score based on: correct tool names, distinct calls, final answer present."
    ),
    # Proportional scoring across the four listed field definitions.
    "json_schema": (
        "Must be valid JSON Schema (parseable JSON). "
        "Must define: apiVersion as string required, "
        "kind as string required with enum [Deployment, Service, ConfigMap], "
        "metadata as object required with name and namespace as string properties, "
        "spec as object required with additionalProperties allowed. "
        "Award marks proportionally to how many of these are correctly specified."
    ),
    # These three answers match the sample document that ensure_context_file()
    # writes. NOTE(review): ensure_context_file() leaves an existing
    # CONTEXT_FILE untouched, so a custom document would not match these
    # answers — confirm that is intended.
    "context_begin":   "The project name is Project Aurora.",
    "context_middle":  "The budget allocated to Phase 2 is $2.4 million.",
    "context_end":     "The selected vendor is Nexora Systems (Vendor B).",
    # Content-agnostic criteria: the RAG notes are read from RAG_FILE at
    # prompt-build time, so no specific facts are listed here.
    "rag": (
        "A structured summary that covers the main topics in the provided notes. "
        "Should be under 200 words. "
        "Should preserve key facts without inventing new information. "
        "Do not penalise for including accurate details from the source."
    ),
}

# ============================================
# JUDGE RUBRICS (per test — what to evaluate)
# Criteria-based, not answer-anchored.
# ============================================

# Per-test judging checklists keyed by test name.
# Only four tests ("reasoning", "agent", "math", "rag") have a dedicated
# rubric here. NOTE(review): DEFAULT_RUBRIC is presumably the fallback for
# every other test; the lookup happens outside this file — confirm.
JUDGE_RUBRICS = {
    # Word limit matches the 150-word cap in PROMPTS["reasoning"].
    "reasoning": (
        "Check: Is the final number 27? Are the three calculation steps "
        "(120×0.30, 36×0.25, 36-9) present and correct? Is it within 150 words?"
    ),
    # One check per numbered requirement in PROMPTS["agent"]; 2 points each,
    # with a hard zero when a sub-16GB GPU is recommended.
    "agent": (
        "Check each requirement: "
        "(1) At least 2 named GPU models? "
        "(2) VRAM and bandwidth compared? "
        "(3) 2026 pricing uncertainty acknowledged? "
        "(4) Benchmarking source named? "
        "(5) No GPU under 16GB VRAM recommended? "
        "Score 2 points per requirement met (max 10). "
        "Score 0 if any GPU under 16GB is recommended."
    ),
    # Word limit matches the 100-word cap in PROMPTS["math"].
    "math": (
        "Check: Is the answer 5 minutes? "
        "Does the explanation correctly state that each machine's rate "
        "is independent of quantity? Is it within 100 words?"
    ),
    # Content-agnostic: the notes come from RAG_FILE at prompt-build time.
    "rag": (
        "Check: Does it cover the main topics from the notes? "
        "Is it under 200 words? "
        "Does it avoid inventing facts not in the source? "
        "Is it clearly structured?"
    ),
}

# Generic judging instruction, written as two sentences joined by one space.
# NOTE(review): presumably used for tests that have no entry in
# JUDGE_RUBRICS; the consumer is not in this file — confirm.
DEFAULT_RUBRIC = " ".join(
    (
        "Check whether the output correctly fulfils all requirements stated in the original prompt.",
        "Score based on correctness and completeness, not on style or verbosity beyond what the prompt requires.",
    )
)


# ============================================
# DYNAMIC PROMPT BUILDERS
# ============================================

def ensure_context_file():
    """Write the sample context document to CONTEXT_FILE if it is missing.

    The document is the fixture read by build_all_prompts() for the
    ``context_begin`` / ``context_middle`` / ``context_end`` prompts, and the
    facts in it (project name, Phase 2 budget, selected vendor) are the ones
    the matching GROUND_TRUTHS entries expect.

    An existing file is never overwritten. The parent directory of
    CONTEXT_FILE is created on demand, so the configured path may live in any
    directory, not only ``./rag_samples``.

    Returns:
        None. Prints a one-line notice when the file is created.
    """
    # Create the directory the configured path actually lives in. A bare
    # filename has an empty dirname, which means "current directory" and
    # needs no creation.
    parent_dir = os.path.dirname(CONTEXT_FILE)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(CONTEXT_FILE):
        return

    content = """# Project Aurora — Strategic Initiative Report

## Executive Summary
Project Aurora is a digital transformation initiative launched January 2024.
Proposed by CTO Maria Chen. Budget: $8.7M over three years.

## Phase 2 — Cloud Migration
Phase 2 budget allocation: $2.4 million.

## Vendor Recommendation
Vendor A (CloudScale) — $1.8M, limited EU.
Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
Vendor C (PrimeHost) — $1.4M, no SOC2.
Vendor D (Stratos) — $2.8M, over budget.

Final recommendation: proceed with Vendor B (Nexora Systems).
"""
    # Encoding is left at the platform default on purpose: build_all_prompts()
    # reads the file back with the default encoding as well.
    with open(CONTEXT_FILE, "w") as f:
        f.write(content)
    print(f"  Created: {CONTEXT_FILE}")


def ensure_rag_file():
    """Write the sample RAG notes to RAG_FILE if it is missing.

    The notes are the fixture that build_all_prompts() embeds in the ``rag``
    prompt. An existing file is never overwritten, so user-supplied notes at
    RAG_FILE take precedence over the built-in sample.

    The parent directory of RAG_FILE is created on demand, so the configured
    path may live in any directory, not only ``./rag_samples``.

    Returns:
        None. Prints a one-line notice when the file is created.
    """
    # Create the directory the configured path actually lives in. A bare
    # filename has an empty dirname, which means "current directory" and
    # needs no creation.
    parent_dir = os.path.dirname(RAG_FILE)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(RAG_FILE):
        return

    content = """# Homelab Infrastructure Notes

## K8s Cluster
- 4 nodes, Longhorn storage, Traefik ingress
- FluxCD for GitOps, prune: false on llm namespace
- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker

## Ollama VM
- hostname: chat.h0melab.uk, IP: 10.0.20.57
- GPU: RTX 5060 Ti 16GB, port 11434
- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b

## Services
- Gitea at gitea.int, SSH port 3333
- Netdata + VictoriaMetrics for monitoring
- Signal bot with Whisper for voice transcription
- wiki-processor auto-generates Obsidian wiki
"""
    # Encoding is left at the platform default on purpose: build_all_prompts()
    # reads the file back with the default encoding as well.
    with open(RAG_FILE, "w") as f:
        f.write(content)
    print(f"  Created: {RAG_FILE}")


def build_all_prompts():
    """Assemble the full prompt mapping: static prompts plus file-backed ones.

    Makes sure both fixture files exist, then returns a fresh copy of PROMPTS
    extended with:

    * ``context_begin`` / ``context_middle`` / ``context_end`` — the same
      document-grounded preamble followed by one question each (only added
      when CONTEXT_FILE is present);
    * ``rag`` — a summarisation instruction followed by the contents of
      RAG_FILE, or a fixed placeholder prompt when RAG_FILE is absent.

    PROMPTS itself is not modified.
    """
    ensure_context_file()
    ensure_rag_file()

    combined = {**PROMPTS}

    # --- context retrieval prompts -------------------------------------
    if os.path.exists(CONTEXT_FILE):
        with open(CONTEXT_FILE) as handle:
            document = handle.read()

        preamble = "".join(
            [
                "Answer in ONE sentence only. ",
                "Use ONLY information from the document below. ",
                "Do not add explanation or context.\n\n",
                f"DOCUMENT:\n{document}\n\n",
            ]
        )
        # Insertion order matters for callers that iterate the dict:
        # begin, middle, end.
        questions = (
            ("context_begin", "What is the name of the project?"),
            ("context_middle", "What budget was allocated to Phase 2?"),
            ("context_end", "Which vendor was selected and what is their company name?"),
        )
        for key, question in questions:
            combined[key] = preamble + "QUESTION: " + question

    # --- RAG summarisation prompt --------------------------------------
    if not os.path.exists(RAG_FILE):
        combined["rag"] = "Maximum 200 words. Summarize: No RAG file found."
        return combined

    with open(RAG_FILE) as handle:
        notes = handle.read()

    instruction = (
        "Maximum 200 words. Summarize and structure the following notes. "
        "Preserve all specific facts (IPs, model names, service names). "
        "Do not add information not present in the notes.\n\n"
    )
    combined["rag"] = instruction + notes
    return combined