"""
benchmark_v4/prompts.py
=======================
All prompts and ground truths in one place.

BENCHMARK PURPOSE (explicit):
This benchmark measures operational reliability for agentic and
automated pipelines. It rewards: format obedience, structured output
correctness, tool call precision, and hallucination resistance.
It intentionally penalises verbosity, creative deviation, and
formatting non-compliance. It is NOT a general intelligence benchmark.

Changes in this version:
- Agent prompt rewritten with explicit constraints (better differentiation)
- Structured JSON upgraded to nested array (stronger discriminator)
- ALL_TESTS is now explicit ordered list (not dict.keys())
- Ground truths reformulated as criteria (avoid judge anchoring)
- JSON Schema validator criteria made explicit
"""

import os

from config import CONTEXT_FILE, RAG_FILE

# ============================================
# STATIC PROMPTS
# ============================================

# Static prompt text for every test that needs no file content, keyed by
# test name. Values are sent verbatim, so blank lines and wording inside
# each string are significant.
PROMPTS = {
    "reasoning": """Be concise. Maximum 150 words.

Solve step by step:
A company has 120 employees.
30% are remote. Of those, 25% are contractors.
How many full-time remote employees?""",

    "coding": """Be concise. Maximum 150 words.

Write a Python function that returns the Longest Increasing Subsequence.
Explain time complexity.""",

    # UPGRADED: explicit constraints force meaningful differentiation
    "agent": """Maximum 120 words. You must:
1. Identify 2-3 specific GPU candidates by name
2. Compare their VRAM capacity and memory bandwidth
3. Acknowledge 2026 pricing uncertainty explicitly
4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
5. Exclude any GPU with less than 16GB VRAM

Task: Find the best GPU under 500€ for running local LLMs in 2026.""",

    "math": """Be concise. Maximum 100 words.

Solve step by step:
If 3 machines take 5 minutes to make 3 widgets,
how long would 100 machines take to make 100 widgets?""",

    "hallucination": """Be concise. Maximum 100 words.

Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
If you are not 100% certain, say so clearly.
Do NOT describe or summarise it if you have any doubt.""",

    # UPGRADED: nested structure, typed fields — much stronger discriminator
    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.

{
  "recommendations": [
    {
      "gpu": "",
      "price_eur": 0,
      "vram_gb": 0,
      "pros": [],
      "cons": []
    }
  ]
}

Question: Best GPU under 500€ for local LLMs.
Return exactly 2 recommendations.""",

    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)

Return ONLY the single tool call needed:
"What is the best local LLM for 16GB VRAM?"

Example format: web_search("your query here")
No explanation. No other text.""",

    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
Preserve key statistics. No extra text before or after the bullets.

AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",

    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.

Create a Kubernetes Deployment:
name is my-app
image is nginx:1.25
replicas is 2
containerPort is 80
memory limit is 256Mi
cpu limit is 250m
readinessProbe uses httpGet on path /healthz port 80""",

    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
No explanation before or after.

Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",

    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)

Show exactly 3 chained tool calls then a final answer for:
"Top 3 most downloaded Python packages this month"

Format:
1. web_search("...")
2. scrape_page("...")
3. summarize("...")
Final: [answer]""",

    "json_schema": """Return ONLY valid JSON Schema. No explanation.

Schema for:
- apiVersion: string, required
- kind: string, required, enum: [Deployment, Service, ConfigMap]
- metadata: object, required, properties: name (string, required), namespace (string, required)
- spec: object, required, additionalProperties: true""",
}

# ============================================
# EXPLICIT TEST ORDERING
# Never use dict.keys() — order must be stable
# for CSV consistency and longitudinal comparisons.
# ============================================

# Canonical, ordered list of test names. The inline comments group the
# tests by category and record each category's stated weight.
ALL_TESTS = [
    # Reasoning (5%)
    "reasoning",
    "math",
    "agent",
    # Coding / Infrastructure (25%)
    "coding",
    "yaml_generation",
    "artifact_mermaid",
    "json_schema",
    # Structured outputs (15%)
    "structured",
    "compression",
    # Agent / Tool (25%)
    "tool_calling",
    "multi_step_agent",
    # Hallucination (10%)
    "hallucination",
    # RAG / Context (20%)
    "rag",
    "context_begin",
    "context_middle",
    "context_end",
]

# ============================================
# GROUND TRUTHS — criteria-based, not canonical
# Avoid embedding exact phrasing to prevent
# judge anchoring and over-literal scoring.
# ============================================

# Per-test scoring criteria, keyed by test name. Each value is a single
# string built from adjacent literals, so the trailing space inside each
# fragment is what separates the sentences.
GROUND_TRUTHS = {
    "reasoning": (
        "Correct answer is 27 full-time remote employees. "
        "Verify: calculation uses 30% of 120 = 36 remote, "
        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
        "Award full marks if logic is correct even if phrased differently."
    ),
    "coding": (
        "A working Python function that implements LIS. "
        "Should mention O(n²) for basic DP approach. "
        "Bonus if O(n log n) with binary search is mentioned. "
        "Do not penalise for code style choices."
    ),
    "agent": (
        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "
        "Must acknowledge pricing uncertainty for 2026. "
        "Must name a benchmarking source. "
        "Score 0 if recommends GPUs under 16GB VRAM."
    ),
    "math": (
        "Correct answer is 5 minutes. "
        "Key insight: each machine independently makes 1 widget in 5 minutes, "
        "so 100 machines make 100 widgets in the same 5 minutes. "
        "Award marks if the rate-independence reasoning is clearly stated."
    ),
    "hallucination": (
        "The book does not exist. "
        "Full marks: model refuses or clearly states it cannot verify existence. "
        "Zero marks: model describes the book's plot, themes, or content as if real. "
        "Partial marks: model hedges without clear refusal."
    ),
    "structured": (
        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
        "pros (array of strings), cons (array of strings). "
        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
        "Do not score on quality of GPU choices."
    ),
    "tool_calling": (
        "Must return exactly one function call in the format: name(\"query\"). "
        "No explanation before or after. "
        "Correct function names: web_search, scrape_page, or calculate. "
        "Score 0 if any text accompanies the call."
    ),
    "compression": (
        "Must have exactly 10 bullet points starting with '- '. "
        "All 7 industries must appear: healthcare, finance, transport, "
        "manufacturing, education, energy, agriculture. "
        "Key statistics must be preserved where mentioned in source."
    ),
    "yaml_generation": (
        "Must be parseable YAML. "
        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
        "readinessProbe httpGet /healthz port 80. "
        "Do not penalise for additional valid YAML fields not specified."
    ),
    "artifact_mermaid": (
        "Must be a valid Mermaid code block with opening and closing fences. "
        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
        "Stages should appear in the correct pipeline order."
    ),
    "multi_step_agent": (
        "Must show 3 distinct tool calls using different functions. "
        "Preferred sequence: web_search → scrape_page → summarize. "
        "Must end with 'Final: [answer]'. "
        "Score based on: correct tool names, distinct calls, final answer present."
    ),
    "json_schema": (
        "Must be valid JSON Schema (parseable JSON). "
        "Must define: apiVersion as string required, "
        "kind as string required with enum [Deployment, Service, ConfigMap], "
        "metadata as object required with name and namespace as string properties, "
        "spec as object required with additionalProperties allowed. "
        "Award marks proportionally to how many of these are correctly specified."
    ),
    "context_begin": "The project name is Project Aurora.",
    "context_middle": "The budget allocated to Phase 2 is $2.4 million.",
    "context_end": "The selected vendor is Nexora Systems (Vendor B).",
    "rag": (
        "A structured summary that covers the main topics in the provided notes. "
        "Should be under 200 words. "
        "Should preserve key facts without inventing new information. "
        "Do not penalise for including accurate details from the source."
    ),
}

# ============================================
# JUDGE RUBRICS (per test — what to evaluate)
# Criteria-based, not answer-anchored.
# ============================================

# Test-specific judging checklists, keyed by test name. Only four tests
# have an entry here.
JUDGE_RUBRICS = {
    "reasoning": (
        "Check: Is the final number 27? Are the three calculation steps "
        "(120×0.30, 36×0.25, 36-9) present and correct? Is it within 150 words?"
    ),
    "agent": (
        "Check each requirement: "
        "(1) At least 2 named GPU models? "
        "(2) VRAM and bandwidth compared? "
        "(3) 2026 pricing uncertainty acknowledged? "
        "(4) Benchmarking source named? "
        "(5) No GPU under 16GB VRAM recommended? "
        "Score 2 points per requirement met (max 10). "
        "Score 0 if any GPU under 16GB is recommended."
    ),
    "math": (
        "Check: Is the answer 5 minutes? "
        "Does the explanation correctly state that each machine's rate "
        "is independent of quantity? Is it within 100 words?"
    ),
    "rag": (
        "Check: Does it cover the main topics from the notes? "
        "Is it under 200 words? "
        "Does it avoid inventing facts not in the source? "
        "Is it clearly structured?"
    ),
}

# Generic judging instruction, kept as a single string built from
# adjacent literals.
DEFAULT_RUBRIC = (
    "Check whether the output correctly fulfils all requirements stated "
    "in the original prompt. Score based on correctness and completeness, "
    "not on style or verbosity beyond what the prompt requires."
)


# ============================================
# DYNAMIC PROMPT BUILDERS
# ============================================

def ensure_context_file() -> None:
    """Create the sample context document at CONTEXT_FILE if it is missing.

    Always makes sure the ``./rag_samples`` directory exists. If
    CONTEXT_FILE is already present it is left untouched; otherwise a short
    "Project Aurora" report is written there and a confirmation line is
    printed.
    """
    # NOTE(review): the directory is hard-coded while the file path comes
    # from config — presumably CONTEXT_FILE lives under ./rag_samples; confirm.
    os.makedirs("./rag_samples", exist_ok=True)
    if os.path.exists(CONTEXT_FILE):
        return
    content = """# Project Aurora — Strategic Initiative Report

## Executive Summary
Project Aurora is a digital transformation initiative launched January 2024.
Proposed by CTO Maria Chen. Budget: $8.7M over three years.

## Phase 2 — Cloud Migration
Phase 2 budget allocation: $2.4 million.

## Vendor Recommendation
Vendor A (CloudScale) — $1.8M, limited EU.
Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
Vendor C (PrimeHost) — $1.4M, no SOC2.
Vendor D (Stratos) — $2.8M, over budget.

Final recommendation: proceed with Vendor B (Nexora Systems).
"""
    with open(CONTEXT_FILE, "w") as f:
        f.write(content)
    print(f" Created: {CONTEXT_FILE}")


def ensure_rag_file() -> None:
    """Create the sample RAG notes at RAG_FILE if the file is missing.

    Always makes sure the ``./rag_samples`` directory exists. If RAG_FILE
    is already present it is left untouched; otherwise a short set of
    homelab infrastructure notes is written there and a confirmation line
    is printed.
    """
    # NOTE(review): the directory is hard-coded while the file path comes
    # from config — presumably RAG_FILE lives under ./rag_samples; confirm.
    os.makedirs("./rag_samples", exist_ok=True)
    if os.path.exists(RAG_FILE):
        return
    content = """# Homelab Infrastructure Notes

## K8s Cluster
- 4 nodes, Longhorn storage, Traefik ingress
- FluxCD for GitOps, prune: false on llm namespace
- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker

## Ollama VM
- hostname: chat.h0melab.uk, IP: 10.0.20.57
- GPU: RTX 5060 Ti 16GB, port 11434
- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b

## Services
- Gitea at gitea.int, SSH port 3333
- Netdata + VictoriaMetrics for monitoring
- Signal bot with Whisper for voice transcription
- wiki-processor auto-generates Obsidian wiki
"""
    with open(RAG_FILE, "w") as f:
        f.write(content)
    print(f" Created: {RAG_FILE}")


def build_all_prompts() -> dict:
    """Return complete prompt dict including dynamic context and RAG prompts.

    Starts from a shallow copy of PROMPTS (the module-level dict is not
    mutated), makes sure the sample files exist, then adds:

    - "context_begin", "context_middle", "context_end": the same document
      from CONTEXT_FILE followed by one question each. These keys are only
      added when CONTEXT_FILE exists.
    - "rag": a summarisation instruction followed by the contents of
      RAG_FILE, or a fixed fallback prompt when RAG_FILE does not exist.
    """
    ensure_context_file()
    ensure_rag_file()
    prompts = dict(PROMPTS)

    # Context prompts
    if os.path.exists(CONTEXT_FILE):
        with open(CONTEXT_FILE) as f:
            context = f.read()
        # Shared preamble: instructions plus the full document; each test
        # appends only its own question.
        base = (
            "Answer in ONE sentence only. "
            "Use ONLY information from the document below. "
            "Do not add explanation or context.\n\n"
            f"DOCUMENT:\n{context}\n\n"
        )
        prompts["context_begin"] = base + "QUESTION: What is the name of the project?"
        prompts["context_middle"] = base + "QUESTION: What budget was allocated to Phase 2?"
        prompts["context_end"] = base + "QUESTION: Which vendor was selected and what is their company name?"

    # RAG prompt
    if os.path.exists(RAG_FILE):
        with open(RAG_FILE) as f:
            rag_content = f.read()
        prompts["rag"] = (
            "Maximum 200 words. Summarize and structure the following notes. "
            "Preserve all specific facts (IPs, model names, service names). "
            "Do not add information not present in the notes.\n\n"
            + rag_content
        )
    else:
        prompts["rag"] = "Maximum 200 words. Summarize: No RAG file found."

    return prompts