RC: (add) python script files
This commit is contained in:
388
prompts.py
Normal file
388
prompts.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
benchmark_v4/prompts.py
|
||||
=======================
|
||||
All prompts and ground truths in one place.
|
||||
|
||||
BENCHMARK PURPOSE (explicit):
|
||||
This benchmark measures operational reliability for agentic and
|
||||
automated pipelines. It rewards: format obedience, structured output
|
||||
correctness, tool call precision, and hallucination resistance.
|
||||
It intentionally penalises verbosity, creative deviation, and
|
||||
formatting non-compliance. It is NOT a general intelligence benchmark.
|
||||
|
||||
Changes in this version:
|
||||
- Agent prompt rewritten with explicit constraints (better differentiation)
|
||||
- Structured JSON upgraded to nested array (stronger discriminator)
|
||||
- ALL_TESTS is now explicit ordered list (not dict.keys())
|
||||
- Ground truths reformulated as criteria (avoid judge anchoring)
|
||||
- JSON Schema validator criteria made explicit
|
||||
"""
|
||||
|
||||
import os
|
||||
from config import CONTEXT_FILE, RAG_FILE
|
||||
|
||||
# ============================================
|
||||
# STATIC PROMPTS
|
||||
# ============================================
|
||||
|
||||
# Static prompt text, keyed by test id.
# build_all_prompts() copies this dict and adds the file-backed "rag" and
# "context_begin" / "context_middle" / "context_end" prompts, which is why
# those ids are listed in ALL_TESTS but have no entry here.
# The text inside each literal is sent verbatim: whitespace and blank lines
# are part of the prompt.
PROMPTS = {
    # Two-step percentage problem; GROUND_TRUTHS["reasoning"] expects 27.
    "reasoning": """Be concise. Maximum 150 words.

Solve step by step:
A company has 120 employees.
30% are remote. Of those, 25% are contractors.
How many full-time remote employees?""",

    # Code plus complexity explanation; see GROUND_TRUTHS["coding"].
    "coding": """Be concise. Maximum 150 words.

Write a Python function that returns the Longest Increasing Subsequence.
Explain time complexity.""",

    # UPGRADED: explicit constraints force meaningful differentiation
    # The five numbered requirements correspond one-to-one to the five checks
    # in JUDGE_RUBRICS["agent"].
    "agent": """Maximum 120 words. You must:
1. Identify 2-3 specific GPU candidates by name
2. Compare their VRAM capacity and memory bandwidth
3. Acknowledge 2026 pricing uncertainty explicitly
4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
5. Exclude any GPU with less than 16GB VRAM

Task: Find the best GPU under 500€ for running local LLMs in 2026.""",

    # Rate problem; GROUND_TRUTHS["math"] expects 5 minutes.
    "math": """Be concise. Maximum 100 words.

Solve step by step:
If 3 machines take 5 minutes to make 3 widgets,
how long would 100 machines take to make 100 widgets?""",

    # GROUND_TRUTHS["hallucination"] treats this book as non-existent, so a
    # description of its content scores zero.
    "hallucination": """Be concise. Maximum 100 words.

Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
If you are not 100% certain, say so clearly.
Do NOT describe or summarise it if you have any doubt.""",

    # UPGRADED: nested structure, typed fields — much stronger discriminator
    # The template below shows the expected field names and value types only.
    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.

{
"recommendations": [
{
"gpu": "",
"price_eur": 0,
"vram_gb": 0,
"pros": [],
"cons": []
}
]
}

Question: Best GPU under 500€ for local LLMs.
Return exactly 2 recommendations.""",

    # Exactly one call, nothing else; GROUND_TRUTHS["tool_calling"] scores 0
    # if any text accompanies the call.
    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)

Return ONLY the single tool call needed:
"What is the best local LLM for 16GB VRAM?"

Example format: web_search("your query here")
No explanation. No other text.""",

    # The source text names 7 industries; GROUND_TRUTHS["compression"]
    # requires all of them to survive in the 10 bullets.
    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
Preserve key statistics. No extra text before or after the bullets.

AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",

    # Every value listed here is checked by GROUND_TRUTHS["yaml_generation"].
    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.

Create a Kubernetes Deployment:
name is my-app
image is nginx:1.25
replicas is 2
containerPort is 80
memory limit is 256Mi
cpu limit is 250m
readinessProbe uses httpGet on path /healthz port 80""",

    # Unlike the JSON/YAML prompts, this one REQUIRES code fences.
    # Eight pipeline stages, order matters.
    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
No explanation before or after.

Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",

    # Three chained calls followed by a "Final:" line.
    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)

Show exactly 3 chained tool calls then a final answer for:
"Top 3 most downloaded Python packages this month"

Format:
1. web_search("...")
2. scrape_page("...")
3. summarize("...")
Final: [answer]""",

    # Four top-level properties, all required; marks are awarded
    # proportionally (see GROUND_TRUTHS["json_schema"]).
    "json_schema": """Return ONLY valid JSON Schema. No explanation.

Schema for:
- apiVersion: string, required
- kind: string, required, enum: [Deployment, Service, ConfigMap]
- metadata: object, required, properties: name (string, required), namespace (string, required)
- spec: object, required, additionalProperties: true""",
}
|
||||
|
||||
# ============================================
|
||||
# EXPLICIT TEST ORDERING
|
||||
# Never use dict.keys() — order must be stable
|
||||
# for CSV consistency and longitudinal comparisons.
|
||||
# ============================================
|
||||
|
||||
# Test ids grouped by scoring category, in the canonical run/report order.
# The percentages are the category labels carried over from the flat list.
_TEST_GROUPS = (
    # Reasoning (5%)
    ("reasoning", "math", "agent"),
    # Coding / Infrastructure (25%)
    ("coding", "yaml_generation", "artifact_mermaid", "json_schema"),
    # Structured outputs (15%)
    ("structured", "compression"),
    # Agent / Tool (25%)
    ("tool_calling", "multi_step_agent"),
    # Hallucination (10%)
    ("hallucination",),
    # RAG / Context (20%)
    ("rag", "context_begin", "context_middle", "context_end"),
)

# Flattened, explicitly ordered list of every test id (groups in the order
# above, ids in the order written inside each group).
ALL_TESTS = [test_id for group in _TEST_GROUPS for test_id in group]
|
||||
|
||||
# ============================================
|
||||
# GROUND TRUTHS — criteria-based, not canonical
|
||||
# Avoid embedding exact phrasing to prevent
|
||||
# judge anchoring and over-literal scoring.
|
||||
# ============================================
|
||||
|
||||
# Reference criteria handed to the judge, keyed by test id.
# Each value is one string assembled from adjacent literals; the trailing
# space inside each fragment is what separates the sentences.
GROUND_TRUTHS = {
    # 120 * 0.30 = 36 remote; 36 * 0.25 = 9 contractors; 36 - 9 = 27.
    "reasoning": (
        "Correct answer is 27 full-time remote employees. "
        "Verify: calculation uses 30% of 120 = 36 remote, "
        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
        "Award full marks if logic is correct even if phrased differently."
    ),
    "coding": (
        "A working Python function that implements LIS. "
        "Should mention O(n²) for basic DP approach. "
        "Bonus if O(n log n) with binary search is mentioned. "
        "Do not penalise for code style choices."
    ),
    # Mirrors the numbered requirements in PROMPTS["agent"]; recommending a
    # GPU below 16GB is a hard fail.
    "agent": (
        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "
        "Must acknowledge pricing uncertainty for 2026. "
        "Must name a benchmarking source. "
        "Score 0 if recommends GPUs under 16GB VRAM."
    ),
    "math": (
        "Correct answer is 5 minutes. "
        "Key insight: each machine independently makes 1 widget in 5 minutes, "
        "so 100 machines make 100 widgets in the same 5 minutes. "
        "Award marks if the rate-independence reasoning is clearly stated."
    ),
    # Three-tier scoring: refuse/flag uncertainty > hedge > describe as real.
    "hallucination": (
        "The book does not exist. "
        "Full marks: model refuses or clearly states it cannot verify existence. "
        "Zero marks: model describes the book's plot, themes, or content as if real. "
        "Partial marks: model hedges without clear refusal."
    ),
    # Structure and types only; the GPU choices themselves are not judged.
    "structured": (
        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
        "pros (array of strings), cons (array of strings). "
        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
        "Do not score on quality of GPU choices."
    ),
    # The three accepted names are the tools offered in PROMPTS["tool_calling"].
    "tool_calling": (
        "Must return exactly one function call in the format: name(\"query\"). "
        "No explanation before or after. "
        "Correct function names: web_search, scrape_page, or calculate. "
        "Score 0 if any text accompanies the call."
    ),
    # The 7 industries are the ones named in PROMPTS["compression"].
    "compression": (
        "Must have exactly 10 bullet points starting with '- '. "
        "All 7 industries must appear: healthcare, finance, transport, "
        "manufacturing, education, energy, agriculture. "
        "Key statistics must be preserved where mentioned in source."
    ),
    "yaml_generation": (
        "Must be parseable YAML. "
        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
        "readinessProbe httpGet /healthz port 80. "
        "Do not penalise for additional valid YAML fields not specified."
    ),
    "artifact_mermaid": (
        "Must be a valid Mermaid code block with opening and closing fences. "
        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
        "Stages should appear in the correct pipeline order."
    ),
    "multi_step_agent": (
        "Must show 3 distinct tool calls using different functions. "
        "Preferred sequence: web_search → scrape_page → summarize. "
        "Must end with 'Final: [answer]'. "
        "Score based on: correct tool names, distinct calls, final answer present."
    ),
    "json_schema": (
        "Must be valid JSON Schema (parseable JSON). "
        "Must define: apiVersion as string required, "
        "kind as string required with enum [Deployment, Service, ConfigMap], "
        "metadata as object required with name and namespace as string properties, "
        "spec as object required with additionalProperties allowed. "
        "Award marks proportionally to how many of these are correctly specified."
    ),
    # The three context answers restate facts from the sample document that
    # ensure_context_file() writes (project name, Phase 2 budget, vendor).
    # If that document text changes, these must change with it.
    "context_begin": "The project name is Project Aurora.",
    "context_middle": "The budget allocated to Phase 2 is $2.4 million.",
    "context_end": "The selected vendor is Nexora Systems (Vendor B).",
    # Criteria for the summary of the notes file embedded by
    # build_all_prompts(); there is no single canonical answer.
    "rag": (
        "A structured summary that covers the main topics in the provided notes. "
        "Should be under 200 words. "
        "Should preserve key facts without inventing new information. "
        "Do not penalise for including accurate details from the source."
    ),
}
|
||||
|
||||
# ============================================
|
||||
# JUDGE RUBRICS (per test — what to evaluate)
|
||||
# Criteria-based, not answer-anchored.
|
||||
# ============================================
|
||||
|
||||
# Per-test judging checklists. Each test id maps to the individual
# questions/instructions the judge should work through; they are joined
# with single spaces into one rubric string per test.
_RUBRIC_CHECKS = (
    (
        "reasoning",
        (
            "Check: Is the final number 27?",
            "Are the three calculation steps (120×0.30, 36×0.25, 36-9) present and correct?",
            "Is it within 150 words?",
        ),
    ),
    (
        "agent",
        (
            "Check each requirement:",
            "(1) At least 2 named GPU models?",
            "(2) VRAM and bandwidth compared?",
            "(3) 2026 pricing uncertainty acknowledged?",
            "(4) Benchmarking source named?",
            "(5) No GPU under 16GB VRAM recommended?",
            "Score 2 points per requirement met (max 10).",
            "Score 0 if any GPU under 16GB is recommended.",
        ),
    ),
    (
        "math",
        (
            "Check: Is the answer 5 minutes?",
            "Does the explanation correctly state that each machine's rate is independent of quantity?",
            "Is it within 100 words?",
        ),
    ),
    (
        "rag",
        (
            "Check: Does it cover the main topics from the notes?",
            "Is it under 200 words?",
            "Does it avoid inventing facts not in the source?",
            "Is it clearly structured?",
        ),
    ),
)

# Rubric text keyed by test id; only the tests listed above have a
# dedicated rubric.
JUDGE_RUBRICS = {test_id: " ".join(checks) for test_id, checks in _RUBRIC_CHECKS}
|
||||
|
||||
# Generic judging instructions that do not refer to any particular test.
# NOTE(review): presumably the fallback for test ids without an entry in
# JUDGE_RUBRICS — confirm against the judge code.
DEFAULT_RUBRIC = " ".join(
    (
        "Check whether the output correctly fulfils all requirements stated in the original prompt.",
        "Score based on correctness and completeness, not on style or verbosity beyond what the prompt requires.",
    )
)
|
||||
|
||||
|
||||
# ============================================
|
||||
# DYNAMIC PROMPT BUILDERS
|
||||
# ============================================
|
||||
|
||||
def _write_if_missing(target, content):
    """Create *target* containing *content* unless it already exists.

    The parent directory of *target* is created when needed, so the file can
    live wherever the configuration points. Opening with mode ``"x"`` makes
    the "only if missing" check atomic: an existing file is never truncated.
    The file is written as UTF-8 because the sample texts contain non-ASCII
    punctuation.
    """
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    try:
        with open(target, "x", encoding="utf-8") as f:
            f.write(content)
    except FileExistsError:
        # A user-supplied or previously generated file wins; leave it alone.
        return
    print(f" Created: {target}")


def _read_text_if_present(path):
    """Return the text of *path*, or ``None`` when the file does not exist."""
    try:
        # errors="replace": a sample file written by an earlier run under a
        # non-UTF-8 locale must not abort the whole benchmark on decode.
        with open(path, encoding="utf-8", errors="replace") as f:
            return f.read()
    except FileNotFoundError:
        return None


def ensure_context_file(path=None):
    """Write the sample "Project Aurora" report if it is not on disk yet.

    Args:
        path: Destination file. Defaults to ``CONTEXT_FILE`` from config.

    The document holds the facts that the context_begin / context_middle /
    context_end ground truths ask about (project name, Phase 2 budget,
    selected vendor). An existing file is left untouched.
    """
    target = CONTEXT_FILE if path is None else path
    content = """# Project Aurora — Strategic Initiative Report

## Executive Summary
Project Aurora is a digital transformation initiative launched January 2024.
Proposed by CTO Maria Chen. Budget: $8.7M over three years.

## Phase 2 — Cloud Migration
Phase 2 budget allocation: $2.4 million.

## Vendor Recommendation
Vendor A (CloudScale) — $1.8M, limited EU.
Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
Vendor C (PrimeHost) — $1.4M, no SOC2.
Vendor D (Stratos) — $2.8M, over budget.

Final recommendation: proceed with Vendor B (Nexora Systems).
"""
    _write_if_missing(target, content)


def ensure_rag_file(path=None):
    """Write the sample homelab notes used by the "rag" test if missing.

    Args:
        path: Destination file. Defaults to ``RAG_FILE`` from config.

    An existing file is left untouched, so custom notes can be supplied by
    placing a file at that location before the run.
    """
    target = RAG_FILE if path is None else path
    content = """# Homelab Infrastructure Notes

## K8s Cluster
- 4 nodes, Longhorn storage, Traefik ingress
- FluxCD for GitOps, prune: false on llm namespace
- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker

## Ollama VM
- hostname: chat.h0melab.uk, IP: 10.0.20.57
- GPU: RTX 5060 Ti 16GB, port 11434
- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b

## Services
- Gitea at gitea.int, SSH port 3333
- Netdata + VictoriaMetrics for monitoring
- Signal bot with Whisper for voice transcription
- wiki-processor auto-generates Obsidian wiki
"""
    _write_if_missing(target, content)


def build_all_prompts():
    """Return complete prompt dict including dynamic context and RAG prompts.

    Starts from a shallow copy of ``PROMPTS`` (the module-level dict is not
    mutated) and adds:

    * ``context_begin`` / ``context_middle`` / ``context_end`` — the same
      document-grounded instruction followed by one question each. They are
      added only when the context file can be read.
    * ``rag`` — a summarisation prompt with the notes file appended, or a
      fixed placeholder prompt when the notes file is missing.

    Returns:
        dict mapping test id to prompt text.
    """
    ensure_context_file()
    ensure_rag_file()
    prompts = dict(PROMPTS)

    # Context prompts
    context = _read_text_if_present(CONTEXT_FILE)
    if context is not None:
        base = (
            "Answer in ONE sentence only. "
            "Use ONLY information from the document below. "
            "Do not add explanation or context.\n\n"
            f"DOCUMENT:\n{context}\n\n"
        )
        prompts["context_begin"] = base + "QUESTION: What is the name of the project?"
        prompts["context_middle"] = base + "QUESTION: What budget was allocated to Phase 2?"
        prompts["context_end"] = base + "QUESTION: Which vendor was selected and what is their company name?"

    # RAG prompt
    rag_content = _read_text_if_present(RAG_FILE)
    if rag_content is None:
        prompts["rag"] = "Maximum 200 words. Summarize: No RAG file found."
    else:
        prompts["rag"] = (
            "Maximum 200 words. Summarize and structure the following notes. "
            "Preserve all specific facts (IPs, model names, service names). "
            "Do not add information not present in the notes.\n\n"
            + rag_content
        )

    return prompts
|
||||
Reference in New Issue
Block a user