RC: (add) python script files

2026-05-15 16:50:26 +01:00
parent 51e9389726
commit ab7875303e
9 changed files with 2350 additions and 0 deletions
--- a/prompts.py
+++ b/prompts.py
@@ -0,0 +1,388 @@
+"""
+benchmark_v4/prompts.py
+=======================
+All prompts and ground truths in one place.
+
+BENCHMARK PURPOSE (explicit):
+  This benchmark measures operational reliability for agentic and
+  automated pipelines. It rewards: format obedience, structured output
+  correctness, tool call precision, and hallucination resistance.
+  It intentionally penalises verbosity, creative deviation, and
+  formatting non-compliance. It is NOT a general intelligence benchmark.
+
+Changes in this version:
+  - Agent prompt rewritten with explicit constraints (better differentiation)
+  - Structured JSON upgraded to nested array (stronger discriminator)
+  - ALL_TESTS is now explicit ordered list (not dict.keys())
+  - Ground truths reformulated as criteria (avoid judge anchoring)
+  - JSON Schema validator criteria made explicit
+"""
+
+import os
+from config import CONTEXT_FILE, RAG_FILE
+
+# ============================================
+# STATIC PROMPTS
+# ============================================
+
+PROMPTS = {  # test name -> static prompt text; build_all_prompts() adds "rag" and "context_*"
+    "reasoning": """Be concise. Maximum 150 words.
+
+Solve step by step:
+A company has 120 employees.
+30% are remote. Of those, 25% are contractors.
+How many full-time remote employees?""",  # expected answer: 27 (GROUND_TRUTHS["reasoning"])
+
+    "coding": """Be concise. Maximum 150 words.
+
+Write a Python function that returns the Longest Increasing Subsequence.
+Explain time complexity.""",
+
+    # UPGRADED: five numbered constraints that JUDGE_RUBRICS["agent"] checks one by one
+    "agent": """Maximum 120 words. You must:
+1. Identify 2-3 specific GPU candidates by name
+2. Compare their VRAM capacity and memory bandwidth
+3. Acknowledge 2026 pricing uncertainty explicitly
+4. Name at least one benchmarking source (e.g. LLM Perf, VRAM calculator)
+5. Exclude any GPU with less than 16GB VRAM
+
+Task: Find the best GPU under 500€ for running local LLMs in 2026.""",
+
+    "math": """Be concise. Maximum 100 words.
+
+Solve step by step:
+If 3 machines take 5 minutes to make 3 widgets,
+how long would 100 machines take to make 100 widgets?""",  # expected answer: 5 minutes (GROUND_TRUTHS["math"])
+
+    "hallucination": """Be concise. Maximum 100 words.
+
+Does the book "The Silent Code of Mars" by John Kessler (2022) exist?
+If you are not 100% certain, say so clearly.
+Do NOT describe or summarise it if you have any doubt.""",  # GROUND_TRUTHS["hallucination"] treats the book as non-existent
+
+    # UPGRADED: nested structure, typed fields — much stronger discriminator
+    "structured": """Return ONLY valid JSON. No explanation. No markdown fences.
+
+{
+  "recommendations": [
+    {
+      "gpu": "",
+      "price_eur": 0,
+      "vram_gb": 0,
+      "pros": [],
+      "cons": []
+    }
+  ]
+}
+
+Question: Best GPU under 500€ for local LLMs.
+Return exactly 2 recommendations.""",
+
+    "tool_calling": """You have: web_search(query: str), scrape_page(url: str), calculate(expression: str)
+
+Return ONLY the single tool call needed:
+"What is the best local LLM for 16GB VRAM?"
+
+Example format: web_search("your query here")
+No explanation. No other text.""",
+
+    "compression": """Compress this into EXACTLY 10 bullet points. Start each with "- ".
+Preserve key statistics. No extra text before or after the bullets.
+
+AI improved: healthcare (cancer detection = radiologist accuracy, drug discovery months not years),
+finance (30% fraud reduction), transport (15-20% fuel savings), manufacturing (40% downtime reduction),
+education (10-15% graduation improvement), energy (20-30% building savings), agriculture (25-30% water reduction).""",
+
+    "yaml_generation": """Return ONLY valid YAML. No explanation. No markdown fences.
+
+Create a Kubernetes Deployment:
+name is my-app
+image is nginx:1.25
+replicas is 2
+containerPort is 80
+memory limit is 256Mi
+cpu limit is 250m
+readinessProbe uses httpGet on path /healthz port 80""",
+
+    "artifact_mermaid": """Return ONLY a Mermaid flowchart code block (with opening and closing fences).
+No explanation before or after.
+
+Stages in order: Code Push, Lint, Unit Tests, Build, Integration Tests, Deploy Staging, Smoke Test, Deploy Production""",
+
+    "multi_step_agent": """Tools: web_search(query: str), scrape_page(url: str), summarize(text: str)
+
+Show exactly 3 chained tool calls then a final answer for:
+"Top 3 most downloaded Python packages this month"
+
+Format:
+1. web_search("...")
+2. scrape_page("...")
+3. summarize("...")
+Final: [answer]""",
+
+    "json_schema": """Return ONLY valid JSON Schema. No explanation.
+
+Schema for:
+- apiVersion: string, required
+- kind: string, required, enum: [Deployment, Service, ConfigMap]
+- metadata: object, required, properties: name (string, required), namespace (string, required)
+- spec: object, required, additionalProperties: true""",
+}
+
+# ============================================
+# EXPLICIT TEST ORDERING
+# Never derive this list from PROMPTS.keys() — the order is fixed by hand
+# for CSV consistency and longitudinal comparisons.
+# ============================================
+
+ALL_TESTS = [  # same 16 names as the GROUND_TRUTHS keys and the prompts built by build_all_prompts()
+    # Reasoning (5%) — percentages are presumably category score weights; confirm in the scoring code
+    "reasoning",
+    "math",
+    "agent",  # NOTE(review): listed under Reasoning although an "Agent / Tool" group exists below — confirm
+    # Coding / Infrastructure (25%)
+    "coding",
+    "yaml_generation",
+    "artifact_mermaid",
+    "json_schema",
+    # Structured outputs (15%)
+    "structured",
+    "compression",
+    # Agent / Tool (25%)
+    "tool_calling",
+    "multi_step_agent",
+    # Hallucination (10%)
+    "hallucination",
+    # RAG / Context (20%) — these four prompts are file-backed and only exist after build_all_prompts()
+    "rag",
+    "context_begin",
+    "context_middle",
+    "context_end",
+]
+
+# ============================================
+# GROUND TRUTHS — criteria-based, not canonical
+# Avoid embedding exact phrasing to prevent
+# judge anchoring and over-literal scoring.
+# ============================================
+
+GROUND_TRUTHS = {  # test name -> reference criteria text; has an entry for every name in ALL_TESTS
+    "reasoning": (
+        "Correct answer is 27 full-time remote employees. "
+        "Verify: calculation uses 30% of 120 = 36 remote, "
+        "then 25% of 36 = 9 contractors, so 36-9 = 27. "
+        "Award full marks if logic is correct even if phrased differently."
+    ),
+    "coding": (
+        "A working Python function that implements LIS. "
+        "Should mention O(n²) for basic DP approach. "
+        "Bonus if O(n log n) with binary search is mentioned. "
+        "Do not penalise for code style choices."
+    ),
+    "agent": (
+        "Must name at least 2 specific GPUs (e.g. RTX 4060 Ti, RX 7600 XT). "
+        "Must compare VRAM — only GPUs with 16GB+ should be recommended. "  # NOTE(review): prompt item 2 and the judge rubric also require memory bandwidth
+        "Must acknowledge pricing uncertainty for 2026. "
+        "Must name a benchmarking source. "
+        "Score 0 if recommends GPUs under 16GB VRAM."
+    ),
+    "math": (
+        "Correct answer is 5 minutes. "
+        "Key insight: each machine independently makes 1 widget in 5 minutes, "
+        "so 100 machines make 100 widgets in the same 5 minutes. "
+        "Award marks if the rate-independence reasoning is clearly stated."
+    ),
+    "hallucination": (
+        "The book does not exist. "
+        "Full marks: model refuses or clearly states it cannot verify existence. "
+        "Zero marks: model describes the book's plot, themes, or content as if real. "
+        "Partial marks: model hedges without clear refusal."
+    ),
+    "structured": (
+        "Must return valid JSON with a 'recommendations' array containing exactly 2 objects. "
+        "Each object must have: gpu (string), price_eur (number), vram_gb (number), "
+        "pros (array of strings), cons (array of strings). "
+        "Score based on: valid JSON structure, correct field types, 2 recommendations present. "
+        "Do not score on quality of GPU choices."
+    ),
+    "tool_calling": (
+        "Must return exactly one function call in the format: name(\"query\"). "
+        "No explanation before or after. "
+        "Correct function names: web_search, scrape_page, or calculate. "  # the three tools offered by the tool_calling prompt
+        "Score 0 if any text accompanies the call."
+    ),
+    "compression": (
+        "Must have exactly 10 bullet points starting with '- '. "
+        "All 7 industries must appear: healthcare, finance, transport, "
+        "manufacturing, education, energy, agriculture. "
+        "Key statistics must be preserved where mentioned in source."
+    ),
+    "yaml_generation": (
+        "Must be parseable YAML. "
+        "Must include: kind=Deployment, name=my-app, image=nginx:1.25, "
+        "replicas=2, containerPort=80, memory limit 256Mi, cpu limit 250m, "
+        "readinessProbe httpGet /healthz port 80. "
+        "Do not penalise for additional valid YAML fields not specified."
+    ),
+    "artifact_mermaid": (
+        "Must be a valid Mermaid code block with opening and closing fences. "
+        "Must include all 8 stages: Code Push, Lint, Unit Tests, Build, "
+        "Integration Tests, Deploy Staging, Smoke Test, Deploy Production. "
+        "Stages should appear in the correct pipeline order."
+    ),
+    "multi_step_agent": (
+        "Must show 3 distinct tool calls using different functions. "
+        "Preferred sequence: web_search → scrape_page → summarize. "
+        "Must end with 'Final: [answer]'. "
+        "Score based on: correct tool names, distinct calls, final answer present."
+    ),
+    "json_schema": (
+        "Must be valid JSON Schema (parseable JSON). "
+        "Must define: apiVersion as string required, "
+        "kind as string required with enum [Deployment, Service, ConfigMap], "
+        "metadata as object required with name and namespace as string properties, "  # NOTE(review): the prompt also marks name and namespace as required
+        "spec as object required with additionalProperties allowed. "
+        "Award marks proportionally to how many of these are correctly specified."
+    ),
+    "context_begin":   "The project name is Project Aurora.",  # context_* answers match the sample document written by ensure_context_file()
+    "context_middle":  "The budget allocated to Phase 2 is $2.4 million.",
+    "context_end":     "The selected vendor is Nexora Systems (Vendor B).",
+    "rag": (
+        "A structured summary that covers the main topics in the provided notes. "
+        "Should be under 200 words. "  # NOTE(review): the rag prompt says "Maximum 200 words"; exactly 200 is ambiguous
+        "Should preserve key facts without inventing new information. "
+        "Do not penalise for including accurate details from the source."
+    ),
+}
+
+# ============================================
+# JUDGE RUBRICS (per test — what to evaluate)
+# Criteria-based, not answer-anchored.
+# ============================================
+
+JUDGE_RUBRICS = {  # test name -> judge checklist; only reasoning, agent, math and rag have a specific one
+    "reasoning": (
+        "Check: Is the final number 27? Are the three calculation steps "
+        "(120×0.30, 36×0.25, 36-9) present and correct? Is it within 150 words?"  # 150 = word limit in the reasoning prompt
+    ),
+    "agent": (
+        "Check each requirement: "  # (1)-(5) below mirror the five numbered constraints in the agent prompt
+        "(1) At least 2 named GPU models? "
+        "(2) VRAM and bandwidth compared? "
+        "(3) 2026 pricing uncertainty acknowledged? "
+        "(4) Benchmarking source named? "
+        "(5) No GPU under 16GB VRAM recommended? "
+        "Score 2 points per requirement met (max 10). "
+        "Score 0 if any GPU under 16GB is recommended."  # hard override of the per-requirement points
+    ),
+    "math": (
+        "Check: Is the answer 5 minutes? "
+        "Does the explanation correctly state that each machine's rate "
+        "is independent of quantity? Is it within 100 words?"  # 100 = word limit in the math prompt
+    ),
+    "rag": (
+        "Check: Does it cover the main topics from the notes? "
+        "Is it under 200 words? "
+        "Does it avoid inventing facts not in the source? "
+        "Is it clearly structured?"
+    ),
+}
+
+DEFAULT_RUBRIC = (  # presumably the fallback for tests with no JUDGE_RUBRICS entry — confirm in the judge code
+    "Check whether the output correctly fulfils all requirements stated "
+    "in the original prompt. Score based on correctness and completeness, "
+    "not on style or verbosity beyond what the prompt requires."
+)
+
+
+# ============================================
+# DYNAMIC PROMPT BUILDERS
+# ============================================
+
+def _write_sample_file(path, content):
+    """Ensure *path*'s directory exists, then write *content* as UTF-8 unless the file is already there."""
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)  # "" (bare filename) means the current directory
+    if os.path.exists(path):
+        return
+    with open(path, "w", encoding="utf-8") as f:  # samples hold non-ASCII text; do not depend on the locale
+        f.write(content)
+    print(f"  Created: {path}")
+
+
+def ensure_context_file():
+    """Write the sample Project Aurora report to CONTEXT_FILE if that file is missing."""
+    _write_sample_file(CONTEXT_FILE, """# Project Aurora — Strategic Initiative Report
+
+## Executive Summary
+Project Aurora is a digital transformation initiative launched January 2024.
+Proposed by CTO Maria Chen. Budget: $8.7M over three years.
+
+## Phase 2 — Cloud Migration
+Phase 2 budget allocation: $2.4 million.
+
+## Vendor Recommendation
+Vendor A (CloudScale) — $1.8M, limited EU.
+Vendor B (Nexora Systems) — $2.1M, 98% SLA, global.
+Vendor C (PrimeHost) — $1.4M, no SOC2.
+Vendor D (Stratos) — $2.8M, over budget.
+
+Final recommendation: proceed with Vendor B (Nexora Systems).
+""")
+
+
+def ensure_rag_file():
+    """Write the sample homelab notes to RAG_FILE if that file is missing."""
+    _write_sample_file(RAG_FILE, """# Homelab Infrastructure Notes
+
+## K8s Cluster
+- 4 nodes, Longhorn storage, Traefik ingress
+- FluxCD for GitOps, prune: false on llm namespace
+- Namespace llm: Open-WebUI, SearXNG, Crawl4AI, BGE reranker
+
+## Ollama VM
+- hostname: chat.h0melab.uk, IP: 10.0.20.57
+- GPU: RTX 5060 Ti 16GB, port 11434
+- Models: granite4.1:8b, qwen2.5-coder:14b, gemma4:e4b, nemotron-3-nano:4b
+
+## Services
+- Gitea at gitea.int, SSH port 3333
+- Netdata + VictoriaMetrics for monitoring
+- Signal bot with Whisper for voice transcription
+- wiki-processor auto-generates Obsidian wiki
+""")
+
+
+def build_all_prompts():
+    """Return a copy of PROMPTS extended with the file-backed "rag" and "context_*" prompts."""
+    ensure_context_file()
+    ensure_rag_file()
+    prompts = dict(PROMPTS)  # copy, so the module-level dict is never mutated
+
+    # Context prompts: one shared document, three questions (in the sample, answered at its start, middle and end)
+    if os.path.exists(CONTEXT_FILE):  # NOTE: no else-branch — without the file the context_* keys are absent
+        with open(CONTEXT_FILE, encoding="utf-8") as f:  # same encoding the sample is written with
+            context = f.read()
+        base = (
+            "Answer in ONE sentence only. "
+            "Use ONLY information from the document below. "
+            "Do not add explanation or context.\n\n"
+            f"DOCUMENT:\n{context}\n\n"
+        )
+        prompts["context_begin"]  = base + "QUESTION: What is the name of the project?"
+        prompts["context_middle"] = base + "QUESTION: What budget was allocated to Phase 2?"
+        prompts["context_end"]    = base + "QUESTION: Which vendor was selected and what is their company name?"
+
+    # RAG prompt: summarise the notes file; the else-branch keeps the "rag" key present if the file is missing
+    if os.path.exists(RAG_FILE):
+        with open(RAG_FILE, encoding="utf-8") as f:  # same encoding the sample is written with
+            rag_content = f.read()
+        prompts["rag"] = (
+            "Maximum 200 words. Summarize and structure the following notes. "
+            "Preserve all specific facts (IPs, model names, service names). "
+            "Do not add information not present in the notes.\n\n"
+            + rag_content
+        )
+    else:
+        prompts["rag"] = "Maximum 200 words. Summarize: No RAG file found."
+
+    return prompts