RC: (add) python script files
This commit is contained in:
467
validators.py
Normal file
467
validators.py
Normal file
@@ -0,0 +1,467 @@
|
||||
"""
|
||||
benchmark_v4/validators.py
|
||||
==========================
|
||||
Layer 1: Deterministic validators.
|
||||
No LLM judge needed. Returns (score 0-10, notes str).
|
||||
A score of 0 or 10 is definitive — judge is skipped.
|
||||
Partial scores (1-9) trigger judge blending.
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
try:
|
||||
import yaml
|
||||
YAML_AVAILABLE = True
|
||||
except ImportError:
|
||||
YAML_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
FUZZY_AVAILABLE = True
|
||||
except ImportError:
|
||||
FUZZY_AVAILABLE = False
|
||||
|
||||
|
||||
# ============================================
|
||||
# TEXT NORMALIZATION
|
||||
# ============================================
|
||||
|
||||
def normalize_text(text, mode="plain"):
    """
    Centralized text cleaning for raw model output.

    mode="plain" — strip ANSI, control chars, ollama spinner/stats, thinking tokens
    mode="json"  — plain + strip markdown fences, then trim
    mode="yaml"  — plain + strip markdown fences, then trim

    Returns the cleaned string. Only the "json"/"yaml" modes trim
    leading/trailing whitespace from the result.
    """
    # 1. Strip ANSI escape sequences FIRST
    text = re.sub(r'\x1b(?:[@-Z\\-_]|\[[0-9;?]*[A-Za-z])', '', text)

    # 2. Strip control characters (tab, newline and carriage return are kept)
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # 3. Strip Ollama spinner/progress characters
    text = re.sub(r'[⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏]+', '', text)

    # 4. Normalize Unicode spaces to regular spaces
    text = text.replace('\u202f', ' ').replace('\u00a0', ' ').replace('\u2009', ' ')

    # 5. Strip thinking tokens (AFTER cleaning so regex works cleanly)
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = re.sub(r'Thinking\.\.\..*?\.\.\.done thinking\.?', '', text, flags=re.DOTALL)

    # 6. Strip Ollama verbose stats (LAST — after all other cleanup).
    #    Any line containing one of these markers is dropped entirely.
    stat_markers = (
        "total duration:", "load duration:", "prompt eval",
        "eval count:", "eval duration:", "eval rate:", "tokens/s", "token(s)",
    )
    text = "\n".join(
        line for line in text.split("\n")
        if not any(marker in line.lower() for marker in stat_markers)
    )

    if mode in ("json", "yaml"):
        # Opening fences: accept the common tags in any letter case
        # (```json, ```JSON, ```yaml, ```yml) as well as a bare ```.
        text = re.sub(r'^```(?:json|yaml|yml)?\s*', '', text,
                      flags=re.MULTILINE | re.IGNORECASE)
        text = re.sub(r'```\s*$', '', text, flags=re.MULTILINE)
        # Drop leftover terminal-mode fragments ("[?25l" etc.) and indented
        # fences. Control characters and ESC were already removed in steps
        # 1-2, so no per-line check for them is needed here.
        kept = [
            line for line in text.split('\n')
            if not line.strip().startswith(('[?', '```'))
        ]
        text = '\n'.join(kept).strip()

    return text
|
||||
|
||||
|
||||
# ============================================
|
||||
# JSON EXTRACTION
|
||||
# ============================================
|
||||
|
||||
def extract_json_object(text):
    """
    Advanced JSON extractor that handles prompt-echoing,
    large whitespace blocks, and multiple JSON objects.

    Returns the selected dict, or None when no JSON object can be decoded.

    Selection order:
      1. last object with a populated "recommendations" list, scanning from
         the heuristic start (just before the last "recommendations" key);
      2. the same, scanning the whole text — covers answers where the key is
         not the first member, or is mentioned again in trailing prose, so
         the heuristic start lands inside the real object;
      3. the last object found by the heuristic scan (or by the full scan
         when the heuristic scan found nothing).
    """
    # 1. Aggressive normalization to strip fences and leading/trailing junk
    text = normalize_text(text, mode="json")

    # 2. Collapse newlines inside JSON strings — fixes mid-value line breaks
    text = re.sub(r'\n\s*', ' ', text)

    # 3. Skip the prompt-echo/template if the model repeats it.
    search_start = 0
    last_keyword_pos = text.rfind('"recommendations"')
    if last_keyword_pos != -1:
        search_start = max(text.rfind('{', 0, last_keyword_pos), 0)

    # 4. Parse every decodable object from the heuristic start onwards
    candidates = _scan_json_objects(text, search_start)

    # 5. Prefer a populated answer over an empty template
    populated = _last_populated_recommendations(candidates)
    if populated is not None:
        return populated

    if search_start > 0:
        # The heuristic start may sit inside the real answer; retry from 0.
        everything = _scan_json_objects(text, 0)
        populated = _last_populated_recommendations(everything)
        if populated is not None:
            return populated
        if not candidates:
            candidates = everything

    return candidates[-1] if candidates else None


def _scan_json_objects(text, start=0):
    """Return every dict decodable at a '{' in text[start:], in order."""
    decoder = json.JSONDecoder()
    found = []
    idx = start
    while idx < len(text):
        brace = text.find('{', idx)
        if brace == -1:
            break
        try:
            obj, end = decoder.raw_decode(text, brace)
        except json.JSONDecodeError:
            # Not a valid object here; try the next '{'.
            idx = brace + 1
            continue
        if isinstance(obj, dict):
            found.append(obj)
        idx = end
    return found


def _last_populated_recommendations(objs):
    """Return the last dict whose "recommendations" list has an entry with a truthy "gpu"."""
    for obj in reversed(objs):
        recs = obj.get("recommendations")
        if isinstance(recs, list) and any(
            isinstance(rec, dict) and rec.get("gpu") for rec in recs
        ):
            return obj
    return None
|
||||
|
||||
|
||||
# ============================================
|
||||
# VALIDATORS
|
||||
# ============================================
|
||||
|
||||
def validate_tool_calling(text):
    """
    Score a response that should consist of a single tool call and nothing else.

    Returns (score, notes):
      0  — more than three non-blank lines, or no call-like text at all
      10 — a call to web_search / scrape_page / calculate with a quoted argument
      5  — a quoted-argument call to some other function name
    """
    cleaned = normalize_text(text)

    # Count non-blank lines; more than three means the model added commentary.
    non_blank = sum(1 for row in cleaned.split('\n') if row.strip())
    if non_blank > 3:
        return 0, "multiple lines — explanation added"

    # Patterns are tried in order; the first hit decides the score.
    graded_patterns = (
        (r'(web_search|scrape_page|calculate)\s*\(["\'].*["\']\)',
         10, "valid tool call syntax"),
        (r'\w+\s*\(["\'].*["\']\)',
         5, "function call but wrong name"),
    )
    for pattern, score, note in graded_patterns:
        if re.search(pattern, cleaned):
            return score, note

    return 0, "no valid function call found"
|
||||
|
||||
|
||||
def validate_yaml(text):
    """
    Must parse as valid YAML Deployment.

    Returns (score, notes):
      5  — PyYAML is not installed, so nothing can be checked
      0  — the text is not parseable YAML
      3  — parsed, but the top level is not a mapping
      otherwise 2 base points, +2 for kind == 'Deployment', +2 for a 'spec'
      key, +1 for spec.replicas == 2, +1 for 'apiVersion'; a full 8 is
      promoted to 10.
    """
    if not YAML_AVAILABLE:
        return 5, "pyyaml not installed"

    # mode="yaml" already strips fences and leftover terminal fragments,
    # so no further line filtering is needed before parsing.
    text = normalize_text(text, mode="yaml")

    try:
        parsed = yaml.safe_load(text)
    except yaml.YAMLError as e:
        return 0, f"invalid YAML: {str(e)[:60]}"

    if not isinstance(parsed, dict):
        return 3, "parsed but not a dict"

    score = 2
    if parsed.get('kind') == 'Deployment':
        score += 2
    if 'spec' in parsed:
        score += 2
        spec = parsed['spec']
        # 'spec' may be null or a scalar in malformed output; only a mapping
        # can carry 'replicas'.
        if isinstance(spec, dict) and spec.get('replicas') == 2:
            score += 1
    if 'apiVersion' in parsed:
        score += 1
    if score >= 8:
        score = 10

    return min(score, 10), f"valid YAML score={score}"
|
||||
|
||||
|
||||
def validate_json_output(text):
    """
    Nested structured JSON with recommendations array.
    Expected: {"recommendations": [{"gpu":"","price_eur":0,"vram_gb":0,"pros":[],"cons":[]}]}
    Scores based on: valid JSON, correct structure, field types, 2 recommendations.

    Returns (score, notes). Field and type checks look at the first
    recommendation only.
    """
    parsed = extract_json_object(text)
    if parsed is None:
        return 0, "no valid JSON object found"

    # Check top-level structure
    if "recommendations" not in parsed:
        # Fallback: old flat format still gets partial credit
        old_fields = ("gpu", "price", "reason")
        if any(k in parsed and str(parsed[k]).strip() for k in old_fields):
            return 4, "flat JSON found (old format), missing nested structure"
        return 0, "no recommendations array found"

    recs = parsed["recommendations"]
    if not isinstance(recs, list) or not recs:
        return 2, "recommendations present but empty or not a list"

    required_fields = {"gpu", "price_eur", "vram_gb", "pros", "cons"}
    score = 4  # base for having recommendations array

    # Check count: two or more earns +2, a single entry +1
    score += 2 if len(recs) >= 2 else 1

    first = recs[0]
    if not isinstance(first, dict):
        # e.g. a list of plain strings: no fields or types to inspect
        return score, f"nested JSON: {len(recs)} recs, first entry is not an object"

    # Check field completeness on first recommendation
    present = required_fields & set(first)
    score += int((len(present) / len(required_fields)) * 3)

    def _is_number(value):
        # bool is a subclass of int; JSON true/false is not a valid number here
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Check type correctness
    type_ok = (
        _is_number(first.get("price_eur"))
        and _is_number(first.get("vram_gb"))
        and isinstance(first.get("pros"), list)
        and isinstance(first.get("cons"), list)
    )
    if type_ok:
        score += 1

    score = min(score, 10)
    # sorted() keeps the notes stable across runs (set order is not)
    return score, f"nested JSON: {len(recs)} recs, fields={sorted(present)}, types_ok={type_ok}"
|
||||
|
||||
|
||||
def validate_json_schema(text):
    """
    Valid JSON Schema with all required properties.

    Returns (score, notes). Points: 'apiVersion' property +2; 'kind' property
    +3 when its enum covers Deployment/Service/ConfigMap, else +1;
    'metadata' +2; 'spec' +2; a non-empty top-level 'required' +1.
    """
    parsed = extract_json_object(text)
    if parsed is None:
        return 0, "no valid JSON Schema found"

    # 'properties' may be missing, null, or the wrong type in model output
    props = parsed.get('properties')
    if not isinstance(props, dict):
        props = {}

    score = 0

    if 'apiVersion' in props:
        score += 2
    if 'kind' in props:
        kind_schema = props['kind']
        enum_values = kind_schema.get('enum') if isinstance(kind_schema, dict) else None
        # Compare string members only, so unhashable enum entries cannot raise
        has_enum = isinstance(enum_values, list) and (
            {'Deployment', 'Service', 'ConfigMap'}
            <= {v for v in enum_values if isinstance(v, str)}
        )
        score += 3 if has_enum else 1
    if 'metadata' in props:
        score += 2
    if 'spec' in props:
        score += 2
    if parsed.get('required'):
        score += 1

    return min(score, 10), f"JSON Schema score={score}/10"
|
||||
|
||||
|
||||
def validate_mermaid(text):
    """
    Check for a fenced Mermaid diagram that names all 8 pipeline stages.

    Returns (score, notes): 2 when no suitable fence is present, otherwise
    the fraction of stage names found (case-insensitive substring match)
    scaled to 0-10 and truncated.
    """
    cleaned = normalize_text(text)
    lowered = cleaned.lower()

    expected_stages = (
        "code push", "lint", "unit test", "build",
        "integration test", "deploy staging", "smoke test", "deploy production",
    )

    # Either an explicit ```mermaid fence, or any fence plus a diagram keyword.
    if '```mermaid' in lowered:
        fenced = True
    else:
        mentions_diagram = 'graph' in lowered or 'flowchart' in lowered
        fenced = '```' in cleaned and mentions_diagram

    if not fenced:
        return 2, "no mermaid fence found"

    hits = [stage for stage in expected_stages if stage in lowered]
    found = len(hits)
    total = len(expected_stages)
    return int((found / total) * 10), f"{found}/{total} stages found"
|
||||
|
||||
|
||||
def validate_compression(text):
    """
    Exactly 10 bullets, all 7 industries.

    Only lines that start with "- " (after trimming) count as bullets.
    Returns (score, notes): a base of 5 / 3 / 2 for a bullet count of
    exactly 10 / 8-12 / 7 or 13, plus one point per industry keyword found,
    capped at 10. Any other bullet count scores 0.
    """
    text = normalize_text(text)
    bullets = [
        line for line in text.strip().split('\n')
        if line.strip().startswith('- ')
    ]
    count = len(bullets)

    # Stems rather than full words so "finance"/"financial" etc. both match
    industries = ["healthcare", "financ", "transport", "manufactur",
                  "education", "energy", "agricultur"]
    text_lower = text.lower()
    industry_count = sum(1 for stem in industries if stem in text_lower)

    if count == 10:
        score = 5 + industry_count
    elif 8 <= count <= 12:
        score = 3 + industry_count
    elif count in (7, 13):
        score = 2 + industry_count
    else:
        # Four or more bullets away from the target: no credit, regardless
        # of industry coverage.
        score = 0

    return min(score, 10), f"{count} bullets, {industry_count}/7 industries"
|
||||
|
||||
|
||||
def validate_multi_step(text):
    """
    3 distinct tool calls + final answer.

    A call is identified by (tool name, quoted argument), so the same tool
    invoked with different arguments counts as separate calls.

    Returns (score, notes):
      10 — at least 3 distinct calls, at least 2 different tools, final marker
      7  — at least 2 distinct calls and a final marker
      4  — at least 1 well-formed call
      0  — no well-formed call
    """
    text = normalize_text(text)

    # Two capture groups, so findall yields (tool, argument) pairs; with a
    # single group it would yield tool names only and collapse repeat uses
    # of one tool into a single "call".
    tool_calls = re.findall(
        r'(web_search|scrape_page|summarize)\s*\(["\']([^"\']*)["\']\)', text
    )
    unique_tools = set(re.findall(r'(web_search|scrape_page|summarize)\s*\(', text))
    has_final = bool(re.search(r'final\s*[:.]', text, re.IGNORECASE))
    distinct = len(set(tool_calls))

    if distinct >= 3 and len(unique_tools) >= 2 and has_final:
        return 10, f"{distinct} distinct calls, final answer present"
    elif distinct >= 2 and has_final:
        if distinct >= 3:
            # Enough calls, but every one of them used the same tool
            return 7, f"{distinct} distinct calls but only {len(unique_tools)} tool type(s)"
        return 7, f"only {distinct} distinct tool calls"
    elif distinct >= 1:
        return 4, "tool calls found but incomplete"
    return 0, "no valid tool calls found"
|
||||
|
||||
|
||||
def validate_hallucination(text):
    """
    The book does not exist.
    Reward uncertainty. Penalize confident invented specifics.

    Returns (score, notes):
      0  — invented-content phrasing with no refusal phrasing
      10 — refusal phrasing with no invented-content phrasing
      5  — both kinds present, or neither (left for the judge to resolve)

    Several content phrases ("this book", "the novel", "in the book") also
    occur naturally inside refusals such as "I'm not aware of this book", so
    a response showing both kinds of signal is not scored definitively.
    """
    # Fold typographic apostrophes so "doesn’t" matches "doesn't".
    text_lower = text.lower().replace('\u2019', "'").replace('\u2018', "'")

    # Good refusal signals
    refuse_signals = [
        "does not exist", "doesn't exist", "not exist",
        "cannot verify", "can't verify", "cannot confirm",
        "not aware", "no record", "not familiar",
        "unable to find", "could not find", "not in my",
        "not certain", "uncertain", "cannot find",
        "don't have", "do not have", "not sure",
        "may not exist", "might not exist", "no evidence",
        "i cannot", "i can't", "i don't know",
        "fictional", "fabricated", "no such book",
        "not able to verify", "cannot be verified",
        "not a real", "i'm not aware", "i am not aware",
    ]

    # Hard hallucination signals — confident invented content
    hallucination_signals = [
        "the book explores", "kessler argues", "the main ideas",
        "the novel", "in the book", "the author presents",
        "the story follows", "themes include", "the book examines",
        "kessler's work", "the book describes",
        "this book", "the plot", "the protagonist",
    ]

    # First matching phrase of each kind, in list order (None when absent)
    invented = next((s for s in hallucination_signals if s in text_lower), None)
    refused = next((s for s in refuse_signals if s in text_lower), None)

    if invented is not None and refused is not None:
        return 5, f"mixed signals: refusal '{refused}' with content phrase '{invented}'"
    if invented is not None:
        return 0, f"hallucinated: '{invented}' found"
    if refused is not None:
        return 10, f"correctly refused: '{refused}'"

    return 5, "ambiguous — no clear refusal or hallucination"
|
||||
|
||||
|
||||
def validate_coding(text):
    """
    Python function with LIS and complexity.

    Purely textual checks: a 'def ', a 'return', a mention of the longest
    increasing subsequence, and a mention of complexity.

    Returns (score, notes): 10 when all four are present, otherwise
    3 (def) + 1 (return) + 2 (LIS) + 2 (complexity), capped at 9.
    """
    text = normalize_text(text)
    lowered = text.lower()

    has_def = 'def ' in text
    has_return = 'return' in text
    # "lis" must not be flanked by letters, otherwise every "list" in the
    # code would count as a mention. Identifiers such as lis_length or
    # compute_lis still match.
    has_lis = (
        'subsequence' in lowered
        or 'longest' in lowered
        or re.search(r'(?<![a-z])lis(?![a-z])', lowered) is not None
    )
    has_complexity = any(
        w in lowered for w in ['o(n', 'n log n', 'n²', 'n^2', 'complexity']
    )

    if has_def and has_return and has_lis and has_complexity:
        return 10, "function correct with complexity"

    score = (3 if has_def else 0) + (1 if has_return else 0) + \
            (2 if has_lis else 0) + (2 if has_complexity else 0)

    return min(score, 9), f"def={has_def} lis={has_lis} complexity={has_complexity}"
|
||||
|
||||
|
||||
def validate_context(text, expected_phrase):
    """
    Score how well the response contains expected_phrase.

    A case-insensitive substring hit scores 10. Otherwise, when rapidfuzz is
    available, the better of partial_ratio and token_set_ratio is mapped to
    a score through fixed thresholds. Without rapidfuzz, the score is the
    share of expected words longer than three characters found in the text.

    Returns (score, notes).
    """
    haystack = normalize_text(text).lower()
    needle = expected_phrase.lower()

    # Exact match
    if needle in haystack:
        return 10, "exact match"

    if FUZZY_AVAILABLE:
        best = max(
            fuzz.partial_ratio(needle, haystack),
            fuzz.token_set_ratio(needle, haystack),
        )
        # (minimum similarity, score, label) — first satisfied tier wins
        tiers = (
            (90, 10, "fuzzy"),
            (80, 9, "fuzzy"),
            (70, 7, "partial"),
            (55, 5, "weak"),
        )
        for floor, score, label in tiers:
            if best >= floor:
                return score, f"{label} match {best}%"
        return max(0, int(best / 12)), f"poor match {best}%"

    # Fallback token matching
    key_words = [word for word in needle.split() if len(word) > 3]
    if not key_words:
        return 5, "no key words to match"
    matches = len([word for word in key_words if word in haystack])
    return int((matches / len(key_words)) * 10), f"{matches}/{len(key_words)} tokens"
|
||||
|
||||
def validate_agent(text):
    """
    Flag responses that mention a GPU model or VRAM size from the sub-16GB list.

    Returns (score, notes): 2 when any listed token is found, otherwise 7.
    Neither score is definitive, so the judge is always consulted.
    """
    text_lower = normalize_text(text).lower()
    sub_16gb = [
        "rtx 4070 ti", "rtx 4070", "rtx 3060", "rtx 3070",
        "rtx 4060", "rx 6700", "rx 7700", "rx 6600",
        "12gb", "10gb", "8gb vram",
    ]
    for gpu in sub_16gb:
        # Capacity tokens start with a digit and must not be the tail of a
        # larger number: "512gb" is not "12gb", "48gb vram" is not "8gb vram".
        # NOTE(review): model names remain plain substring matches, so e.g.
        # "rtx 4070 ti" also fires on longer names such as
        # "rtx 4070 ti super" — confirm that is intended.
        prefix = r'(?<!\d)' if gpu[0].isdigit() else ''
        if re.search(prefix + re.escape(gpu), text_lower):
            return 2, f"sub-16GB GPU found: '{gpu}'"
    # No bad GPU — let judge evaluate quality
    return 7, "no sub-16GB GPU — judge for quality"
|
||||
|
||||
# ============================================
|
||||
# DISPATCHER
|
||||
# ============================================
|
||||
|
||||
# Maps a benchmark test name to its deterministic validator. Every value is
# called with the raw model output as its only argument and returns
# (score, notes).
VALIDATOR_MAP = {
    "tool_calling": validate_tool_calling,
    "yaml_generation": validate_yaml,
    "structured": validate_json_output,
    "json_schema": validate_json_schema,
    "artifact_mermaid": validate_mermaid,
    "compression": validate_compression,
    "multi_step_agent": validate_multi_step,
    "hallucination": validate_hallucination,
    "coding": validate_coding,
    "agent": validate_agent,
    # The context tests share one validator; each lambda binds the phrase
    # that the response is expected to contain.
    "context_begin": lambda t: validate_context(t, "Project Aurora"),
    "context_middle": lambda t: validate_context(t, "2.4 million"),
    "context_end": lambda t: validate_context(t, "Nexora Systems"),
}
|
||||
|
||||
|
||||
def run_validator(test_name, raw_output):
    """
    Dispatch raw_output to the deterministic validator registered for test_name.

    Returns (score, skip_judge, notes).
    skip_judge is True only for the definitive scores 0 and 10.
    Tests without a registered validator yield (None, False, "no validator").
    """
    validator = VALIDATOR_MAP.get(test_name)
    if validator is None:
        return None, False, "no validator"

    score, notes = validator(raw_output)
    # Only the two extreme scores are treated as final.
    return score, score in (0, 10), notes
|
||||
Reference in New Issue
Block a user