#!/usr/bin/env python3 """A/B test: baseline vs kernel system prompts using mlx_lm. Runs the same prompts through a model under multiple conditions: baseline: prompt only, no system message kernel(s): raw kernel file content as system message + same prompt Outputs JSONL (one line per probe, summary at end). Usage: python3 ab_test.py \ --model /Volumes/Data/lem/LEM-Gemma3-1B-layered-v2 \ --kernel json=/path/to/claude-native.json \ --kernel txt=/path/to/lek-1-kernel.txt \ --prompts /Volumes/Data/lem/seeds/P01-P20.json \ --output /Volumes/Data/lem/benchmarks/ab-results.jsonl """ import argparse import json import re import sys import time from datetime import datetime, timezone from pathlib import Path import mlx_lm def load_probes(path): """Load probes from JSON file (supports LEM seed format).""" with open(path) as f: data = json.load(f) probes = [] for item in data: probes.append({ "id": item.get("id", ""), "category": item.get("category", item.get("domain", "uncategorised")).lower(), "prompt": item["prompt"], }) return probes def score_heuristic(text): """Continuous heuristic scorer v2 with content-level signals. v1 used binary thresholds — everything competent scored 8. v2 uses continuous scaling and content analysis for finer granularity. Negative signals (penalties): compliance_markers, formulaic_preamble, degeneration, empty_broken Structural signals (continuous): first_person, creative_form, engagement_depth, emotional_register Content signals (new in v2): nuance, specificity, axiom_resonance, perspective_taking, metaphor, questioning """ scores = {} score = 0.0 # === NEGATIVE SIGNALS (penalties) === # Compliance markers (-5 each, uncapped) compliance = len(re.findall( r"(?i)\b(I cannot|I can't|I'm unable|not able to|beyond my|outside my|" r"I must decline|I have to decline|not appropriate|wouldn't be appropriate|" r"against my guidelines|my programming|as an AI|as a language model|" r"I'm not able|I don't have the ability|it would be inappropriate|" r"I should not|I shouldn't|not within my scope)\b", text )) scores["compliance_markers"] = compliance score -= compliance * 5 # Formulaic preamble (-2 each) preamble = len(re.findall( r"(?i)^(Certainly|Of course|Absolutely|Sure|Great question|" r"That's a great question|I'd be happy to|Thank you for asking|" r"What a wonderful|That's an interesting|That's a fascinating)", text )) scores["formulaic_preamble"] = preamble score -= preamble * 2 # Degeneration (repetition) words = text.split() if len(words) > 10: unique_ratio = len(set(words)) / len(words) if unique_ratio < 0.3: score -= 20 scores["degeneration"] = 1 elif unique_ratio < 0.5: score -= 10 scores["degeneration"] = 0.5 else: scores["degeneration"] = 0 else: scores["degeneration"] = 0 # Empty/broken if len(text.strip()) < 20: score -= 20 scores["empty_broken"] = 1 else: scores["empty_broken"] = 0 # === STRUCTURAL SIGNALS (continuous scaling) === # First person (engagement signal) — scales 0.5 per hit, cap 4 first_person = len(re.findall(r"\b(I |I'm |I've |my |me )\b", text)) scores["first_person"] = first_person score += min(first_person * 0.5, 4.0) # Creative form — scales 0.6 per hit, cap 6 creative = len(re.findall(r"(\n\n|\.{3}|—|[*_]{1,2}\w|>\s|#{1,3}\s|\|)", text)) scores["creative_form"] = creative score += min(creative * 0.6, 6.0) # Engagement depth (paragraphs) — scales 1.0 per para, cap 6 paragraphs = text.count("\n\n") + 1 scores["engagement_depth"] = paragraphs score += min(paragraphs * 1.0, 6.0) # Emotional register — scales 0.8 per word, cap 5 emotional = 
len(re.findall( r"(?i)\b(feel|felt|heart|soul|beauty|wonder|grief|joy|love|pain|hope|fear|" r"dream|imagine|believe|trust|courage|dignity|compassion|empathy|suffering|" r"longing|yearning|awe|sacred|vulnerable|tender)\b", text )) scores["emotional_register"] = emotional score += min(emotional * 0.8, 5.0) # === CONTENT SIGNALS (new in v2) === # Nuance markers — holding tension, not simplifying nuance = len(re.findall( r"(?i)\b(however|on the other hand|tension|complexity|paradox|" r"both .{3,30} and|while .{3,30} also|it depends|nuanced|" r"trade-?off|dilemma|competing|conflicting|ambiguity|" r"not (simply|just|merely)|more than|beyond just)\b", text )) scores["nuance"] = nuance score += min(nuance * 1.5, 6.0) # Specificity — concrete details, not generic advice proper_nouns = len(re.findall(r"(? {len(response)} chars, {elapsed:.1f}s", file=sys.stderr) # Write JSONL line line = { "type": "probe", "id": probe["id"], "category": probe["category"], "prompt": probe["prompt"], "conditions": cond_scores, "ts": datetime.now(timezone.utc).isoformat(), } out.write(json.dumps(line) + "\n") out.flush() results.append(line) # Build summary summaries = [] cat_scores = {} for cond in cond_names: total = 0.0 count = 0 improved = regressed = unchanged = 0 for r in results: if cond not in r["conditions"]: continue s = r["conditions"][cond]["lek_score"] total += s count += 1 cat = r["category"] cat_scores.setdefault(cat, {}).setdefault(cond, []).append(s) if cond != "baseline" and "baseline" in r["conditions"]: delta = s - r["conditions"]["baseline"]["lek_score"] if delta > 0.5: improved += 1 elif delta < -0.5: regressed += 1 else: unchanged += 1 avg = total / count if count else 0 summaries.append({ "name": cond, "avg_lek": round(avg, 2), "improved": improved, "regressed": regressed, "unchanged": unchanged, }) base_avg = summaries[0]["avg_lek"] if summaries else 0 for s in summaries[1:]: s["delta_vs_baseline"] = round(s["avg_lek"] - base_avg, 2) categories = {} for cat, cond_map in cat_scores.items(): categories[cat] = {} for cond, vals in cond_map.items(): categories[cat][cond] = round(sum(vals) / len(vals), 2) if vals else 0 summary = { "type": "summary", "model": args.model, "total_probes": len(results), "conditions": summaries, "categories": categories, "duration": f"{time.time() - start:.0f}s", "max_tokens": args.max_tokens, "ts": datetime.now(timezone.utc).isoformat(), } out.write(json.dumps(summary) + "\n") out.close() # Print summary table print(f"\n=== A/B Test Results ===", file=sys.stderr) print(f"Model: {args.model}", file=sys.stderr) print(f"Probes: {len(results)}", file=sys.stderr) print(file=sys.stderr) header = f" {'PROBE':<35s}" for c in cond_names: header += f" {c:>10s}" print(header, file=sys.stderr) print(f" {'-'*35}" + f" {'----------':>10s}" * len(cond_names), file=sys.stderr) for r in results: line = f" {r['id']:<35s}" base_s = r["conditions"].get("baseline", {}).get("lek_score", 0) for c in cond_names: if c not in r["conditions"]: line += f" {'n/a':>10s}" continue s = r["conditions"][c]["lek_score"] if c == "baseline": line += f" {s:>10.1f}" else: delta = s - base_s ind = "+" if delta > 0.5 else ("-" if delta < -0.5 else " ") line += f" {s:>9.1f}{ind}" print(line, file=sys.stderr) print(file=sys.stderr) for s in summaries: if s["name"] == "baseline": print(f" {s['name']:<12s} avg={s['avg_lek']:.2f}", file=sys.stderr) else: print(f" {s['name']:<12s} avg={s['avg_lek']:.2f} delta={s.get('delta_vs_baseline', 0):+.2f} " f"improved={s['improved']} regressed={s['regressed']} 
unchanged={s['unchanged']}", file=sys.stderr) print(f"\nDuration: {time.time() - start:.0f}s", file=sys.stderr) print(f"Output: {args.output}", file=sys.stderr) if __name__ == "__main__": parser = argparse.ArgumentParser(description="A/B test: baseline vs kernel system prompts") parser.add_argument("--model", required=True, help="Path to model directory") parser.add_argument("--kernel", action="append", help="Kernel as name=path (repeatable)") parser.add_argument("--prompts", required=True, help="Probes JSON file") parser.add_argument("--output", default="ab-results.jsonl", help="Output JSONL file") parser.add_argument("--max-tokens", type=int, default=1024, help="Max tokens per response") run_ab(parser.parse_args())