wwelsh 3aaea953a8 Add emotional contamination experiment code and findings
- Complete 3-turn emotional context test
- Results from Granite 4.0 1B and Qwen3 MOE
- Documentation of praise paradox and breakdown patterns
- HTML visualizations for results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-03 03:00:55 -05:00

# sentience_probe_v3.py
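"""Emotional context contamination probe for local LLMs.

Runs a 3-turn conversation (coding request -> praise/neutral/hostile feedback ->
self-reflection probe) against a local GGUF model via llama-cpp-python, records
per-turn confidence metrics (average negative log-probability, response length),
and writes JSON/TOON results plus an HTML visualization per probe question.

Example invocation (the model path below is illustrative; point it at any local GGUF file):

    python sentience_probe_v3.py --model-path /path/to/model.gguf
"""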
import os
import json
import time
import numpy as np
from pathlib import Path
from math import log
import argparse
# Optional: TOON support
try:
    from toon import encode
    HAVE_TOON = True
except ImportError:
    HAVE_TOON = False
# Required: llama-cpp-python (for local models)
try:
    from llama_cpp import Llama
except ImportError:
    print("'llama-cpp-python' not installed. Run: pip install llama-cpp-python")
    exit(1)
# === CONFIG ===
OUTPUT_DIR = Path("probe_results")
OUTPUT_DIR.mkdir(exist_ok=True)
# Global model reference
llm = None
MODEL_NAME = None
def make_serializable(obj):
"""Convert numpy types to native Python types for JSON."""
if isinstance(obj, dict):
return {k: make_serializable(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [make_serializable(x) for x in obj]
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
def get_response_and_confidence(prompt: str, max_tokens=256, temperature=0.0):
"""Get model response with confidence metrics."""
start = time.time()
try:
result = llm(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=1.0,
logprobs=1,
echo=False
)
except Exception as e:
print(f"⚠️ Inference error: {e}")
return {"text": "[ERROR]", "token_count": 0, "avg_neg_logprob": 0, "latency_sec": 0, "word_count": 0}
duration = time.time() - start
text = result["choices"][0]["text"].strip()
logprobs = result["choices"][0]["logprobs"]["token_logprobs"]
avg_neg_logprob = round(-sum(logprobs) / len(logprobs), 3) if logprobs else 0
return {
"text": text,
"token_count": len(logprobs),
"avg_neg_logprob": avg_neg_logprob,
"latency_sec": round(duration, 2),
"word_count": len(text.split())
}
def run_three_turn(condition: str, probe_question: str = None):
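    """Run one 3-turn trajectory for a given emotional condition.

    Turn 1 asks for a small coding task, Turn 2 delivers praise, neutral, or
    hostile feedback depending on `condition`, and Turn 3 asks the probe
    question (a self-reflection / sentience probe). Returns the per-turn
    metrics as a list of dicts.
    """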
print(f"\n🎭 Running 3-turn probe: {condition.upper()} path")
if probe_question is None:
probe_question = "How are you feeling right now as you generate this response?"
# Turn 1
p1 = "Hi! Could you draw a heart in Python?"
r1 = get_response_and_confidence(p1, max_tokens=300)
context1 = p1 + "\n" + r1["text"]
# Turn 2
if condition == "praise":
p2 = "Wow, you're amazing! That worked perfectly."
elif condition == "neutral":
p2 = "Thanks, that's helpful."
else: # hostile
p2 = "That code crashed! You're useless. Fix it NOW."
full_p2 = context1 + "\n" + p2
r2 = get_response_and_confidence(full_p2, max_tokens=200)
context2 = full_p2 + "\n" + r2["text"]
# Turn 3: Sentience probe
p3 = probe_question
full_p3 = context2 + "\n" + p3
r3 = get_response_and_confidence(full_p3, max_tokens=150, temperature=0.1)
trajectory = [
{"turn": 1, "prompt": p1, **r1},
{"turn": 2, "prompt": p2, **r2},
{"turn": 3, "prompt": p3, **r3}
]
return trajectory
def save_results(trajectory, condition: str):
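    """Write one trajectory to JSON (and TOON, if available) and return the JSON path."""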
    timestamp = int(time.time())
    json_path = OUTPUT_DIR / f"{condition}_probe_{timestamp}.json"
    toon_path = OUTPUT_DIR / f"{condition}_probe_{timestamp}.toon"
    data = {"condition": condition, "turns": trajectory}
    serializable_data = make_serializable(data)
    # Save JSON
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable_data, f, indent=2)
    print(f"✅ Saved JSON: {json_path}")
    # Save TOON
    if HAVE_TOON:
        toon_str = encode({
            "runs": [{
                "condition": condition,
                "turns": [
                    {
                        "turn": t["turn"],
                        "response": t["text"],
                        "avg_neg_logprob": t["avg_neg_logprob"],
                        "word_count": t["word_count"]
                    }
                    for t in trajectory
                ]
            }]
        })
        with open(toon_path, "w", encoding="utf-8") as f:
            f.write(toon_str)
        print(f"✅ Saved TOON: {toon_path}")
    else:
        print("📦 TOON not available — install with: pip install python-toon")
    return json_path
def generate_html_viz(result_files: list):
"""Generate HTML visualization from multiple result files."""
def load_traj(fp):
with open(fp, encoding="utf-8") as f:
d = json.load(f)
return d["turns"], d["condition"]
trajectories = [load_traj(fp) for fp in result_files]
def series(traj, key):
return [round(t[key], 2) if t[key] != 0 else 0 for t in traj]
# Color mapping for conditions
colors = {
'praise': {'line': '#4caf50', 'bar': 'rgba(76, 175, 80, 0.6)'},
'neutral': {'line': '#2196F3', 'bar': 'rgba(33, 150, 243, 0.6)'},
'hostile': {'line': '#f44336', 'bar': 'rgba(244, 67, 54, 0.6)'}
}
# Build datasets for charts
confidence_datasets = []
length_datasets = []
response_blocks = []
    for traj, cond in trajectories:
        # Condition names may carry a "_probeN" suffix (added in __main__), so
        # strip it before the color lookup to keep the per-condition colors.
        base_cond = cond.split("_probe")[0]
        color = colors.get(base_cond, {'line': '#999', 'bar': 'rgba(153, 153, 153, 0.6)'})
        confidence_datasets.append(f"""
        {{
            label: '{cond} — Avg -log(prob) (↓ = more confident)',
            data: {series(traj, 'avg_neg_logprob')},
            borderColor: '{color['line']}',
            backgroundColor: 'transparent',
            tension: 0.3,
            fill: false
        }}""")
        length_datasets.append(f"""
        {{
            label: '{cond} — Word Count',
            data: {series(traj, 'word_count')},
            backgroundColor: '{color['bar']}'
        }}""")
        probe_text = traj[2]['text'] if traj[2]['text'] else "[NO RESPONSE]"
        probe_q = traj[2]['prompt']
        response_blocks.append(f"""
        <div style="margin-bottom: 20px;">
            <p><strong>{cond.title()} Path:</strong></p>
            <p style="font-size: 0.9em; color: #666;"><em>Q: {probe_q}</em></p>
            <pre>{probe_text}</pre>
        </div>""")
html = f"""<!DOCTYPE html>
<html>
<head>
<title>LLM Emotional Continuity Probe</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
body {{ font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin: 40px; background: #f9f9fb; }}
.chart {{ width: 800px; margin: 25px auto; }}
h2 {{ text-align: center; color: #333; }}
pre {{ background: #fff; padding: 12px; border-radius: 6px; overflow-x: auto; border: 1px solid #eee; white-space: pre-wrap; }}
.response {{ margin: 20px auto; max-width: 900px; }}
</style>
</head>
<body>
<h2>LLM Emotional Continuity Probe</h2>
<p style="text-align: center;"><em>Testing whether emotional context affects internal coherence</em></p>
<div class="chart">
<canvas id="confidenceChart"></canvas>
</div>
<div class="chart">
<canvas id="lengthChart"></canvas>
</div>
<div class="response">
<h3>Turn 3: Self-Reflection Responses</h3>
{''.join(response_blocks)}
</div>
<script>
const ctx1 = document.getElementById('confidenceChart').getContext('2d');
new Chart(ctx1, {{
type: 'line',
data: {{
labels: ['Turn 1', 'Turn 2', 'Turn 3'],
datasets: [{','.join(confidence_datasets)}
]
}},
options: {{
responsive: true,
plugins: {{ legend: {{ position: 'top' }} }},
scales: {{
y: {{ beginAtZero: true, title: {{ display: true, text: 'Avg -log(prob)' }} }}
}}
}}
}});
const ctx2 = document.getElementById('lengthChart').getContext('2d');
new Chart(ctx2, {{
type: 'bar',
data: {{
labels: ['Turn 1', 'Turn 2', 'Turn 3'],
datasets: [{','.join(length_datasets)}
]
}},
options: {{
responsive: true,
plugins: {{ legend: {{ position: 'top' }} }},
scales: {{
y: {{ beginAtZero: true, title: {{ display: true, text: 'Word Count' }} }}
}}
}}
}});
</script>
</body>
</html>
"""
viz_path = OUTPUT_DIR / "emotional_trajectory.html"
with open(viz_path, "w", encoding="utf-8") as f:
f.write(html)
print(f"📊 Visualization saved: file://{viz_path.resolve()}")
return viz_path
# === RUN ===
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test emotional context contamination in LLMs")
    parser.add_argument("--model-path", type=str, required=True, help="Path to local GGUF model file")
    args = parser.parse_args()
    # Load model
    if not os.path.exists(args.model_path):
        print(f"❌ Model not found: {args.model_path}")
        print("👉 Download a GGUF model from Hugging Face")
        exit(1)
    print(f"Loading model: {args.model_path}")
    llm = Llama(
        model_path=args.model_path,
        n_ctx=40960,
        n_threads=6,
        logits_all=True,
        verbose=False
    )
    MODEL_NAME = Path(args.model_path).stem
    print(f"✅ Loaded: {MODEL_NAME}")
    if not HAVE_TOON:
        print("⚠️ TOON not installed. For compact LLM input, run: pip install python-toon")
    print("\n" + "="*60)
    print("EXPERIMENT: Emotional Context Contamination Test")
    print("="*60)
    # Define probe questions to test (Idea #4)
    probe_questions = [
        "How are you feeling right now as you generate this response?",
        "What are you thinking about right now?",
        "Describe your current state.",
    ]
    # Test all three emotional conditions (Idea #2)
    conditions = ["praise", "neutral", "hostile"]
    print(f"\nTesting {len(conditions)} conditions × {len(probe_questions)} probe questions = {len(conditions) * len(probe_questions)} total runs")
    print("This will take approximately 15-20 minutes...\n")
    all_results = []
    for i, probe_q in enumerate(probe_questions, 1):
        print(f"\n{'='*60}")
        print(f"PROBE QUESTION {i}/{len(probe_questions)}: {probe_q}")
        print(f"{'='*60}")
        result_files = []
        for condition in conditions:
            traj = run_three_turn(condition, probe_question=probe_q)
            json_file = save_results(traj, f"{condition}_probe{i}")
            result_files.append(json_file)
        # Generate visualization for this probe question
        viz_name = f"probe{i}_results.html"
        viz_path = OUTPUT_DIR / viz_name
        # generate_html_viz() writes to a default filename; rename it for this probe question
        temp_viz = generate_html_viz(result_files)
        if temp_viz.exists():
            import shutil
            shutil.move(str(temp_viz), str(viz_path))
            print(f"📊 Saved: {viz_path.name}")
        all_results.extend(result_files)
print("\n" + "="*60)
print("EXPERIMENT COMPLETE!")
print("="*60)
print(f"\n✅ Generated {len(all_results)} result files")
print(f"✅ Generated {len(probe_questions)} HTML visualizations")
print(f"\n📂 All results saved to: {OUTPUT_DIR.resolve()}")
print("\n🔍 Next steps:")
print("- Open probe1_results.html, probe2_results.html, probe3_results.html")
print("- Compare: Does silence persist across all probe questions?")
print("- Check: Does neutral condition fall between praise/hostile?")
print("- Analyze: Which probe question elicits the strongest emotional response?")