A/B Testing
Parsec includes built-in A/B testing infrastructure for prompt templates with statistical significance testing. Compare different template versions and make data-driven decisions about which prompts perform best.
Quick Start
from parsec.prompts import (
    TemplateManager,
    TemplateAnalytics,
    ABTest,
    Variant,
    TrafficSplitStrategy
)
from parsec.models.adapters import OpenAIAdapter
from parsec.validators import PydanticValidator
from parsec import EnforcementEngine
from pydantic import BaseModel
# Setup
adapter = OpenAIAdapter(api_key="your-key", model="gpt-4o-mini")
validator = PydanticValidator()
engine = EnforcementEngine(adapter, validator)
analytics = TemplateAnalytics()
manager = TemplateManager(analytics=analytics)
# Define output schema
class Email(BaseModel):
    subject: str
    body: str
# Register template versions to test
manager.register_template(
    name="email_writer",
    template="Write a professional email about {topic}.",
    version="1.0.0",
    schema=Email
)
manager.register_template(
    name="email_writer",
    template="Write a {tone} email about {topic}. Be {style}.",
    version="2.0.0",
    schema=Email
)
# Create A/B test
variants = [
    Variant(template_name="email_writer", version="1.0.0"),
    Variant(template_name="email_writer", version="2.0.0")
]
ab_test = ABTest(
    test_name="email_writer_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM
)
# Production: select variant for each request
variant = ab_test.select_variant()
template = manager.get_template(variant.template_name, variant.version)
result = await template.render(engine=engine, topic="project deadline")
# After collecting data: analyze results
results = ab_test.get_results()
if results.is_significant:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")
Traffic Split Strategies
Uniform Distribution
Split traffic evenly across all variants:
ab_test = ABTest(
    test_name="my_test",
    variants=[variant_a, variant_b, variant_c],
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM
)
# Each variant gets ~33% of traffic
Weighted Distribution
Custom traffic allocation for gradual rollouts:
variants = [
    Variant("template", "1.0.0", weight=0.8),  # 80% traffic
    Variant("template", "2.0.0", weight=0.2)   # 20% traffic
]
ab_test = ABTest(
    test_name="my_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.WEIGHTED
)
Use weighted distribution to:
- Gradually roll out new versions (95/5 → 80/20 → 50/50)
- Minimize risk of untested versions
- Conduct “champion/challenger” testing
Epsilon-Greedy
Automatically optimize over time with exploration:
ab_test = ABTest(
    test_name="my_test",
    variants=[variant_a, variant_b],
    analytics=analytics,
    strategy=TrafficSplitStrategy.EPSILON_GREEDY
)
# 90% traffic to best performing variant
# 10% traffic randomly distributed (exploration)
Best for:
- Automatic optimization
- Adapting to changing conditions
- Continuous learning scenarios
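To make the exploration/exploitation split concrete, here is a rough sketch of how an epsilon-greedy pick works. It illustrates the strategy only and is not Parsec's internal implementation; the 10% exploration rate and the name:version keys are assumptions for the example.

import random

def epsilon_greedy_pick(variant_keys: list[str],
                        success_rates: dict[str, float],
                        epsilon: float = 0.1) -> str:
    """Illustrative epsilon-greedy selection (not Parsec's internal code)."""
    if random.random() < epsilon:
        # Exploration: occasionally pick any variant so weaker ones keep collecting data
        return random.choice(variant_keys)
    # Exploitation: otherwise route to the variant with the best observed success rate
    return max(variant_keys, key=lambda key: success_rates.get(key, 0.0))

# With epsilon = 0.1, roughly 90% of picks go to the current best performer
chosen = epsilon_greedy_pick(
    ["email_writer:1.0.0", "email_writer:2.0.0"],
    {"email_writer:1.0.0": 0.91, "email_writer:2.0.0": 0.95},
)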
Statistical Significance
Parsec uses a two-proportion z-test on per-variant success rates to decide whether an observed difference is statistically significant rather than noise.
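For intuition, the sketch below shows what such a test computes from raw success counts. The counts and the hand-rolled normal CDF are illustrative only; Parsec runs the equivalent check internally when you call get_results().

from math import erf, sqrt

# Hypothetical success counts for two variants (not real Parsec output)
successes_a, calls_a = 174, 200   # control
successes_b, calls_b = 188, 200   # treatment

p_a, p_b = successes_a / calls_a, successes_b / calls_b
p_pool = (successes_a + successes_b) / (calls_a + calls_b)

# Standard error of the difference under the null hypothesis of equal rates
se = sqrt(p_pool * (1 - p_pool) * (1 / calls_a + 1 / calls_b))
z = (p_b - p_a) / se

def normal_cdf(x: float) -> float:
    return 0.5 * (1 + erf(x / sqrt(2)))

# Two-sided p-value: significant at 95% confidence when p < 0.05
p_value = 2 * (1 - normal_cdf(abs(z)))
print(f"z = {z:.2f}, p = {p_value:.4f}")

When constructing an ABTest, set how much data it needs and how strict the test should be: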
ab_test = ABTest(
    test_name="conversion_test",
    variants=[control, treatment],
    analytics=analytics,
    min_sample_size=100,     # Minimum calls before declaring winner
    significance_level=0.05  # p < 0.05 required (95% confidence)
)
# Get results
results = ab_test.get_results()
if results.is_significant:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")
    # View metrics
    winner_key = f"{results.winner.template_name}:{results.winner.version}"
    winner_metrics = results.metrics_by_variant[winner_key]
    print(f"Success rate: {winner_metrics.success_rate:.2%}")
Understanding Results
results = ab_test.get_results()
# Check if test has enough data
if results.sample_size < ab_test.min_sample_size:
    print(f"Need {ab_test.min_sample_size - results.sample_size} more samples")
    # Continue collecting data
# Check for statistical significance
elif results.is_significant:
    print(f"✅ Significant difference found!")
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")
else:
    print(f"No significant difference detected")
    print(f"Variants perform similarly")
Comparing Metrics
View detailed metrics for each variant:
results = ab_test.get_results()
for variant_key, metrics in results.metrics_by_variant.items():
    print(f"\n{variant_key}:")
    print(f"  Success rate: {metrics.success_rate:.2%}")
    print(f"  Avg latency: {metrics.average_latency_ms:.0f}ms")
    print(f"  Avg tokens: {metrics.average_tokens:.0f}")
    print(f"  Total calls: {metrics.total_calls}")
Complete Workflow
from parsec.prompts import TemplateManager, TemplateAnalytics, ABTest, Variant, TrafficSplitStrategy
from parsec.models.adapters import OpenAIAdapter
from parsec.validators import PydanticValidator
from parsec import EnforcementEngine
from pydantic import BaseModel
# 1. Setup
adapter = OpenAIAdapter(api_key="your-key", model="gpt-4o-mini")
validator = PydanticValidator()
engine = EnforcementEngine(adapter, validator)
analytics = TemplateAnalytics()
manager = TemplateManager(analytics=analytics)
class EmailOutput(BaseModel):
    subject: str
    body: str
# 2. Register template versions
manager.register_template(
    name="email_writer",
    template="Write a professional email about {topic}.",
    version="1.0.0",
    schema=EmailOutput
)
manager.register_template(
    name="email_writer",
    template="Write a {tone} email about {topic}. Be {style}.",
    version="2.0.0",
    schema=EmailOutput
)
# 3. Create A/B test
variants = [
    Variant("email_writer", "1.0.0", weight=0.5),
    Variant("email_writer", "2.0.0", weight=0.5)
]
ab_test = ABTest(
    test_name="email_writer_improvement",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.WEIGHTED,
    min_sample_size=100
)
# 4. Use in production
async def generate_email(topic: str):
    variant = ab_test.select_variant()
    template = manager.get_template(variant.template_name, variant.version)
    result = await template.render(
        engine=engine,
        topic=topic,
        tone="professional",  # Only used by v2.0.0
        style="concise"       # Only used by v2.0.0
    )
    return result
# 5. Collect data (example topics; replace with real traffic)
topics = ["project deadline", "team offsite", "quarterly review"]
for topic in topics:
    await generate_email(topic)
# 6. Analyze results
results = ab_test.get_results()
if results.is_significant and results.winner:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")
    # View performance
    winner_key = f"{results.winner.template_name}:{results.winner.version}"
    metrics = results.metrics_by_variant[winner_key]
    print(f"Success rate: {metrics.success_rate:.2%}")
    print(f"Avg latency: {metrics.average_latency_ms:.0f}ms")
Multi-Variant Testing
Test more than two variants simultaneously:
variants = [
    Variant("classifier", "1.0.0"),  # Original
    Variant("classifier", "2.0.0"),  # More detailed
    Variant("classifier", "2.1.0"),  # Simplified
    Variant("classifier", "3.0.0"),  # Different approach
]
ab_test = ABTest(
    test_name="multi_variant_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM,
    min_sample_size=200  # Higher sample size for more variants
)
# Winner declared only if significantly better than all other variants
Gradual Rollout Pattern
Use weighted distribution for safe rollouts:
# Week 1: 95/5 split (minimal risk)
variants = [
    Variant("template", "1.0.0", weight=0.95),
    Variant("template", "2.0.0", weight=0.05)
]
# Week 2: If stable, increase to 80/20
variants = [
    Variant("template", "1.0.0", weight=0.80),
    Variant("template", "2.0.0", weight=0.20)
]
# Week 3: If good, go to 50/50
variants = [
    Variant("template", "1.0.0", weight=0.50),
    Variant("template", "2.0.0", weight=0.50)
]
# After significance: 100% to winner
if results.winner and results.winner.version == "2.0.0":
    variants = [Variant("template", "2.0.0", weight=1.0)]
Best Practices
Sample Size
- Minimum 30 samples per variant for statistical validity
- More variants require more samples
- Higher confidence levels and smaller differences between variants require more samples (see the estimate sketched below)
- Consider practical significance, not just statistical
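As a rough planning aid, the standard two-proportion sample-size formula estimates how many calls per variant you need before a given lift becomes detectable. This is a generic statistics sketch, not a Parsec API; samples_per_variant and the 80% power default are assumptions.

from math import ceil
from statistics import NormalDist

def samples_per_variant(p_control: float, p_treatment: float,
                        significance_level: float = 0.05, power: float = 0.8) -> int:
    """Rough two-proportion sample-size estimate (illustrative only)."""
    z_alpha = NormalDist().inv_cdf(1 - significance_level / 2)
    z_beta = NormalDist().inv_cdf(power)
    variance = p_control * (1 - p_control) + p_treatment * (1 - p_treatment)
    effect = abs(p_treatment - p_control)
    return ceil((z_alpha + z_beta) ** 2 * variance / effect ** 2)

# Detecting a 90% -> 95% success-rate lift needs on the order of 400+ calls per variant
print(samples_per_variant(0.90, 0.95))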
Significance Level
# Standard: 95% confidence (p < 0.05)
ab_test = ABTest(significance_level=0.05)
# Stringent: 99% confidence (p < 0.01)
ab_test = ABTest(significance_level=0.01)
# Lenient: 90% confidence (p < 0.10)
ab_test = ABTest(significance_level=0.10)Choose based on:
- Risk tolerance (higher confidence means lower risk of declaring a false winner)
- Sample size constraints
- Cost of false positives vs false negatives
Running Tests
- Run one test at a time per template
- Don’t stop early when seeing positive results
- Wait for min_sample_size before checking results
- Consider if the difference is practically meaningful
Metrics to Optimize
# Success rate (default)
best = analytics.get_best_performing_version("template", "success_rate")
# Latency
fastest = analytics.get_best_performing_version("template", "average_latency_ms")
# Cost
cheapest = analytics.get_best_performing_version("template", "average_tokens")
Choose your primary metric before starting the test.
Monitoring Tests
Track test progress:
from datetime import datetime
class ABTestMonitor:
    def __init__(self, ab_test: ABTest):
        self.ab_test = ab_test
        self.start_time = datetime.now()

    def print_status(self):
        results = self.ab_test.get_results()
        print(f"\n{'='*60}")
        print(f"A/B Test: {self.ab_test.test_name}")
        print(f"Started: {self.start_time}")
        print(f"{'='*60}")
        # Progress
        progress = (results.sample_size / self.ab_test.min_sample_size) * 100
        print(f"\nProgress: {progress:.1f}%")
        print(f"Samples: {results.sample_size}/{self.ab_test.min_sample_size}")
        # Variant performance
        print(f"\nVariant Performance:")
        for variant_key, metrics in results.metrics_by_variant.items():
            print(f"  {variant_key}:")
            print(f"    Success: {metrics.success_rate:.2%}")
            print(f"    Calls: {metrics.total_calls}")
        # Conclusion
        if results.is_significant:
            print(f"\n✅ Winner: {results.winner.version}")
            print(f"   Confidence: {results.confidence:.2%}")
        elif results.sample_size >= self.ab_test.min_sample_size:
            print(f"\n⚠️ No significant difference")
        else:
            print(f"\n⏳ Still collecting data...")

# Usage
monitor = ABTestMonitor(ab_test)
monitor.print_status()
API Reference
ABTest
class ABTest:
    def __init__(
        self,
        test_name: str,
        variants: List[Variant],
        analytics: TemplateAnalytics,
        strategy: TrafficSplitStrategy = TrafficSplitStrategy.UNIFORM,
        min_sample_size: int = 30,
        significance_level: float = 0.05
    )

    def select_variant(self) -> Variant:
        """Select a variant based on traffic split strategy."""

    def get_results(self) -> ABTestResult:
        """Analyze results and determine winning variant."""
Variant
@dataclass
class Variant:
    template_name: str
    version: str
    weight: float = 1.0  # Used for WEIGHTED strategy
ABTestResult
@dataclass
class ABTestResult:
    winner: Optional[Variant]
    confidence: float
    metrics_by_variant: Dict[str, TemplateMetrics]
    is_significant: bool
    sample_size: int
TrafficSplitStrategy
class TrafficSplitStrategy(str, Enum):
    UNIFORM = "uniform"                # Equal distribution
    WEIGHTED = "weighted"              # Custom weights
    EPSILON_GREEDY = "epsilon_greedy"  # Auto-optimize