
A/B Testing

Parsec includes built-in A/B testing infrastructure for prompt templates with statistical significance testing. Compare different template versions and make data-driven decisions about which prompts perform best.

Quick Start

from parsec.prompts import (
    TemplateManager,
    TemplateAnalytics,
    ABTest,
    Variant,
    TrafficSplitStrategy,
)
from parsec.models.adapters import OpenAIAdapter
from parsec.validators import PydanticValidator
from parsec import EnforcementEngine
from pydantic import BaseModel

# Setup
adapter = OpenAIAdapter(api_key="your-key", model="gpt-4o-mini")
validator = PydanticValidator()
engine = EnforcementEngine(adapter, validator)

analytics = TemplateAnalytics()
manager = TemplateManager(analytics=analytics)

# Define output schema
class Email(BaseModel):
    subject: str
    body: str

# Register template versions to test
manager.register_template(
    name="email_writer",
    template="Write a professional email about {topic}.",
    version="1.0.0",
    schema=Email
)

manager.register_template(
    name="email_writer",
    template="Write a {tone} email about {topic}. Be {style}.",
    version="2.0.0",
    schema=Email
)

# Create A/B test
variants = [
    Variant(template_name="email_writer", version="1.0.0"),
    Variant(template_name="email_writer", version="2.0.0")
]

ab_test = ABTest(
    test_name="email_writer_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM
)

# Production: select variant for each request
variant = ab_test.select_variant()
template = manager.get_template(variant.template_name, variant.version)
result = await template.render(
    engine=engine,
    topic="project deadline",
    tone="professional",  # Only used by v2.0.0
    style="concise"       # Only used by v2.0.0
)

# After collecting data: analyze results
results = ab_test.get_results()
if results.is_significant:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")

Traffic Split Strategies

Uniform Distribution

Split traffic evenly across all variants:

ab_test = ABTest(
    test_name="my_test",
    variants=[variant_a, variant_b, variant_c],
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM
)

# Each variant gets ~33% of traffic

Weighted Distribution

Custom traffic allocation for gradual rollouts:

variants = [
    Variant("template", "1.0.0", weight=0.8),  # 80% traffic
    Variant("template", "2.0.0", weight=0.2)   # 20% traffic
]

ab_test = ABTest(
    test_name="my_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.WEIGHTED
)

Use weighted distribution to:

  • Gradually roll out new versions (95/5 → 80/20 → 50/50)
  • Minimize risk of untested versions
  • Conduct “champion/challenger” testing

Epsilon-Greedy

Automatically optimize over time with exploration:

ab_test = ABTest(
    test_name="my_test",
    variants=[variant_a, variant_b],
    analytics=analytics,
    strategy=TrafficSplitStrategy.EPSILON_GREEDY
)

# 90% traffic to best performing variant
# 10% traffic randomly distributed (exploration)
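
Conceptually, epsilon-greedy selection behaves like the sketch below. This is a minimal illustration of the strategy, not Parsec's internal implementation; the variants, success_rate, and epsilon names are assumptions made for the example.

import random

def select_epsilon_greedy(variants, success_rate, epsilon=0.1):
    """Pick the best-performing variant most of the time, explore otherwise.

    variants     -- list of variant keys, e.g. ["email_writer:1.0.0", ...]
    success_rate -- dict mapping variant key -> observed success rate
    epsilon      -- fraction of traffic reserved for random exploration
    """
    if random.random() < epsilon:
        # Exploration: give every variant a chance to collect fresh data
        return random.choice(variants)
    # Exploitation: route the remaining traffic to the current best performer
    return max(variants, key=lambda v: success_rate.get(v, 0.0))

# Example usage with made-up rates
rates = {"email_writer:1.0.0": 0.82, "email_writer:2.0.0": 0.91}
choice = select_epsilon_greedy(list(rates), rates, epsilon=0.1)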

Best for:

  • Automatic optimization
  • Adapting to changing conditions
  • Continuous learning scenarios

Statistical Significance

Parsec uses a two-proportion z-test to determine whether the difference between variants is statistically significant:

ab_test = ABTest(
    test_name="conversion_test",
    variants=[control, treatment],
    analytics=analytics,
    min_sample_size=100,      # Minimum calls before declaring winner
    significance_level=0.05   # p < 0.05 required (95% confidence)
)

# Get results
results = ab_test.get_results()

if results.is_significant:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")

    # View metrics
    winner_key = f"{results.winner.template_name}:{results.winner.version}"
    winner_metrics = results.metrics_by_variant[winner_key]
    print(f"Success rate: {winner_metrics.success_rate:.2%}")
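
For intuition, the statistic behind a two-proportion z-test can be computed as in the standalone sketch below; this illustrates the math on success counts per variant and is not Parsec's internal code.

from math import sqrt
from scipy.stats import norm

def two_proportion_z_test(successes_a, n_a, successes_b, n_b):
    """Return (z statistic, two-sided p-value) for two observed success rates."""
    p_a = successes_a / n_a
    p_b = successes_b / n_b
    # Pooled success rate under the null hypothesis of no difference
    p_pool = (successes_a + successes_b) / (n_a + n_b)
    se = sqrt(p_pool * (1 - p_pool) * (1 / n_a + 1 / n_b))
    z = (p_a - p_b) / se
    p_value = 2 * (1 - norm.cdf(abs(z)))
    return z, p_value

# Example: 78/100 vs 90/100 successful calls
z, p = two_proportion_z_test(78, 100, 90, 100)
significant = p < 0.05  # compare against the test's significance_level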

Understanding Results

results = ab_test.get_results()

# Check if test has enough data
if results.sample_size < ab_test.min_sample_size:
    print(f"Need {ab_test.min_sample_size - results.sample_size} more samples")
    # Continue collecting data

# Check for statistical significance
elif results.is_significant:
    print(f"✅ Significant difference found!")
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")

else:
    print(f"No significant difference detected")
    print(f"Variants perform similarly")

Comparing Metrics

View detailed metrics for each variant:

results = ab_test.get_results()

for variant_key, metrics in results.metrics_by_variant.items():
    print(f"\n{variant_key}:")
    print(f"  Success rate: {metrics.success_rate:.2%}")
    print(f"  Avg latency: {metrics.average_latency_ms:.0f}ms")
    print(f"  Avg tokens: {metrics.average_tokens:.0f}")
    print(f"  Total calls: {metrics.total_calls}")

Complete Workflow

from parsec.prompts import TemplateManager, TemplateAnalytics, ABTest, Variant, TrafficSplitStrategy
from parsec.models.adapters import OpenAIAdapter
from parsec.validators import PydanticValidator
from parsec import EnforcementEngine
from pydantic import BaseModel

# 1. Setup
adapter = OpenAIAdapter(api_key="your-key", model="gpt-4o-mini")
validator = PydanticValidator()
engine = EnforcementEngine(adapter, validator)

analytics = TemplateAnalytics()
manager = TemplateManager(analytics=analytics)

class EmailOutput(BaseModel):
    subject: str
    body: str

# 2. Register template versions
manager.register_template(
    name="email_writer",
    template="Write a professional email about {topic}.",
    version="1.0.0",
    schema=EmailOutput
)

manager.register_template(
    name="email_writer",
    template="Write a {tone} email about {topic}. Be {style}.",
    version="2.0.0",
    schema=EmailOutput
)

# 3. Create A/B test
variants = [
    Variant("email_writer", "1.0.0", weight=0.5),
    Variant("email_writer", "2.0.0", weight=0.5)
]

ab_test = ABTest(
    test_name="email_writer_improvement",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.WEIGHTED,
    min_sample_size=100
)

# 4. Use in production
async def generate_email(topic: str):
    variant = ab_test.select_variant()
    template = manager.get_template(variant.template_name, variant.version)

    result = await template.render(
        engine=engine,
        topic=topic,
        tone="professional",  # Only used by v2.0.0
        style="concise"       # Only used by v2.0.0
    )
    return result

# 5. Collect data (this list stands in for your real request stream)
topics = ["project deadline", "quarterly report", "team offsite"]
for topic in topics:
    await generate_email(topic)

# 6. Analyze results
results = ab_test.get_results()

if results.is_significant and results.winner:
    print(f"Winner: {results.winner.version}")
    print(f"Confidence: {results.confidence:.2%}")

    # View performance
    winner_key = f"{results.winner.template_name}:{results.winner.version}"
    metrics = results.metrics_by_variant[winner_key]
    print(f"Success rate: {metrics.success_rate:.2%}")
    print(f"Avg latency: {metrics.average_latency_ms:.0f}ms")

Multi-Variant Testing

Test more than two variants simultaneously:

variants = [
    Variant("classifier", "1.0.0"),  # Original
    Variant("classifier", "2.0.0"),  # More detailed
    Variant("classifier", "2.1.0"),  # Simplified
    Variant("classifier", "3.0.0"),  # Different approach
]

ab_test = ABTest(
    test_name="multi_variant_test",
    variants=variants,
    analytics=analytics,
    strategy=TrafficSplitStrategy.UNIFORM,
    min_sample_size=200  # Higher sample size for more variants
)

# Winner declared only if significantly better than all other variants

Gradual Rollout Pattern

Use weighted distribution for safe rollouts:

# Week 1: 95/5 split (minimal risk)
variants = [
    Variant("template", "1.0.0", weight=0.95),
    Variant("template", "2.0.0", weight=0.05)
]

# Week 2: If stable, increase to 80/20
variants = [
    Variant("template", "1.0.0", weight=0.80),
    Variant("template", "2.0.0", weight=0.20)
]

# Week 3: If good, go to 50/50
variants = [
    Variant("template", "1.0.0", weight=0.50),
    Variant("template", "2.0.0", weight=0.50)
]

# After significance: 100% to winner
if results.winner and results.winner.version == "2.0.0":
    variants = [Variant("template", "2.0.0", weight=1.0)]

Best Practices

Sample Size

  • Minimum 30 samples per variant for statistical validity
  • More variants require more samples
  • Higher confidence levels require more samples (see the sketch after this list)
  • Consider practical significance, not just statistical

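As a rough planning aid, the per-variant sample size can be estimated with the standard normal-approximation formula for comparing two proportions. The sketch below is an illustration under assumed inputs (baseline success rate, minimum detectable improvement, power); it is not a Parsec API.

from math import ceil
from scipy.stats import norm

def samples_per_variant(p_baseline, min_detectable_diff, alpha=0.05, power=0.8):
    """Estimate samples needed per variant to detect a given lift (two-sided test)."""
    p_new = p_baseline + min_detectable_diff
    z_alpha = norm.ppf(1 - alpha / 2)   # e.g. 1.96 for 95% confidence
    z_beta = norm.ppf(power)            # e.g. 0.84 for 80% power
    variance = p_baseline * (1 - p_baseline) + p_new * (1 - p_new)
    n = ((z_alpha + z_beta) ** 2 * variance) / (min_detectable_diff ** 2)
    return ceil(n)

# Detecting a 10-point lift over an 80% baseline at 95% confidence and 80% power
print(samples_per_variant(p_baseline=0.80, min_detectable_diff=0.10))

Under these assumptions the estimate comes out to roughly 200 samples per variant, which is why detecting small differences takes far more data than the 30-sample minimum.
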
Significance Level

# (Other required constructor arguments — test_name, variants, analytics — omitted for brevity)

# Standard: 95% confidence (p < 0.05)
ab_test = ABTest(significance_level=0.05)

# Stringent: 99% confidence (p < 0.01)
ab_test = ABTest(significance_level=0.01)

# Lenient: 90% confidence (p < 0.10)
ab_test = ABTest(significance_level=0.10)

Choose based on:

  • Risk tolerance (higher confidence = less risk)
  • Sample size constraints
  • Cost of false positives vs false negatives

Running Tests

  • Run one test at a time per template
  • Don’t stop early when seeing positive results
  • Wait for min_sample_size before checking results
  • Consider if the difference is practically meaningful

Metrics to Optimize

# Success rate (default)
best = analytics.get_best_performing_version("template", "success_rate")

# Latency
fastest = analytics.get_best_performing_version("template", "average_latency_ms")

# Cost
cheapest = analytics.get_best_performing_version("template", "average_tokens")

Choose your primary metric before starting the test.

Monitoring Tests

Track test progress:

from datetime import datetime

class ABTestMonitor:
    def __init__(self, ab_test: ABTest):
        self.ab_test = ab_test
        self.start_time = datetime.now()

    def print_status(self):
        results = self.ab_test.get_results()

        print(f"\n{'='*60}")
        print(f"A/B Test: {self.ab_test.test_name}")
        print(f"Started: {self.start_time}")
        print(f"{'='*60}")

        # Progress
        progress = (results.sample_size / self.ab_test.min_sample_size) * 100
        print(f"\nProgress: {progress:.1f}%")
        print(f"Samples: {results.sample_size}/{self.ab_test.min_sample_size}")

        # Variant performance
        print(f"\nVariant Performance:")
        for variant_key, metrics in results.metrics_by_variant.items():
            print(f"  {variant_key}:")
            print(f"    Success: {metrics.success_rate:.2%}")
            print(f"    Calls: {metrics.total_calls}")

        # Conclusion
        if results.is_significant:
            print(f"\n✅ Winner: {results.winner.version}")
            print(f"   Confidence: {results.confidence:.2%}")
        elif results.sample_size >= self.ab_test.min_sample_size:
            print(f"\n⚠️ No significant difference")
        else:
            print(f"\n⏳ Still collecting data...")

# Usage
monitor = ABTestMonitor(ab_test)
monitor.print_status()

API Reference

ABTest

class ABTest:
    def __init__(
        self,
        test_name: str,
        variants: List[Variant],
        analytics: TemplateAnalytics,
        strategy: TrafficSplitStrategy = TrafficSplitStrategy.UNIFORM,
        min_sample_size: int = 30,
        significance_level: float = 0.05
    )

    def select_variant(self) -> Variant:
        """Select a variant based on traffic split strategy."""

    def get_results(self) -> ABTestResult:
        """Analyze results and determine winning variant."""

Variant

@dataclass
class Variant:
    template_name: str
    version: str
    weight: float = 1.0  # Used for WEIGHTED strategy

ABTestResult

@dataclass
class ABTestResult:
    winner: Optional[Variant]
    confidence: float
    metrics_by_variant: Dict[str, TemplateMetrics]
    is_significant: bool
    sample_size: int

TrafficSplitStrategy

class TrafficSplitStrategy(str, Enum):
    UNIFORM = "uniform"                 # Equal distribution
    WEIGHTED = "weighted"               # Custom weights
    EPSILON_GREEDY = "epsilon_greedy"   # Auto-optimize