Loading...
Loading...
Run a Bayesian A/B test on conversion data using PyMC. Use when the user wants to compare two variants (landing pages, emails, pricing, UI changes) and decide which to ship using posterior probabilities and expected loss instead of p-values. Covers Beta-Binomial model, ROPE, expected loss, sample-size guidance, and ArviZ diagnostics.
npx skill4agent add brojonat/llmsrules bayesian-ab-testingbayesian-regressionbayesian-bandits<project>/
├── data/ # input parquet/csv (or generate in-notebook)
├── src/
│ ├── train.py # PyMC model fit → MLflow log
│ ├── predict.py # reload idata, compute decision metrics
│ └── plots.py # posterior, trace, loss, ROPE visualizations
├── notebooks/
│ └── demo.py # marimo walkthrough
└── mlruns/ # MLflow tracking store (gitignored)| Field | Type | Description |
|---|---|---|
| int | Visitors assigned to control (A) |
| int | Conversions in control |
| int | Visitors assigned to treatment (B) |
| int | Conversions in treatment |
import ibis
table = ibis.duckdb.connect().read_parquet("data/experiment.parquet")
summary = (
table
.group_by("variant")
.aggregate(
visitors=table.count(),
conversions=table.converted.sum().cast("int64"),
)
.execute()
)
n_a = int(summary.loc[summary.variant == "control", "visitors"].iloc[0])
conversions_a = int(summary.loc[summary.variant == "control", "conversions"].iloc[0])
n_b = int(summary.loc[summary.variant == "treatment", "visitors"].iloc[0])
conversions_b = int(summary.loc[summary.variant == "treatment", "conversions"].iloc[0])import pymc as pm
with pm.Model() as ab_model:
# Priors — Beta(1,1) = uniform if no prior knowledge
# Use informative priors if you have historical conversion rates
p_a = pm.Beta("p_A", alpha=1, beta=1)
p_b = pm.Beta("p_B", alpha=1, beta=1)
# The quantity of interest: absolute lift
delta = pm.Deterministic("delta", p_b - p_a)
pm.Deterministic("relative_lift", (p_b - p_a) / p_a)
# Likelihood — use Binomial with sufficient statistics,
# NOT N independent Bernoulli observations
pm.Binomial("obs_A", n=n_a, p=p_a, observed=conversions_a)
pm.Binomial("obs_B", n=n_b, p=p_b, observed=conversions_b)
idata = pm.sample(
draws=2000, tune=1000, chains=4,
random_seed=42, progressbar=False,
)| Situation | Prior | Why |
|---|---|---|
| No idea what to expect | Beta(1, 1) | Uniform on [0, 1] |
| Typical web conversion (~3-5%) | Beta(3, 97) | Concentrates around 3% |
| Strong historical data (last quarter's rate) | Beta(α, β) from method of moments | Use the data you have |
alpha_0 = mu * kappa
beta_0 = (1 - mu) * kappadelta_samples = idata.posterior["delta"].to_numpy().flatten()
prob_b_better = float(np.mean(delta_samples > 0))p_a_samples = idata.posterior["p_A"].to_numpy().flatten()
p_b_samples = idata.posterior["p_B"].to_numpy().flatten()
# If you choose B but A is actually better, your loss is (p_A - p_B)
loss_choosing_b = float(np.mean(np.maximum(p_a_samples - p_b_samples, 0)))
loss_choosing_a = float(np.mean(np.maximum(p_b_samples - p_a_samples, 0)))rope = 0.005 # minimum practically significant difference
prob_b_clears_rope = float(np.mean(delta_samples > rope))
prob_equivalent = float(np.mean(np.abs(delta_samples) < rope))if loss_choosing_b < loss_choosing_a and prob_b_clears_rope > 0.90:
decision = "Ship B"
elif prob_equivalent > 0.50:
decision = "Practically equivalent — pick the cheaper option"
else:
decision = "Keep collecting data"import arviz as az
# Summary table with R-hat and ESS
summary = az.summary(idata, var_names=["p_A", "p_B", "delta"])
# Trace plot — chains should mix well (fuzzy caterpillars)
az.plot_trace(idata, var_names=["p_A", "p_B", "delta"])
# Posterior with HDI
az.plot_posterior(idata, var_names=["delta"], ref_val=0)drawstune| Kind | What |
|---|---|
| n_a, n_b, conversions_a, conversions_b, prior_alpha, prior_beta, draws, tune, chains, seed, rope |
| prob_b_better, expected_loss_a, expected_loss_b, posterior_mean_delta, hdi_94_low, hdi_94_high, rhat_max, ess_min, prob_b_clears_rope |
| data_hash, true_p_a, true_p_b (if synthetic) |
| posterior/idata.nc, plots/{posterior.png, trace.png, loss.png, rope.png} |
from scipy import stats as sp_stats
for n in range(100, 10001, 100):
k_a = int(n * observed_rate_a)
k_b = int(n * observed_rate_b)
post_a = sp_stats.beta(alpha_0 + k_a, beta_0 + n - k_a)
post_b = sp_stats.beta(alpha_0 + k_b, beta_0 + n - k_b)
draws_a = post_a.rvs(5000)
draws_b = post_b.rvs(5000)
expected_loss = np.mean(np.maximum(draws_a - draws_b, 0))
# Stop when expected_loss < your tolerancedemo.pymarimo edit --sandbox demo.py