Loading...
Loading...
Build a production-ready regression model on tabular data using XGBoost with conformalized quantile regression for prediction intervals. Use when the user needs to predict a continuous target from tabular features (price, sales, demand, time-to-event, score) and report uncertainty alongside the point estimate. Default to this for any tabular regression task.
npx skill4agent add brojonat/llmsrules regression
`Ridge`, `Lasso`, `ElasticNet`
<project>/
├── data/ # input parquet/csv
├── src/
│ ├── train.py # ibis read → 3 XGBRegressors → conformal cal → MLflow
│ ├── predict.py # reload models + conformal_q, return point + interval
│ └── plots.py # predicted vs actual, residual diagnostics, coverage, SHAP
├── notebooks/
│ └── demo.py # marimo walkthrough
└── mlruns/ # MLflow tracking store (gitignored)
`ibis-framework[duckdb]`, `.execute()`
import ibis
# Open the training parquet through a fresh DuckDB connection.
table = ibis.duckdb.connect().read_parquet("data/train.parquet")

# Feature columns are selected purely by their "feature_" name prefix.
feature_cols = [name for name in table.columns if name.startswith("feature_")]

# One-row summary of the target (mean, std, row count); .execute() returns
# a DataFrame and .iloc[0] pulls that single row out as a Series.
target_stats = table.aggregate(
    target_mean=table["target"].mean(),
    target_std=table["target"].std(),
    n_total=table.count(),
).execute().iloc[0]

# Materialize only the feature columns plus the target for modelling.
data = table.select(*feature_cols, "target").execute()

X = data[feature_cols]
y = data["target"]
`objective="reg:squarederror"`, `objective="reg:quantileerror"`, `quantile_alpha=0.05`, `quantile_alpha=0.95`
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
def build_xgb_regressor(feature_cols, seed, *, objective="reg:squarederror", quantile_alpha=None):
    """Assemble an unfitted scale-then-boost regression pipeline.

    Args:
        feature_cols: Column names handed to the ``StandardScaler`` inside
            the ``ColumnTransformer``.
        seed: Value passed to ``XGBRegressor`` as ``random_state``.
        objective: XGBoost objective string for the regressor.
        quantile_alpha: Forwarded to ``XGBRegressor`` only when it is not
            ``None``.

    Returns:
        A ``Pipeline`` with a ``"preprocess"`` step (the column transformer)
        followed by a ``"clf"`` step (the ``XGBRegressor``).
    """
    base_params = {
        "n_estimators": 400,
        "max_depth": 4,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_lambda": 1.0,
        "objective": objective,
        "random_state": seed,
        "n_jobs": -1,
    }
    # The quantile level is an opt-in extra; it is left out entirely when
    # the caller did not supply one.
    quantile_params = {} if quantile_alpha is None else {"quantile_alpha": quantile_alpha}

    scaler = ColumnTransformer([("num", StandardScaler(), feature_cols)])
    booster = XGBRegressor(**base_params, **quantile_params)
    return Pipeline([("preprocess", scaler), ("clf", booster)])
# Point estimator: built with the factory's default objective.
xgb_point = build_xgb_regressor(feature_cols, seed=42)

# Lower interval bound: quantile objective at the 0.05 level.
xgb_lower = build_xgb_regressor(
    feature_cols,
    seed=42,
    objective="reg:quantileerror",
    quantile_alpha=0.05,
)
xgb_upper = build_xgb_regressor(feature_cols, seed=42, objective="reg:quantileerror", quantile_alpha=0.95)
`reg:quantileerror`, `lightgbm`, `quantile_alpha=0.05/0.95`
import numpy as np
from sklearn.model_selection import train_test_split
# Step 1: hold out 20% of the training data as a calibration set.
X_train, X_calib, y_train, y_calib = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

# Step 2: fit both quantile models on the reduced training split only.
xgb_lower.fit(X_train, y_train)
xgb_upper.fit(X_train, y_train)

# Step 3: conformity score for every calibration row,
#     E_i = max(q_low(x_i) - y_i, y_i - q_high(x_i)),
# which is positive when y_i falls OUTSIDE the predicted interval.
cal_low = xgb_lower.predict(X_calib)
cal_high = xgb_upper.predict(X_calib)
conformity = np.maximum(cal_low - y_calib, y_calib - cal_high)

# Step 4: take the quantile of the scores at level
# ceil(coverage * (n + 1)) / n, capped at 1.0.
nominal_coverage = 0.90
n_cal = len(y_calib)
q_level = min(1.0, np.ceil(nominal_coverage * (n_cal + 1)) / n_cal)
conformal_q = float(np.quantile(conformity, q_level))

# Step 5: at inference time, expand the raw quantile bounds by ±conformal_q.
def predict_interval(X_new):
    """Return the conformalized ``(lower, upper)`` bounds for ``X_new``.

    The lower quantile prediction is shifted down and the upper one shifted
    up by the module-level ``conformal_q`` (a negative ``conformal_q``
    therefore narrows the interval). Reads the module-level ``xgb_lower``,
    ``xgb_upper`` and ``conformal_q``.
    """
    lower_bound = xgb_lower.predict(X_new) - conformal_q
    upper_bound = xgb_upper.predict(X_new) + conformal_q
    return lower_bound, upper_bound


# Inline-code terms that were fused onto the original `return` line:
# nominal_coverage, y_true - y_pred, reg:absoluteerror, reg:huber
import matplotlib.pyplot as plt
from scipy import stats
# Residual = actual minus predicted.
residuals = y_test - y_pred
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4.5))

# Panel 0: residuals against predictions, with a dashed zero reference line.
axes[0].scatter(y_pred, residuals, alpha=0.5)
axes[0].axhline(0, color="red", linestyle="--")

# Panel 1: density-normalized histogram of the residuals.
axes[1].hist(residuals, bins=40, density=True)
stats.probplot(residuals, dist="norm", plot=axes[2])
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# Root-mean-squared and mean-absolute error on the test set, stored as
# plain Python floats.
rmse = float(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))
mae = float(mean_absolute_error(y_true=y_test, y_pred=y_pred))
r2 = float(r2_score(y_test, y_pred))
`feature_importances_`, `TreeExplainer`
import shap
# Pull the two pipeline steps apart: the explainer is built on the bare
# XGBRegressor, so its inputs go through the preprocessing step by hand.
preprocessor = pipeline.named_steps["preprocess"]
clf = pipeline.named_steps["clf"]  # the XGBRegressor

# Only the first 200 test rows are transformed and explained.
X_test_t = preprocessor.transform(X_test.iloc[:200])

explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_test_t)
shap.summary_plot(shap_values, X_test_t, feature_names=feature_cols)
`shap.plots.waterfall`

| Kind | What |
|---|---|
| Params | data path, n_rows, n_features, target_mean / target_std, seed, lower_quantile, upper_quantile, nominal_coverage, model name, hyperparameters |
| Metrics | test_rmse, test_mae, test_r2, irreducible_rmse (when known), rmse_above_irreducible, conformal_q, coverage_raw vs coverage_conformal, interval_width_raw vs interval_width_conformal |
| Tags | data hash, target distribution stats |
| Artifacts | three models ( |
`coverage_conformal`, `nominal_coverage`, `residual vs predicted`, `reg:quantileerror`, `reg:huber`, `feature_importances_`, `RMSE - noise_std`, `demo.py`
`y = 10·sin(π·x₀·x₁) + 20·(x₂−0.5)² + 10·x₃ + 5·x₄ + ε`
`LinearRegression`