Loading...
Loading...
Compare original and translation side by side
`Ridge`, `Lasso`, `ElasticNet`
`Ridge`, `Lasso`, `ElasticNet`
<project>/
├── data/ # input parquet/csv
├── src/
│ ├── train.py # ibis read → 3 XGBRegressors → conformal cal → MLflow
│ ├── predict.py # reload models + conformal_q, return point + interval
│ └── plots.py # predicted vs actual, residual diagnostics, coverage, SHAP
├── notebooks/
│ └── demo.py # marimo walkthrough
└── mlruns/ # MLflow tracking store (gitignored)
<project>/
├── data/ # 输入的parquet/csv文件
├── src/
│ ├── train.py # ibis读取数据 → 训练3个XGBRegressors → 保形校准 → MLflow记录
│ ├── predict.py # 重新加载模型 + 保形参数,返回点估计和区间
│ └── plots.py # 预测值vs真实值、残差诊断、覆盖率、SHAP可视化
├── notebooks/
│ └── demo.py # marimo分步演示
└── mlruns/ # MLflow跟踪存储(已加入git忽略)
`ibis-framework[duckdb]`, `.execute()`
import ibis
# Read the training parquet file through ibis' DuckDB backend.
table = ibis.duckdb.connect().read_parquet("data/train.parquet")

# Every column whose name starts with "feature_" is treated as a model input.
feature_cols = [name for name in table.columns if name.startswith("feature_")]

# Target mean, target std and row count computed in a single aggregate;
# .execute() materializes the result and .iloc[0] picks its first row.
target_stats = table.aggregate(
    target_mean=table.target.mean(),
    target_std=table.target.std(),
    n_total=table.count(),
).execute().iloc[0]

# Materialize only the feature columns plus the target column.
data = table.select(*feature_cols, "target").execute()
X = data[feature_cols]
y = data["target"]
`ibis-framework[duckdb]`, `.execute()`
import ibis
# Read the training parquet file through ibis' DuckDB backend.
table = ibis.duckdb.connect().read_parquet("data/train.parquet")

# Every column whose name starts with "feature_" is treated as a model input.
feature_cols = [name for name in table.columns if name.startswith("feature_")]

# Target mean, target std and row count computed in a single aggregate;
# .execute() materializes the result and .iloc[0] picks its first row.
target_stats = table.aggregate(
    target_mean=table.target.mean(),
    target_std=table.target.std(),
    n_total=table.count(),
).execute().iloc[0]

# Materialize only the feature columns plus the target column.
data = table.select(*feature_cols, "target").execute()
X = data[feature_cols]
y = data["target"]
`objective="reg:squarederror"`, `objective="reg:quantileerror"`, `quantile_alpha=0.05`, `quantile_alpha=0.95`
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
def build_xgb_regressor(feature_cols, seed, *, objective="reg:squarederror", quantile_alpha=None):
    """Build an unfitted scaler + XGBoost regression pipeline.

    Args:
        feature_cols: Columns handed to the ``StandardScaler`` inside the
            ``ColumnTransformer``.
        seed: Forwarded to ``XGBRegressor`` as ``random_state``.
        objective: XGBoost objective string; defaults to
            ``"reg:squarederror"``.
        quantile_alpha: Forwarded as ``quantile_alpha`` only when it is not
            ``None``, so the default call never passes it.

    Returns:
        A ``Pipeline`` with the steps ``"preprocess"`` and ``"clf"``.
    """
    kwargs = dict(
        n_estimators=400, max_depth=4, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        objective=objective, random_state=seed, n_jobs=-1,
    )
    if quantile_alpha is not None:
        kwargs["quantile_alpha"] = quantile_alpha
    return Pipeline([
        ("preprocess", ColumnTransformer([("num", StandardScaler(), feature_cols)])),
        # The step is a regressor despite the "clf" name; the name is kept
        # because the SHAP snippet looks the model up via named_steps["clf"].
        ("clf", XGBRegressor(**kwargs)),
    ])
# Point-estimate model: relies on the builder's default "reg:squarederror" objective.
xgb_point = build_xgb_regressor(feature_cols, seed=42)
# Lower-bound model: quantile objective with quantile_alpha=0.05, same seed as the point model.
xgb_lower = build_xgb_regressor(feature_cols, seed=42, objective="reg:quantileerror", quantile_alpha=0.05)
xgb_upper = build_xgb_regressor(feature_cols, seed=42, objective="reg:quantileerror", quantile_alpha=0.95)
`reg:quantileerror`, `lightgbm`
`objective="reg:squarederror"`, `objective="reg:quantileerror"`, `quantile_alpha=0.05`, `quantile_alpha=0.95`
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
def build_xgb_regressor(feature_cols, seed, *, objective="reg:squarederror", quantile_alpha=None):
    """Build an unfitted scaler + XGBoost regression pipeline.

    Args:
        feature_cols: Columns handed to the ``StandardScaler`` inside the
            ``ColumnTransformer``.
        seed: Forwarded to ``XGBRegressor`` as ``random_state``.
        objective: XGBoost objective string; defaults to
            ``"reg:squarederror"``.
        quantile_alpha: Forwarded as ``quantile_alpha`` only when it is not
            ``None``, so the default call never passes it.

    Returns:
        A ``Pipeline`` with the steps ``"preprocess"`` and ``"clf"``.
    """
    kwargs = dict(
        n_estimators=400, max_depth=4, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        objective=objective, random_state=seed, n_jobs=-1,
    )
    if quantile_alpha is not None:
        kwargs["quantile_alpha"] = quantile_alpha
    return Pipeline([
        ("preprocess", ColumnTransformer([("num", StandardScaler(), feature_cols)])),
        # The step is a regressor despite the "clf" name; the name is kept
        # because the SHAP snippet looks the model up via named_steps["clf"].
        ("clf", XGBRegressor(**kwargs)),
    ])
# Point-estimate model: relies on the builder's default "reg:squarederror" objective.
xgb_point = build_xgb_regressor(feature_cols, seed=42)
# Lower-bound model: quantile objective with quantile_alpha=0.05, same seed as the point model.
xgb_lower = build_xgb_regressor(feature_cols, seed=42, objective="reg:quantileerror", quantile_alpha=0.05)
xgb_upper = build_xgb_regressor(feature_cols, seed=42, objective="reg:quantileerror", quantile_alpha=0.95)
`reg:quantileerror`, `lightgbm`
`quantile_alpha=0.05/0.95`
import numpy as np
from sklearn.model_selection import train_test_split
`quantile_alpha=0.05/0.95`
import numpy as np
from sklearn.model_selection import train_test_split
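This is **conformalized quantile regression** (Romano, Patterson,
Candès 2019). It guarantees marginal coverage of at least
`nominal_coverage` on test data that is exchangeable with the
calibration set (for example, drawn i.i.d. from the same distribution).
The intervals usually get wider — the correction can also be negative
and shrink them when the raw quantile models over-cover — but they're
now honest.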
**Always log both the raw and conformalized empirical coverage** to
MLflow so you can see the gap. If raw coverage is already at the
nominal level, your data is well-behaved and the correction is small;
if it's far off, the correction was load-bearing.
这就是**保形分位数回归**(Romano, Patterson, Candes 2019)。它能保证从与校准集同分布的测试数据中得到至少`nominal_coverage`的边际覆盖率。区间会变宽,但结果更可靠。
**务必将原始和保形后的经验覆盖率都记录到MLflow**,以便查看差距。若原始覆盖率已达到标称值,说明数据表现良好,校准修正幅度较小;若差距较大,则校准是必不可少的。
`y_true - y_pred`, `reg:absoluteerror`, `reg:huber`
import matplotlib.pyplot as plt
from scipy import stats
# Residuals follow the y_true - y_pred sign convention.
residuals = y_test - y_pred
# One row of three panels; the first two are filled here.
fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
# Panel 1: residuals against predictions, with a dashed red zero line for reference.
axes[0].scatter(y_pred, residuals, alpha=0.5)
axes[0].axhline(0, color="red", ls="--")
# Panel 2: density-normalized histogram of the residuals.
axes[1].hist(residuals, bins=40, density=True)
stats.probplot(residuals, dist="norm", plot=axes[2])
`y_true - y_pred`, `reg:absoluteerror`, `reg:huber`
import matplotlib.pyplot as plt
from scipy import stats
# Residuals follow the y_true - y_pred sign convention.
residuals = y_test - y_pred
# One row of three panels; the first two are filled here.
fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
# Panel 1: residuals against predictions, with a dashed red zero line for reference.
axes[0].scatter(y_pred, residuals, alpha=0.5)
axes[0].axhline(0, color="red", ls="--")
# Panel 2: density-normalized histogram of the residuals.
axes[1].hist(residuals, bins=40, density=True)
stats.probplot(residuals, dist="norm", plot=axes[2])
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# RMSE is the square root of the mean squared error; float() turns the NumPy scalar into a plain Python float.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
# Mean absolute error, likewise converted to a plain float.
mae = float(mean_absolute_error(y_test, y_pred))
r2 = float(r2_score(y_test, y_pred))
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# RMSE is the square root of the mean squared error; float() turns the NumPy scalar into a plain Python float.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
# Mean absolute error, likewise converted to a plain float.
mae = float(mean_absolute_error(y_test, y_pred))
r2 = float(r2_score(y_test, y_pred))
`feature_importances_`, `TreeExplainer`
import shap
# Pull the model and the preprocessing step out of the pipeline by step name.
clf = pipeline.named_steps["clf"] # the XGBRegressor
preprocessor = pipeline.named_steps["preprocess"]
# Only the first 200 test rows are transformed and explained.
X_test_t = preprocessor.transform(X_test.iloc[:200])
# The explainer wraps the bare model, so it is given the already-transformed features.
explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_test_t)
shap.summary_plot(shap_values, X_test_t, feature_names=feature_cols)
`shap.plots.waterfall`
`feature_importances_`, `TreeExplainer`
import shap
# Pull the model and the preprocessing step out of the pipeline by step name.
clf = pipeline.named_steps["clf"] # the XGBRegressor instance
preprocessor = pipeline.named_steps["preprocess"]
# Only the first 200 test rows are transformed and explained.
X_test_t = preprocessor.transform(X_test.iloc[:200])
# The explainer wraps the bare model, so it is given the already-transformed features.
explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_test_t)
shap.summary_plot(shap_values, X_test_t, feature_names=feature_cols)
`shap.plots.waterfall`
| Kind | What |
|---|---|
| data path, n_rows, n_features, target_mean / target_std, seed, lower_quantile, upper_quantile, nominal_coverage, model name, hyperparameters |
| test_rmse, test_mae, test_r2, irreducible_rmse (when known), rmse_above_irreducible, conformal_q, coverage_raw vs coverage_conformal, interval_width_raw vs interval_width_conformal |
| data hash, target distribution stats |
| three models ( |
`coverage_conformal`, `nominal_coverage`
| 类型 | 内容 |
|---|---|
| 数据路径、行数、特征数、目标均值/标准差、随机种子、低分位数、高分位数、标称覆盖率、模型名称、超参数 |
| 测试集RMSE、测试集MAE、测试集R²、不可约RMSE(已知时)、RMSE超出不可约误差部分、conformal_q、原始覆盖率vs保形后覆盖率、原始区间宽度vs保形后区间宽度 |
| 数据哈希、目标分布统计量 |
| 三个模型( |
`coverage_conformal`, `nominal_coverage`
residual vs predicted
`reg:quantileerror`, `reg:huber`, `feature_importances_`, `RMSE - noise_std`
`reg:quantileerror`, `reg:huber`, `feature_importances_`, `RMSE - noise_std`
`demo.py`: y = 10·sin(π·x₀·x₁) + 20·(x₂−0.5)² + 10·x₃ + 5·x₄ + ε, `LinearRegression`
`demo.py`: y = 10·sin(π·x₀·x₁) + 20·(x₂−0.5)² + 10·x₃ + 5·x₄ + ε, `LinearRegression`