Loading...
Loading...
Compare original and translation side by side
MultiOutputClassifierClassifierChainMultiOutputClassifierClassifierChainmulticlass-classificationbinary-classificationmulticlass-classificationbinary-classification<project>/
├── data/
├── src/
│ ├── train.py # ibis read → MultiOutputClassifier(XGBClassifier) → MLflow
│ ├── predict.py # reload, return per-row label vector + per-label probas
│ └── plots.py # label balance, co-occurrence, per-label metrics, cardinality
├── notebooks/
│ └── demo.py
└── mlruns/<project>/
├── data/
├── src/
│ ├── train.py # ibis读取数据 → MultiOutputClassifier(XGBClassifier) → MLflow
│ ├── predict.py # 加载模型,返回每行的标签向量+单标签概率
│ └── plots.py # 标签分布、共现情况、单标签指标、标签基数
├── notebooks/
│ └── demo.py
└── mlruns/import ibis
# Read the training Parquet file through a DuckDB connection.
table = ibis.duckdb.connect().read_parquet("data/train.parquet")

# Columns are split by naming convention: "feature_*" are the model inputs,
# "label_*" are the per-label targets.
feature_cols = [name for name in table.columns if name.startswith("feature_")]
label_cols = [name for name in table.columns if name.startswith("label_")]

# Keep only the columns that are needed, then run the query.
data = table.select(*feature_cols, *label_cols).execute()
X = data[feature_cols]
Y = data[label_cols].to_numpy().astype(int) # shape: (n_samples, n_labels)Yimport ibis
# Read the training Parquet file through a DuckDB connection.
table = ibis.duckdb.connect().read_parquet("data/train.parquet")

# Columns are split by naming convention: "feature_*" are the model inputs,
# "label_*" are the per-label targets.
feature_cols = [name for name in table.columns if name.startswith("feature_")]
label_cols = [name for name in table.columns if name.startswith("label_")]

# Keep only the columns that are needed, then run the query.
data = table.select(*feature_cols, *label_cols).execute()
X = data[feature_cols]
Y = data[label_cols].to_numpy().astype(int) # 形状: (n_samples, n_labels)YMultiOutputClassifierMultiOutputClassifierfrom sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
def build_pipeline(feature_cols, seed):
return Pipeline([
("preprocess", ColumnTransformer([("num", StandardScaler(), feature_cols)])),
("clf", MultiOutputClassifier(
XGBClassifier(
n_estimators=300,
max_depth=4,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
reg_lambda=1.0,
objective="binary:logistic",
eval_metric="logloss",
random_state=seed,
n_jobs=-1,
),
n_jobs=-1, # parallelize across labels
)),
])MultiOutputClassifierbinary:logisticfrom sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
def build_pipeline(feature_cols, seed):
return Pipeline([
("preprocess", ColumnTransformer([("num", StandardScaler(), feature_cols)])),
("clf", MultiOutputClassifier(
XGBClassifier(
n_estimators=300,
max_depth=4,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
reg_lambda=1.0,
objective="binary:logistic",
eval_metric="logloss",
random_state=seed,
n_jobs=-1,
),
n_jobs=-1, # 跨标签并行训练
)),
])MultiOutputClassifierbinary:logisticfrom sklearn.metrics import hamming_loss, accuracy_score
ham = hamming_loss(Y_test, Y_pred) # primary metric, lower = better
exact_match = accuracy_score(Y_test, Y_pred) # subset accuracy — too strict alonefrom sklearn.metrics import hamming_loss, accuracy_score
ham = hamming_loss(Y_test, Y_pred) # 核心指标,值越小越好
exact_match = accuracy_score(Y_test, Y_pred) # 子集准确率——单独使用过于严格f1_scoresamples| Average | What it computes | When to use |
|---|---|---|
| macro | Unweighted mean of per-label F1 | All labels matter equally — rare labels drag the average down (good) |
| micro | F1 over the pooled TP / FP / FN counts of all labels | Overall correctness across all label slots |
| weighted | Per-label F1 weighted by support | Weights toward common labels — hides rare-label failures |
| samples | Per-row F1, then averaged across rows | Per-row "did we get the labels mostly right?" — useful for tagging tasks |
from sklearn.metrics import f1_score
f1_macro = f1_score(Y_test, Y_pred, average="macro", zero_division=0)
f1_micro = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
f1_weighted = f1_score(Y_test, Y_pred, average="weighted", zero_division=0)
f1_samples = f1_score(Y_test, Y_pred, average="samples", zero_division=0)f1_scoresamples| 平均策略 | 计算方式 | 使用场景 |
|---|---|---|
| macro | 单标签F1值的未加权平均值 | 所有标签同等重要——稀有标签会拉低平均值(这是好事,能暴露问题) |
| micro | 基于所有标签汇总的 TP / FP / FN 计算的F1值 | 衡量所有标签槽位的整体正确性 |
| weighted | 按标签样本量加权的单标签F1平均值 | 向常见标签倾斜——会掩盖稀有标签的性能问题 |
| samples | 每行的F1值,再求所有行的平均值 | 衡量每行“是否大致预测对了标签”——适用于标签标注任务 |
from sklearn.metrics import f1_score
f1_macro = f1_score(Y_test, Y_pred, average="macro", zero_division=0)
f1_micro = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
f1_weighted = f1_score(Y_test, Y_pred, average="weighted", zero_division=0)
f1_samples = f1_score(Y_test, Y_pred, average="samples", zero_division=0)MultiOutputClassifierimport numpy as np
n_labels = Y.shape[1]
cooc = np.zeros((n_labels, n_labels))
for i in range(n_labels):
i_count = int(Y[:, i].sum())
if i_count == 0:
continue
for j in range(n_labels):
cooc[i, j] = float(((Y[:, i] == 1) & (Y[:, j] == 1)).sum() / i_count)MultiOutputClassifierimport numpy as np
# Conditional co-occurrence matrix: cooc[i, j] is the fraction of rows that
# have label i set which also have label j set, i.e. an empirical estimate
# of P(label_j | label_i). The diagonal is 1.0 for every label that occurs.
n_labels = Y.shape[1]
cooc = np.zeros((n_labels, n_labels))
for i in range(n_labels):
    i_count = int(Y[:, i].sum())
    if i_count == 0:
        # Label i never occurs: leave its row at zero rather than divide by zero.
        continue
    # The mask for label i does not depend on j, so build it once per row.
    has_i = Y[:, i] == 1
    for j in range(n_labels):
        cooc[i, j] = float((has_i & (Y[:, j] == 1)).sum() / i_count)
If most off-diagonal entries hover around the marginal P(label_j),
labels are roughly independent → use `MultiOutputClassifier`. If
some off-diagonal entries are much higher than the marginals, labels
are correlated → consider `ClassifierChain`.
如果大多数非对角线元素接近label_j的边缘概率,说明标签大致独立→使用`MultiOutputClassifier`。如果某些非对角线元素远高于边缘概率,说明标签存在相关性→考虑使用`ClassifierChain`。ClassifierChainClassifierChainfrom sklearn.multioutput import ClassifierChain
clf_chain = ClassifierChain(
XGBClassifier(...),
order=[0, 1, 2, 3, 4, 5], # or "random" for cross-validated stability
random_state=42,
)ClassifierChainMultiOutputClassifierClassifierChainfrom sklearn.multioutput import ClassifierChain
clf_chain = ClassifierChain(
XGBClassifier(...),
order=[0, 1, 2, 3, 4, 5], # 或设为"random"以通过交叉验证保证稳定性
random_state=42,
)ClassifierChainMultiOutputClassifierClassifierChainfor i, lbl in enumerate(label_cols):
f1_i = float(f1_score(Y_test[:, i], Y_pred[:, i], average="binary", zero_division=0))
mlflow.log_metric(f"test_f1__{lbl}", f1_i)for i, lbl in enumerate(label_cols):
f1_i = float(f1_score(Y_test[:, i], Y_pred[:, i], average="binary", zero_division=0))
mlflow.log_metric(f"test_f1__{lbl}", f1_i)| Kind | What |
|---|---|
| params | data path, n_rows, n_features, n_labels, label_columns, seed, hyperparameters |
| metrics | hamming_loss (primary), subset_accuracy, F1 macro / micro / weighted / samples, per-label F1 (one metric per label), label cardinality (true vs predicted) |
| tags | data hash, label cardinality / density from sidecar |
| artifacts | model, label balance bar, co-occurrence heatmap, per-label metrics bar, label cardinality histogram (true vs pred) |
| 类型 | 记录内容 |
|---|---|
| params | 数据路径、样本数、特征数、标签数、标签列、随机种子、超参数 |
| metrics | hamming_loss(核心指标)、subset_accuracy、F1 macro/micro/weighted/samples、单标签F1值(每个标签对应一个指标)、标签基数(真实值vs预测值) |
| tags | 数据哈希、标签基数/密度(来自辅助文件) |
| artifacts | 模型、标签分布柱状图、标签共现热力图、单标签指标柱状图、标签基数直方图(真实值vs预测值) |
MultiOutputClassifierMultiOutputClassifier(..., n_jobs=-1)zero_division=0zero_division=0scale_pos_weightMultiOutputClassifiersample_weightY.shape == (n_samples, n_labels)y.shape == (n_samples,)[0, n_classes)f1_scoreMultiOutputClassifierMultiOutputClassifier(..., n_jobs=-1)zero_division=0zero_division=0scale_pos_weightMultiOutputClassifiersample_weightY.shape == (n_samples, n_labels)y.shape == (n_samples,)[0, n_classes)f1_scoredemo.pyMultiOutputClassifierdemo.pyMultiOutputClassifier