categorical-encoder
Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
Categorical Encoder Expert
类别编码专家
Эксперт по кодированию категориальных переменных для машинного обучения.
机器学习类别变量编码专家。
Выбор на основе кардинальности
基于基数的编码选择
| Кардинальность | Рекомендация |
|---|---|
| Низкая (<10) | One-hot, Dummy |
| Средняя (10-50) | Target, Frequency, Binary |
| Высокая (>50) | Hash, Embeddings |
| Порядковая | Ordinal |
| 基数 | 推荐编码方式 |
|---|---|
| 低基数(<10) | One-hot, Dummy |
| 中基数(10-50) | Target, Frequency, Binary |
| 高基数(>50) | Hash, Embeddings |
| 有序类别 | Ordinal |
One-Hot Encoding
One-Hot Encoding
python
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
python
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
Для pandas
Pandas实现方式
df_encoded = pd.get_dummies(df, columns=['category_col'], prefix='cat')
df_encoded = pd.get_dummies(df, columns=['category_col'], prefix='cat')
Для sklearn
Scikit-learn实现方式
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[['category_col']])
X_test_encoded = encoder.transform(X_test[['category_col']])
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[['category_col']])
X_test_encoded = encoder.transform(X_test[['category_col']])
Получить названия признаков
获取特征名称
feature_names = encoder.get_feature_names_out(['category_col'])
feature_names = encoder.get_feature_names_out(['category_col'])
Target Encoding с кросс-валидацией
带交叉验证的Target Encoding
python
from sklearn.model_selection import KFold
import numpy as np
def target_encode_cv(X, y, column, n_splits=5, alpha=1.0):
    """Target-encode a categorical column using out-of-fold means to limit leakage.

    Args:
        X: pandas DataFrame containing ``column``.
        y: pandas Series target aligned with ``X``.
        column: name of the categorical column to encode.
        n_splits: number of K-fold splits.
        alpha: Bayesian smoothing strength toward the global target mean.

    Returns:
        numpy array of encoded values, one per row of ``X``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = np.zeros(len(X))
    global_mean = y.mean()
    for train_idx, val_idx in kf.split(X):
        # Per-category target means computed on the training fold only
        category_means = y.iloc[train_idx].groupby(
            X[column].iloc[train_idx]
        ).mean()
        # Bayesian smoothing: shrink rare categories toward the global mean
        category_counts = X[column].iloc[train_idx].value_counts()
        smoothed_means = (
            category_counts * category_means + alpha * global_mean
        ) / (category_counts + alpha)
        # Encode the validation fold; categories unseen in this fold fall
        # back to the global mean
        encoded[val_idx] = X[column].iloc[val_idx].map(
            smoothed_means
        ).fillna(global_mean)
    return encoded
from sklearn.model_selection import KFold
import numpy as np
def target_encode_cv(X, y, column, n_splits=5, alpha=1.0):
    """Target-encode a categorical column using out-of-fold means to limit leakage.

    Args:
        X: pandas DataFrame containing ``column``.
        y: pandas Series target aligned with ``X``.
        column: name of the categorical column to encode.
        n_splits: number of K-fold splits.
        alpha: Bayesian smoothing strength toward the global target mean.

    Returns:
        numpy array of encoded values, one per row of ``X``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = np.zeros(len(X))
    global_mean = y.mean()
    for train_idx, val_idx in kf.split(X):
        # Per-category target means computed on the training fold only
        category_means = y.iloc[train_idx].groupby(
            X[column].iloc[train_idx]
        ).mean()
        # Bayesian smoothing: shrink rare categories toward the global mean
        category_counts = X[column].iloc[train_idx].value_counts()
        smoothed_means = (
            category_counts * category_means + alpha * global_mean
        ) / (category_counts + alpha)
        # Encode the validation fold; categories unseen in this fold fall
        # back to the global mean
        encoded[val_idx] = X[column].iloc[val_idx].map(
            smoothed_means
        ).fillna(global_mean)
    return encoded
Binary Encoding
Binary Encoding
python
import category_encoders as ce
python
import category_encoders as ce
Binary кодирование уменьшает размерность
二进制编码可降低维度
binary_encoder = ce.BinaryEncoder(cols=['high_cardinality_col'])
X_train_binary = binary_encoder.fit_transform(X_train)
X_test_binary = binary_encoder.transform(X_test)
binary_encoder = ce.BinaryEncoder(cols=['high_cardinality_col'])
X_train_binary = binary_encoder.fit_transform(X_train)
X_test_binary = binary_encoder.transform(X_test)
Для 100 категорий: one-hot = 100, binary = 7 признаков
100个类别:独热编码=100个特征,二进制编码=7个特征
print(f"Исходных категорий: {X_train['col'].nunique()}")
print(f"Binary признаков: {len([c for c in X_train_binary.columns if 'col' in c])}")
print(f"原始类别数量: {X_train['col'].nunique()}")
print(f"二进制编码特征数量: {len([c for c in X_train_binary.columns if 'col' in c])}")
Frequency и Count Encoding
Frequency与Count Encoding
python
def frequency_encode(train_series, test_series=None):
    """Encode categories by their relative frequency in the training data.

    Categories unseen at train time are encoded as 0 in the test series.
    Returns the encoded train series, or a (train, test) tuple when
    ``test_series`` is provided.
    """
    freq_map = train_series.value_counts(normalize=True).to_dict()
    train_encoded = train_series.map(freq_map)
    if test_series is not None:
        # Unseen categories get frequency 0
        test_encoded = test_series.map(freq_map).fillna(0)
        return train_encoded, test_encoded
    return train_encoded
def count_encode(train_series, test_series=None):
    """Encode categories by their absolute count in the training data.

    Categories unseen at train time are encoded as 0 in the test series.
    Returns the encoded train series, or a (train, test) tuple when
    ``test_series`` is provided.
    """
    count_map = train_series.value_counts().to_dict()
    train_encoded = train_series.map(count_map)
    if test_series is not None:
        # Unseen categories get count 0
        test_encoded = test_series.map(count_map).fillna(0)
        return train_encoded, test_encoded
    return train_encoded
def frequency_encode(train_series, test_series=None):
    """Encode categories by their relative frequency in the training data.

    Categories unseen at train time are encoded as 0 in the test series.
    Returns the encoded train series, or a (train, test) tuple when
    ``test_series`` is provided.
    """
    freq_map = train_series.value_counts(normalize=True).to_dict()
    train_encoded = train_series.map(freq_map)
    if test_series is not None:
        # Unseen categories get frequency 0
        test_encoded = test_series.map(freq_map).fillna(0)
        return train_encoded, test_encoded
    return train_encoded
def count_encode(train_series, test_series=None):
    """Encode categories by their absolute count in the training data.

    Categories unseen at train time are encoded as 0 in the test series.
    Returns the encoded train series, or a (train, test) tuple when
    ``test_series`` is provided.
    """
    count_map = train_series.value_counts().to_dict()
    train_encoded = train_series.map(count_map)
    if test_series is not None:
        # Unseen categories get count 0
        test_encoded = test_series.map(count_map).fillna(0)
        return train_encoded, test_encoded
    return train_encoded
Embeddings для высокой кардинальности
高基数特征的嵌入编码
python
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
def create_categorical_embeddings(X_train, X_test, column, n_components=10):
    """Create dense embeddings for a high-cardinality column via one-hot + SVD.

    Args:
        X_train: training DataFrame containing ``column``.
        X_test: test DataFrame containing ``column``.
        column: categorical column to embed.
        n_components: embedding dimensionality.

    Returns:
        (train_embeddings, test_embeddings, fitted_encoder, fitted_svd).
    """
    # One-hot encode (sparse output keeps memory bounded)
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_train_oh = encoder.fit_transform(X_train[[column]])
    X_test_oh = encoder.transform(X_test[[column]])
    # Reduce dimensionality with truncated SVD
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_train_emb = svd.fit_transform(X_train_oh)
    X_test_emb = svd.transform(X_test_oh)
    return X_train_emb, X_test_emb, encoder, svd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
def create_categorical_embeddings(X_train, X_test, column, n_components=10):
    """Create dense embeddings for a high-cardinality column via one-hot + SVD.

    Args:
        X_train: training DataFrame containing ``column``.
        X_test: test DataFrame containing ``column``.
        column: categorical column to embed.
        n_components: embedding dimensionality.

    Returns:
        (train_embeddings, test_embeddings, fitted_encoder, fitted_svd).
    """
    # One-hot encode (sparse output keeps memory bounded)
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_train_oh = encoder.fit_transform(X_train[[column]])
    X_test_oh = encoder.transform(X_test[[column]])
    # Reduce dimensionality with truncated SVD
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_train_emb = svd.fit_transform(X_train_oh)
    X_test_emb = svd.transform(X_test_oh)
    return X_train_emb, X_test_emb, encoder, svd
Multiple Encoding Strategy
多编码策略
python
def multi_encode_categorical(df, column, target=None):
    """Build several alternative encodings of one categorical column.

    Produces frequency, count, optional CV target, and ordinal encodings
    as columns of a new DataFrame.
    """
    encodings = {}
    # Frequency encoding
    encodings[f'{column}_freq'] = frequency_encode(df[column])
    # Count encoding
    encodings[f'{column}_count'] = count_encode(df[column])
    # Target encoding (only when a target is supplied)
    if target is not None:
        encodings[f'{column}_target'] = target_encode_cv(df, target, column)
    # Ordinal encoding for tree-based models
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    encodings[f'{column}_ordinal'] = le.fit_transform(df[column])
    return pd.DataFrame(encodings)
def multi_encode_categorical(df, column, target=None):
    """Build several alternative encodings of one categorical column.

    Produces frequency, count, optional CV target, and ordinal encodings
    as columns of a new DataFrame.
    """
    encodings = {}
    # Frequency encoding
    encodings[f'{column}_freq'] = frequency_encode(df[column])
    # Count encoding
    encodings[f'{column}_count'] = count_encode(df[column])
    # Target encoding (only when a target is supplied)
    if target is not None:
        encodings[f'{column}_target'] = target_encode_cv(df, target, column)
    # Ordinal encoding for tree-based models
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    encodings[f'{column}_ordinal'] = le.fit_transform(df[column])
    return pd.DataFrame(encodings)
Production-ready Encoder
生产级编码器
python
class RobustCategoricalEncoder:
    """Fit/transform encoder for categorical columns.

    Supports one-hot encoding (unknown categories ignored) and target
    encoding (unknown categories fall back to the global target mean).
    """

    def __init__(self, encoding_type='onehot', handle_unknown='mode'):
        # 'onehot' or 'target'
        self.encoding_type = encoding_type
        # NOTE(review): stored but never read below — presumably a fallback
        # strategy selector; confirm intended behavior.
        self.handle_unknown = handle_unknown
        self.encoders = {}          # column -> fitted encoder / mapping dict
        self.fallback_values = {}   # column -> global target mean (target mode)

    def fit(self, X, y=None):
        """Fit one encoder per object/category column of X. Returns self."""
        for column in X.select_dtypes(include=['object', 'category']).columns:
            if self.encoding_type == 'onehot':
                encoder = OneHotEncoder(
                    sparse_output=False,
                    handle_unknown='ignore'
                )
                encoder.fit(X[[column]])
                self.encoders[column] = encoder
            elif self.encoding_type == 'target' and y is not None:
                # Mean target per category, computed on training data only
                target_map = y.groupby(X[column]).mean().to_dict()
                self.encoders[column] = target_map
                self.fallback_values[column] = y.mean()
        return self

    def transform(self, X):
        """Return a transformed copy of X using the fitted encoders."""
        X_transformed = X.copy()
        for column, encoder in self.encoders.items():
            if self.encoding_type == 'onehot':
                encoded = encoder.transform(X_transformed[[column]])
                feature_names = encoder.get_feature_names_out([column])
                encoded_df = pd.DataFrame(
                    encoded,
                    columns=feature_names,
                    index=X.index
                )
                # Replace the raw column with its one-hot expansion
                X_transformed = pd.concat([
                    X_transformed.drop(column, axis=1),
                    encoded_df
                ], axis=1)
            elif self.encoding_type == 'target':
                # Unseen categories fall back to the global target mean
                X_transformed[column] = X_transformed[column].map(
                    encoder
                ).fillna(self.fallback_values[column])
        return X_transformed
class RobustCategoricalEncoder:
    """Fit/transform encoder for categorical columns.

    Supports one-hot encoding (unknown categories ignored) and target
    encoding (unknown categories fall back to the global target mean).
    """

    def __init__(self, encoding_type='onehot', handle_unknown='mode'):
        # 'onehot' or 'target'
        self.encoding_type = encoding_type
        # NOTE(review): stored but never read below — presumably a fallback
        # strategy selector; confirm intended behavior.
        self.handle_unknown = handle_unknown
        self.encoders = {}          # column -> fitted encoder / mapping dict
        self.fallback_values = {}   # column -> global target mean (target mode)

    def fit(self, X, y=None):
        """Fit one encoder per object/category column of X. Returns self."""
        for column in X.select_dtypes(include=['object', 'category']).columns:
            if self.encoding_type == 'onehot':
                encoder = OneHotEncoder(
                    sparse_output=False,
                    handle_unknown='ignore'
                )
                encoder.fit(X[[column]])
                self.encoders[column] = encoder
            elif self.encoding_type == 'target' and y is not None:
                # Mean target per category, computed on training data only
                target_map = y.groupby(X[column]).mean().to_dict()
                self.encoders[column] = target_map
                self.fallback_values[column] = y.mean()
        return self

    def transform(self, X):
        """Return a transformed copy of X using the fitted encoders."""
        X_transformed = X.copy()
        for column, encoder in self.encoders.items():
            if self.encoding_type == 'onehot':
                encoded = encoder.transform(X_transformed[[column]])
                feature_names = encoder.get_feature_names_out([column])
                encoded_df = pd.DataFrame(
                    encoded,
                    columns=feature_names,
                    index=X.index
                )
                # Replace the raw column with its one-hot expansion
                X_transformed = pd.concat([
                    X_transformed.drop(column, axis=1),
                    encoded_df
                ], axis=1)
            elif self.encoding_type == 'target':
                # Unseen categories fall back to the global target mean
                X_transformed[column] = X_transformed[column].map(
                    encoder
                ).fillna(self.fallback_values[column])
        return X_transformed
Рекомендации для моделей
各模型的编码推荐
| Модель | Рекомендуемое кодирование |
|---|---|
| Древесные (RF, XGB) | Ordinal, Target, Frequency |
| Линейные (LR, SVM) | One-hot, избегать ordinal |
| Нейронные сети | Embeddings для высокой кардинальности |
| На основе расстояния | Стандартизированные закодированные |
| 模型 | 推荐编码方式 |
|---|---|
| 树模型(RF、XGB) | Ordinal, Target, Frequency |
| 线性模型(LR、SVM) | One-hot,避免使用Ordinal |
| 神经网络 | 高基数特征使用Embeddings |
| 基于距离的模型 | 标准化编码特征 |
Предотвращение утечки данных
防止数据泄露
python
python
ПРАВИЛЬНО: fit только на train
正确做法:仅在训练集上拟合
encoder.fit(X_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)
encoder.fit(X_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)
НЕПРАВИЛЬНО: fit на всех данных
错误做法:在全量数据上拟合
encoder.fit(X_all) # Утечка!
encoder.fit(X_all) # 数据泄露!
Валидация
编码验证
python
def validate_encoding(X_original, X_encoded):
    """Print basic diagnostics comparing a DataFrame before/after encoding.

    Reports shapes, encoded memory footprint, NaN count, and the column
    expansion factor. Returns None.
    """
    print(f"Исходная размерность: {X_original.shape}")
    print(f"Закодированная размерность: {X_encoded.shape}")
    print(f"Память: {X_encoded.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    # Check for NaNs introduced by the encoding
    null_count = X_encoded.isnull().sum().sum()
    if null_count > 0:
        print(f"Предупреждение: {null_count} пустых значений")
    # Column expansion factor
    print(f"Расширение: {X_encoded.shape[1] / X_original.shape[1]:.2f}x")
def validate_encoding(X_original, X_encoded):
    """Print basic diagnostics comparing a DataFrame before/after encoding.

    Reports shapes, encoded memory footprint, NaN count, and the column
    expansion factor. Returns None.
    """
    print(f"原始数据维度: {X_original.shape}")
    print(f"编码后数据维度: {X_encoded.shape}")
    print(f"内存占用: {X_encoded.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    # Check for NaNs introduced by the encoding
    null_count = X_encoded.isnull().sum().sum()
    if null_count > 0:
        print(f"警告: {null_count}个空值")
    # Column expansion factor
    print(f"维度扩张倍数: {X_encoded.shape[1] / X_original.shape[1]:.2f}x")
Лучшие практики
最佳实践
- Fit только на train — избегайте утечки данных
- Обрабатывайте unknown — используйте fallback стратегию
- Используйте CV для target encoding — предотвращает переобучение
- Мониторьте размерность — one-hot взрывает размерность
- Выбирайте по модели — разные модели предпочитают разное
- 仅在训练集上拟合 — 避免数据泄露
- 处理未知类别 — 使用回退策略
- Target编码使用交叉验证 — 防止过拟合
- 监控维度变化 — 独热编码会导致维度爆炸
- 根据模型选择编码 — 不同模型偏好不同编码方式