特征重要性分析
目录
1. LightGBM 内置重要性
1.1 两种重要性类型
LightGBM 提供两种特征重要性:
| 类型 | 说明 | 计算方式 | 优点 | 缺点 |
|---|---|---|---|---|
| split | 分裂次数 | 特征被用作分裂点的次数 | 直观,表示使用频率 | 偏向高基数特征 |
| gain | 信息增益 | 特征分裂带来的总增益 | 更反映贡献 | 计算稍慢 |
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# 生成示例数据
np.random.seed(42)
X, y = make_regression(
n_samples=5000,
n_features=20,
n_informative=8,
noise=0.1,
random_state=42
)
# 特征命名
feature_names = [f'factor_{i}' for i in range(20)]
# 训练模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = lgb.LGBMRegressor(
n_estimators=200,
max_depth=6,
learning_rate=0.05,
random_state=42,
verbose=-1
)
model.fit(X_train, y_train)
# 获取特征重要性
importance_split = model.booster_.feature_importance(importance_type='split')
importance_gain = model.booster_.feature_importance(importance_type='gain')
# 组织成 DataFrame
importance_df = pd.DataFrame({
'feature': feature_names,
'importance_split': importance_split,
'importance_gain': importance_gain
})
# 排序
importance_df_split = importance_df.sort_values('importance_split', ascending=False)
importance_df_gain = importance_df.sort_values('importance_gain', ascending=False)
print("=== Split 重要性 (Top 10) ===")
print(importance_df_split.head(10)[['feature', 'importance_split']])
print("\n=== Gain 重要性 (Top 10) ===")
print(importance_df_gain.head(10)[['feature', 'importance_gain']])1.2 可视化内置重要性
import matplotlib.pyplot as plt
def plot_feature_importance(importance_df, importance_type='split', top_n=15):
"""绘制特征重要性"""
df_sorted = importance_df.sort_values(importance_type, ascending=False).head(top_n)
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(df_sorted)))
ax.barh(range(len(df_sorted)), df_sorted[f'importance_{importance_type}'], color=colors)
ax.set_yticks(range(len(df_sorted)))
ax.set_yticklabels(df_sorted['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Feature Importance ({importance_type.capitalize()})')
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
plot_feature_importance(importance_df, 'split')
plot_feature_importance(importance_df, 'gain')1.3 归一化重要性
def normalize_importance(importance_df, importance_type='gain'):
    """Express feature importances as percentages of their total.

    Returns a copy of *importance_df* with an added
    'importance_<type>_pct' column, sorted by that column descending.
    """
    src_col = f'importance_{importance_type}'
    pct_col = f'{src_col}_pct'
    out = importance_df.copy()
    out[pct_col] = out[src_col] / out[src_col].sum() * 100
    return out.sort_values(pct_col, ascending=False)
# 归一化
importance_norm = normalize_importance(importance_df, 'gain')
print("归一化 Gain 重要性 (Top 10):")
print(importance_norm.head(10)[['feature', 'importance_gain_pct']].round(2))2. 排列重要性
2.1 原理
排列重要性(Permutation Importance)通过打乱特征值来评估其重要性:
- 计算基准模型性能
- 对某个特征的值进行随机打乱
- 重新计算模型性能
- 性能下降越多 = 特征越重要
优势:
- 不依赖于模型内部实现
- 可以用于任何模型
- 更能反映特征的真实贡献
2.2 实现排列重要性
from sklearn.metrics import mean_squared_error
def permutation_importance(model, X, y, metric=None,
                           n_repeats=5, random_state=42,
                           feature_names=None):
    """Model-agnostic permutation importance.

    Shuffles one feature column at a time and measures how much the
    score degrades relative to the un-permuted baseline.

    Parameters:
        model: fitted model exposing .predict(X)
        X: 2-D feature array (n_samples, n_features)
        y: true targets
        metric: score function metric(y_true, y_pred), lower is better;
            defaults to mean squared error
        n_repeats: number of shuffles per feature (results are averaged)
        random_state: seed for reproducible permutations
        feature_names: optional labels; defaults to 'feature_<i>'

    Returns:
        (importance_df, baseline_score): DataFrame with columns
        'feature', 'importance_mean', 'importance_std', sorted by mean
        importance descending, plus the baseline metric value.
        (The original docstring claimed a single return value.)
    """
    if metric is None:
        # Default to MSE so this function has no hard sklearn dependency.
        metric = lambda y_true, y_pred: float(
            np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
    # Local RandomState: reproducible without mutating the global RNG.
    rng = np.random.RandomState(random_state)
    # Baseline performance on the untouched data.
    baseline_score = metric(y, model.predict(X))
    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(n_features)]
    results = []
    for feature_idx in range(n_features):
        importance_scores = []
        for _ in range(n_repeats):
            # Copy the data and shuffle only the current feature column.
            X_permuted = X.copy()
            perm_indices = rng.permutation(len(X))
            X_permuted[:, feature_idx] = X[perm_indices, feature_idx]
            permuted_score = metric(y, model.predict(X_permuted))
            # Importance = performance drop. The metric is lower-is-better,
            # so permuted - baseline >= 0 for genuinely useful features.
            importance_scores.append(permuted_score - baseline_score)
        results.append({
            'feature': feature_names[feature_idx],
            'importance_mean': np.mean(importance_scores),
            'importance_std': np.std(importance_scores)
        })
    importance_df = pd.DataFrame(results).sort_values('importance_mean',
                                                      ascending=False)
    return importance_df, baseline_score
# 计算排列重要性
perm_importance, baseline = permutation_importance(
model, X_test, y_test,
metric=mean_squared_error,
n_repeats=10
)
print(f"基准 MSE: {baseline:.4f}\n")
print("排列重要性 (Top 10):")
print(perm_importance.head(10))2.3 可视化排列重要性
def plot_permutation_importance(perm_importance, top_n=15):
"""绘制排列重要性"""
df = perm_importance.head(top_n).sort_values('importance_mean')
fig, ax = plt.subplots(figsize=(10, 6))
y_positions = range(len(df))
ax.barh(
y_positions,
df['importance_mean'],
xerr=df['importance_std'],
color='steelblue',
alpha=0.8,
error_kw={'linewidth': 2, 'capsize': 3}
)
ax.set_yticks(y_positions)
ax.set_yticklabels(df['feature'])
ax.set_xlabel('Importance (Performance Drop)')
ax.set_title('Permutation Importance (Error Bars = Std Dev)')
ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
plot_permutation_importance(perm_importance)3. SHAP 值分析
3.1 SHAP 原理
SHAP(SHapley Additive exPlanations)基于博弈论的 Shapley 值:
- 每个特征的贡献 = 所有可能特征组合的边际贡献的加权平均
- 满足可加性:预测值 = 基准值 + 各特征 SHAP 值
3.2 SHAP 基础使用
# 安装 shap: pip install shap
import shap
# 计算 SHAP 值
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Summary plot - 全局特征重要性
shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot")
plt.tight_layout()
plt.show()
# 特征重要性排序
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame({
'feature': feature_names,
'shap_importance': mean_abs_shap
}).sort_values('shap_importance', ascending=False)
print("\nSHAP 特征重要性 (Top 10):")
print(shap_importance.head(10))3.3 SHAP 详解图
# Detailed summary plot
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
plot_type="layered", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot (Layered)")
plt.tight_layout()
plt.show()
# Bar plot - 按重要性排序
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
plot_type="bar", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Feature Importance (Bar)")
plt.tight_layout()
plt.show()3.4 SHAP 依赖图
分析单个特征的影响和交互作用。
def plot_shap_dependence(shap_values, X, feature_idx, feature_names):
"""绘制 SHAP 依赖图"""
feature_name = feature_names[feature_idx]
shap.dependence_plot(
feature_idx,
shap_values,
X,
feature_names=feature_names,
show=False
)
plt.gcf().set_size_inches(8, 6)
plt.title(f"SHAP Dependence Plot: {feature_name}")
plt.tight_layout()
plt.show()
# 绘制最重要特征的依赖图
top_feature_idx = shap_importance.iloc[0]['feature'].replace('feature_', '')
top_feature_idx = int(top_feature_idx)
plot_shap_dependence(shap_values, X_test, top_feature_idx, feature_names)3.5 SHAP 单样本解释
# Force plot - 单样本解释
sample_idx = 0
# 预测值
pred = model.predict(X_test[sample_idx:sample_idx+1])[0]
actual = y_test[sample_idx]
print(f"样本 {sample_idx}:")
print(f" 实际值: {actual:.4f}")
print(f" 预测值: {pred:.4f}")
print(f" 误差: {abs(pred - actual):.4f}")
# Force plot
shap.force_plot(
explainer.expected_value,
shap_values[sample_idx],
X_test[sample_idx],
feature_names=feature_names,
matplotlib=True,
show=False
)
plt.gcf().set_size_inches(12, 3)
plt.title(f"SHAP Force Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()
# Waterfall plot - 更清晰的单样本解释
shap.waterfall_plot(
shap.Explanation(
values=shap_values[sample_idx],
base_values=explainer.expected_value,
data=X_test[sample_idx],
feature_names=feature_names
),
show=False
)
plt.gcf().set_size_inches(10, 6)
plt.title(f"SHAP Waterfall Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()4. 全局 vs 局部解释
4.1 全局解释
目的: 理解模型整体的行为模式
方法:
- 特征重要性排名
- SHAP summary plot
- 特征影响分布
def global_feature_analysis(model, X, y, feature_names, explainer=None):
"""全局特征分析"""
n_features = X.shape[1]
# 1. 内置重要性
split_importance = model.booster_.feature_importance('split')
gain_importance = model.booster_.feature_importance('gain')
# 2. SHAP 重要性
if explainer is None:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap_importance = np.abs(shap_values).mean(axis=0)
# 3. 排列重要性
perm_imp, _ = permutation_importance(model, X, y, n_repeats=5)
# 汇总
global_df = pd.DataFrame({
'feature': feature_names,
'split_importance': split_importance,
'gain_importance': gain_importance,
'shap_importance': shap_importance,
'permutation_importance': perm_imp['importance_mean'].values
})
# 归一化
for col in ['split_importance', 'gain_importance', 'shap_importance', 'permutation_importance']:
global_df[f'{col}_norm'] = global_df[col] / global_df[col].sum()
return global_df
# 全局分析
global_analysis = global_feature_analysis(model, X_test, y_test, feature_names)
print("全局特征分析 (归一化后):")
print(global_analysis[['feature', 'gain_importance_norm',
'shap_importance_norm', 'permutation_importance_norm']]
.sort_values('gain_importance_norm', ascending=False)
.head(10)
.round(4))4.2 局部解释
目的: 理解单个预测的决策过程
def local_explanation(model, explainer, X, sample_idx, feature_names):
    """Explain one prediction: per-feature SHAP contributions, largest first.

    Prints the base value, the prediction, the additivity check
    (base value + sum of SHAP values), and the top-10 contributions;
    returns the full per-feature DataFrame.
    """
    row = X[sample_idx:sample_idx + 1]
    # Prediction for this sample.
    pred = model.predict(row)[0]
    # Per-feature SHAP contributions for this sample.
    contribs = explainer.shap_values(row)[0]
    base_value = explainer.expected_value
    # One row per feature, ordered by absolute contribution.
    explanation = pd.DataFrame({
        'feature': feature_names,
        'value': X[sample_idx],
        'shap_value': contribs,
    })
    explanation = explanation.sort_values('shap_value', key=abs, ascending=False)
    print(f"=== 样本 {sample_idx} 的局部解释 ===")
    print(f"基准值: {base_value:.4f}")
    print(f"预测值: {pred:.4f}")
    print(f"SHAP 总和: {base_value + contribs.sum():.4f}")
    print(f"\n特征贡献 (Top 10):")
    print(explanation.head(10).to_string(index=False))
    return explanation
# 局部解释
local_expl = local_explanation(model, explainer, X_test, 0, feature_names)5. 特征重要性稳定性
5.1 时间稳定性分析
def temporal_feature_importance(model, X_list, y_list, feature_names):
    """Track how gain importance shifts across consecutive data periods.

    Parameters:
        model: estimator template (cloned and refit for each period)
        X_list: per-period feature matrices
        y_list: per-period targets
        feature_names: feature labels (kept for interface symmetry)

    Returns:
        DataFrame with one row per period, columns 'period' plus
        'feat_0', 'feat_1', ... holding the per-period gain importances.
    """
    from sklearn.base import clone

    records = []
    for period, (X_period, y_period) in enumerate(zip(X_list, y_list)):
        # Refit a fresh copy of the model on this period only.
        fitted = clone(model)
        fitted.fit(X_period, y_period)
        gains = fitted.booster_.feature_importance('gain')
        row = {'period': period}
        row.update({f'feat_{i}': g for i, g in enumerate(gains)})
        records.append(row)
    importance_df = pd.DataFrame(records)

    # Plot the 5 features with the highest average importance.
    top_features = (importance_df.drop('period', axis=1)
                    .mean().sort_values(ascending=False).head(5).index)
    fig, ax = plt.subplots(figsize=(12, 5))
    for feat in top_features:
        ax.plot(importance_df['period'], importance_df[feat], marker='o', label=feat)
    ax.set_xlabel('Period')
    ax.set_ylabel('Importance (Gain)')
    ax.set_title('Feature Importance Over Time')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return importance_df
# 模拟不同时期的数据
n_periods = 5
X_list = []
y_list = []
for i in range(n_periods):
start = i * 200
end = start + 500
X_list.append(X[start:end])
y_list.append(y[start:end])
# 分析
temporal_imp = temporal_feature_importance(model, X_list, y_list, feature_names)5.2 交叉验证稳定性
def cv_feature_importance(model, X, y, cv=5, random_state=42, names=None):
    """Gain importance per CV fold, plus stability statistics.

    Parameters:
        model: estimator template (cloned and refit per fold)
        X, y: training data (indexed positionally)
        cv: number of folds
        random_state: kept for interface compatibility; unused because
            KFold runs with shuffle=False to preserve temporal order
        names: feature labels; defaults to the module-level
            ``feature_names`` (the original implicitly read that global)

    Returns:
        (importance_df, importance_stats): per-fold importances and
        per-feature mean/std/coefficient-of-variation, sorted by mean.
    """
    from sklearn.model_selection import KFold
    from sklearn.base import clone
    if names is None:
        names = feature_names  # module-level default, as in the original usage
    kf = KFold(n_splits=cv, shuffle=False)  # no shuffle: keep time order
    importance_list = []
    for fold, (train_idx, _) in enumerate(kf.split(X)):
        model_fold = clone(model)
        model_fold.fit(X[train_idx], y[train_idx])
        gain_importance = model_fold.booster_.feature_importance('gain')
        importance_list.append({
            'fold': fold,
            **{f'feat_{i}': imp for i, imp in enumerate(gain_importance)}
        })
    importance_df = pd.DataFrame(importance_list)
    # Per-feature stability statistics across folds.
    fold_values = importance_df.drop('fold', axis=1)
    importance_stats = pd.DataFrame({
        'feature': names,
        'mean': fold_values.mean().values,
        'std': fold_values.std().values,
        # Coefficient of variation: high values flag unstable features.
        'cv': fold_values.std().values / fold_values.mean().values
    }).sort_values('mean', ascending=False)
    print("特征重要性交叉验证统计 (Top 10):")
    print(importance_stats.head(10).round(4))
    return importance_df, importance_stats
# 分析
cv_imp, cv_stats = cv_feature_importance(model, X_train, y_train, cv=5)
# 可视化
top_features = cv_stats.head(5)['feature'].values
fig, ax = plt.subplots(figsize=(12, 5))
for feat in top_features:
feat_col = f'feat_{feature_names.index(feat)}'
ax.plot(range(1, 6), cv_imp[feat_col], marker='o', label=feat)
ax.set_xlabel('Fold')
ax.set_ylabel('Importance (Gain)')
ax.set_title('Feature Importance Across CV Folds')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()6. 多重共线性检测
6.1 VIF(方差膨胀因子)
检测特征间的共线性。
$\mathrm{VIF}_i = \dfrac{1}{1 - R_i^2}$,其中 $R_i^2$ 是用其他特征预测特征 $i$ 所得回归的判定系数 $R^2$。
| VIF 值 | 共线性程度 |
|---|---|
| < 5 | 可接受 |
| 5-10 | 中等共线性 |
| > 10 | 严重共线性 |
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
def calculate_vif(X, feature_names):
    """Variance inflation factor (VIF) for every feature.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 is the coefficient of
    determination from regressing feature i on all other features
    (ordinary least squares with an intercept).

    Parameters:
        X: 2-D feature matrix (n_samples, n_features)
        feature_names: labels, one per column

    Returns:
        DataFrame with 'feature' and 'VIF' columns, sorted by VIF
        descending. Perfectly collinear or constant features get inf.
    """
    X = np.asarray(X, dtype=float)
    n_samples, n_features = X.shape
    vif_data = []
    for i in range(n_features):
        others = [j for j in range(n_features) if j != i]
        if not others:
            # A single feature cannot be explained by the (empty) rest.
            vif = np.inf
        else:
            # OLS with intercept via least squares — same fit as
            # sklearn's LinearRegression, but dependency-free.
            design = np.column_stack([X[:, others], np.ones(n_samples)])
            target = X[:, i]
            coef, *_ = np.linalg.lstsq(design, target, rcond=None)
            resid = target - design @ coef
            ss_res = float(np.sum(resid ** 2))
            ss_tot = float(np.sum((target - target.mean()) ** 2))
            # Constant column: define R^2 = 1 so the VIF goes to inf.
            r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else 1.0
            vif = 1 / (1 - r2) if r2 < 1 else np.inf
        vif_data.append({
            'feature': feature_names[i],
            'VIF': vif
        })
    return pd.DataFrame(vif_data).sort_values('VIF', ascending=False)
# 计算 VIF
vif_df = calculate_vif(X_train, feature_names)
print("方差膨胀因子 (VIF):")
print(vif_df.head(15).round(2))
# 标记高 VIF 特征
vif_df['high_collinearity'] = vif_df['VIF'] > 10
high_vif = vif_df[vif_df['high_collinearity']]
if len(high_vif) > 0:
print(f"\n发现 {len(high_vif)} 个高共线性特征 (VIF > 10):")
print(high_vif[['feature', 'VIF']])6.2 相关系数矩阵
def correlation_analysis(X, feature_names, threshold=0.8):
    """Find pairs of features whose absolute correlation exceeds *threshold*.

    Parameters:
        X: 2-D feature matrix (n_samples, n_features)
        feature_names: labels, one per column
        threshold: absolute-correlation cutoff for reporting a pair

    Returns:
        (corr_df, high_corr_df): the full correlation matrix as a labeled
        DataFrame, and a DataFrame of high-correlation pairs
        ('feature_1', 'feature_2', 'correlation') sorted descending.
    """
    corr_matrix = np.corrcoef(X.T)
    corr_df = pd.DataFrame(corr_matrix, index=feature_names, columns=feature_names)
    # Scan the upper triangle only, so each unordered pair appears once.
    high_corr_pairs = []
    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            corr_val = abs(corr_df.iloc[i, j])
            if corr_val > threshold:
                high_corr_pairs.append({
                    'feature_1': feature_names[i],
                    'feature_2': feature_names[j],
                    'correlation': corr_val
                })
    # BUG FIX: sort_values('correlation') raised KeyError when no pair
    # exceeded the threshold (an empty DataFrame has no columns).
    # Building the frame with explicit columns makes the empty case safe.
    high_corr_df = pd.DataFrame(
        high_corr_pairs, columns=['feature_1', 'feature_2', 'correlation']
    ).sort_values('correlation', ascending=False)
    print(f"相关性 > {threshold} 的特征对:")
    print(high_corr_df if len(high_corr_df) > 0 else "无")
    return corr_df, high_corr_df
# 相关性分析
corr_matrix, high_corr = correlation_analysis(X_train, feature_names, threshold=0.8)
# 可视化相关系数矩阵
import seaborn as sns
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
xticklabels=feature_names, yticklabels=feature_names, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()7. 冗余特征剔除
7.1 基于重要性和共线性剔除
def remove_redundant_features(model, X, y, feature_names,
                              importance_threshold=0.01,
                              vif_threshold=10,
                              corr_threshold=0.9):
    """Drop redundant features by combining importance, correlation and VIF.

    Strategy:
      1. Remove features whose normalized gain importance is below
         ``importance_threshold``.
      2. Within each highly correlated pair (|corr| > ``corr_threshold``),
         keep only the more important feature.
      3. For features with VIF > ``vif_threshold``, remove them only if a
         moderately correlated (|corr| > 0.7) and more important
         substitute survives.

    Parameters:
        model: fitted LightGBM model (supplies the gain importances)
        X: feature matrix used for VIF and correlation (y is unused here)
        y: targets (kept for interface symmetry; not read)
        feature_names: labels, one per column of X
        importance_threshold: minimum normalized importance to keep
        vif_threshold: VIF above which a feature is considered collinear
        corr_threshold: absolute correlation defining a redundant pair

    Returns:
        (keep_indices, remove_indices): column indices kept / removed.
    """
    # 1. Normalized gain importance per feature.
    importance = model.booster_.feature_importance('gain')
    importance_norm = importance / importance.sum()
    # 2. VIF per feature (delegates to the sibling calculate_vif helper).
    vif_df = calculate_vif(X, feature_names)
    # 3. Pairwise correlation matrix.
    corr_matrix = np.corrcoef(X.T)
    # Column indices scheduled for removal.
    features_to_remove = set()
    # Rule 1: drop low-importance features.
    for i, (feat, imp) in enumerate(zip(feature_names, importance_norm)):
        if imp < importance_threshold:
            features_to_remove.add(i)
    # Rule 2: in each high-correlation pair, keep the more important one.
    # NOTE(review): iteration order matters — once a feature is removed it
    # is skipped in later pairs, so results can depend on column order.
    for i in range(len(feature_names)):
        if i in features_to_remove:
            continue
        for j in range(i + 1, len(feature_names)):
            if j in features_to_remove:
                continue
            if abs(corr_matrix[i, j]) > corr_threshold:
                # Remove whichever of the pair has the lower raw importance.
                if importance[i] < importance[j]:
                    features_to_remove.add(i)
                else:
                    features_to_remove.add(j)
    # Rule 3: a high-VIF feature is removed only when some surviving
    # feature is both correlated with it (> 0.7) and more important.
    for _, row in vif_df[vif_df['VIF'] > vif_threshold].iterrows():
        feat_idx = feature_names.index(row['feature'])
        if feat_idx not in features_to_remove:
            # Look for a higher-importance correlated substitute.
            for j in range(len(feature_names)):
                if j != feat_idx and j not in features_to_remove:
                    if abs(corr_matrix[feat_idx, j]) > 0.7:
                        if importance[j] > importance[feat_idx]:
                            features_to_remove.add(feat_idx)
                            break
    # Summarize the selection.
    keep_indices = [i for i in range(len(feature_names)) if i not in features_to_remove]
    remove_indices = list(features_to_remove)
    print(f"原始特征数: {len(feature_names)}")
    print(f"保留特征数: {len(keep_indices)}")
    print(f"移除特征数: {len(remove_indices)}")
    if len(remove_indices) > 0:
        print("\n移除的特征:")
        for idx in sorted(remove_indices):
            print(f" {feature_names[idx]}: 重要性={importance_norm[idx]:.4f}")
    return keep_indices, remove_indices
# 剔除冗余特征
keep_idx, remove_idx = remove_redundant_features(
model, X_train, y_train, feature_names,
importance_threshold=0.01,
vif_threshold=10,
corr_threshold=0.9
)
# 用保留的特征重新训练
X_train_reduced = X_train[:, keep_idx]
X_test_reduced = X_test[:, keep_idx]
reduced_names = [feature_names[i] for i in keep_idx]
model_reduced = lgb.LGBMRegressor(
n_estimators=200,
max_depth=6,
learning_rate=0.05,
random_state=42,
verbose=-1
)
model_reduced.fit(X_train_reduced, y_train)
# 对比性能
from sklearn.metrics import mean_squared_error
mse_original = mean_squared_error(y_test, model.predict(X_test))
mse_reduced = mean_squared_error(y_test, model_reduced.predict(X_test_reduced))
print(f"\n性能对比:")
print(f" 原始模型 MSE: {mse_original:.4f}")
print(f" 简化模型 MSE: {mse_reduced:.4f}")
print(f" 变化: {(mse_reduced - mse_original) / mse_original * 100:+.2f}%")8. 完整分析报告模板
8.1 综合特征分析报告
def generate_feature_analysis_report(model, X_train, X_test, y_train, y_test,
                                     feature_names, explainer=None):
    """Print a full feature-analysis report and return its tables.

    Sections: gain importance, SHAP importance, permutation importance,
    VIF collinearity, high-correlation pairs, and a combined ranking.

    Returns:
        dict with all intermediate DataFrames ('importance_df',
        'shap_df', 'perm_importance', 'vif_df', 'high_corr', 'rank_df').
    """
    print("=" * 60)
    print(" 特征分析报告")
    print("=" * 60)
    # 1. Built-in gain importance.
    print("\n【1. 特征重要性排名】\n")
    importance_gain = model.booster_.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'gain_importance': importance_gain,
        'gain_pct': importance_gain / importance_gain.sum() * 100
    }).sort_values('gain_importance', ascending=False)
    print(importance_df.head(15).to_string(index=False))
    # 2. SHAP importance (mean |SHAP| on the test set).
    print("\n【2. SHAP 特征重要性】\n")
    if explainer is None:
        explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap_imp = np.abs(shap_values).mean(axis=0)
    shap_df = pd.DataFrame({
        'feature': feature_names,
        'shap_importance': shap_imp,
        'shap_pct': shap_imp / shap_imp.sum() * 100
    }).sort_values('shap_importance', ascending=False)
    print(shap_df.head(15).to_string(index=False))
    # 3. Permutation importance.
    print("\n【3. 排列重要性】\n")
    perm_imp, baseline = permutation_importance(
        model, X_test, y_test,
        metric=mean_squared_error,
        n_repeats=10
    )
    print(f"基准 MSE: {baseline:.4f}")
    print("\nTop 15 重要特征:")
    print(perm_imp.head(15)[['feature', 'importance_mean']].to_string(index=False))
    # 4. Collinearity (VIF).
    print("\n【4. 共线性分析】\n")
    vif_df = calculate_vif(X_train, feature_names)
    high_vif = vif_df[vif_df['VIF'] > 10]
    if len(high_vif) > 0:
        print(f"发现 {len(high_vif)} 个高 VIF 特征:")
        print(high_vif[['feature', 'VIF']].to_string(index=False))
    else:
        print("未发现严重的共线性问题")
    # 5. Highly correlated pairs.
    print("\n【5. 高相关性特征对】\n")
    _, high_corr = correlation_analysis(X_train, feature_names, threshold=0.85)
    # 6. Combined ranking across all three methods.
    print("\n【6. 综合建议】\n")
    # BUG FIX: the ranks were previously computed from DataFrames already
    # sorted by importance, so the rank values no longer lined up with the
    # original feature order. Rank the unsorted arrays instead, and map
    # the permutation results ('feature_<i>') back to index order.
    perm_ordered = perm_imp.copy()
    perm_ordered['_idx'] = (perm_ordered['feature']
                            .str.rsplit('_', n=1).str[-1].astype(int))
    perm_ordered = perm_ordered.sort_values('_idx')
    rank_df = pd.DataFrame({
        'feature': feature_names,
        'gain_rank': pd.Series(importance_gain).rank(ascending=False).values,
        'shap_rank': pd.Series(shap_imp).rank(ascending=False).values,
        'perm_rank': perm_ordered['importance_mean'].rank(ascending=False).values
    })
    rank_df['avg_rank'] = rank_df[['gain_rank', 'shap_rank', 'perm_rank']].mean(axis=1)
    rank_df = rank_df.sort_values('avg_rank')
    print("综合重要性排名 (Top 10):")
    print(rank_df.head(10)[['feature', 'gain_rank', 'shap_rank', 'perm_rank', 'avg_rank']].to_string(index=False))
    # Recommended core features.
    top_features = rank_df.head(10)['feature'].tolist()
    print(f"\n建议保留的核心特征 (前10): {', '.join(top_features)}")
    # Candidates for removal: consistently low-ranked by all methods.
    suspicious = rank_df[rank_df['avg_rank'] > len(feature_names) * 0.8]['feature'].tolist()
    if len(suspicious) > 0:
        print(f"\n可能需要移除的特征: {', '.join(suspicious)}")
    print("\n" + "=" * 60)
    return {
        'importance_df': importance_df,
        'shap_df': shap_df,
        'perm_importance': perm_imp,
        'vif_df': vif_df,
        'high_corr': high_corr,
        'rank_df': rank_df
    }
# 生成报告
report = generate_feature_analysis_report(
model, X_train, X_test, y_train, y_test, feature_names
)8.2 交互式分析工具
class FeatureAnalyzer:
    """Interactive helper bundling the feature-analysis tools.

    Wraps a fitted LightGBM model and its train/test split, precomputes
    a SHAP TreeExplainer plus the test-set SHAP values, and exposes
    shortcuts for the full report, importance plots and per-sample
    explanations (delegating to the module-level helper functions).
    """

    def __init__(self, model, X_train, X_test, y_train, y_test, feature_names):
        # Keep references to the model and the data splits.
        self.model = model
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        # SHAP values are computed once up front for the test set.
        self.explainer = shap.TreeExplainer(model)
        self.shap_values = self.explainer.shap_values(X_test)

    def summary(self):
        """Print the full analysis report (module-level helper) and return its tables."""
        return generate_feature_analysis_report(
            self.model, self.X_train, self.X_test,
            self.y_train, self.y_test, self.feature_names,
            self.explainer
        )

    def plot_importance(self, method='gain', top_n=15):
        """Bar-plot the top-N features by 'gain', 'split' or 'shap' importance.

        Raises:
            ValueError: if *method* is not one of the three supported names.
        """
        if method == 'gain':
            importance = self.model.booster_.feature_importance('gain')
        elif method == 'split':
            importance = self.model.booster_.feature_importance('split')
        elif method == 'shap':
            importance = np.abs(self.shap_values).mean(axis=0)
        else:
            raise ValueError(f"Unknown method: {method}")
        # Sort ascending and take the tail: combined with invert_yaxis
        # below, the largest bar ends up at the top of the chart.
        df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance').tail(top_n)
        plt.figure(figsize=(10, 6))
        plt.barh(range(len(df)), df['importance'], color='steelblue')
        plt.yticks(range(len(df)), df['feature'])
        plt.xlabel('Importance')
        plt.title(f'Feature Importance ({method.capitalize()})')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()

    def explain_sample(self, sample_idx):
        """Explain one test-set prediction via the local_explanation helper."""
        return local_explanation(
            self.model, self.explainer, self.X_test,
            sample_idx, self.feature_names
        )

    def shap_summary(self):
        """SHAP summary (beeswarm) plot for the test set."""
        shap.summary_plot(
            self.shap_values, self.X_test,
            feature_names=self.feature_names
        )

    def shap_dependence(self, feature_idx):
        """SHAP dependence plot for one feature of the test set."""
        shap.dependence_plot(
            feature_idx, self.shap_values, self.X_test,
            feature_names=self.feature_names
        )
# Usage example
analyzer = FeatureAnalyzer(model, X_train, X_test, y_train, y_test, feature_names)
# Print the summary report
# analyzer.summary()
# Plot importance
# analyzer.plot_importance('shap')
# Explain a single sample
# analyzer.explain_sample(0)

核心知识点总结
特征重要性方法对比
| 方法 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| Split 重要性 | 计算快,直观 | 偏向高基数特征 | 快速筛选 |
| Gain 重要性 | 反映真实贡献 | 略慢 | 主要参考 |
| 排列重要性 | 模型无关,可靠 | 计算慢 | 验证重要性 |
| SHAP 值 | 理论严谨,可解释 | 计算慢 | 深入分析 |
特征选择策略
第一步: 快速筛选
├── 使用 Gain 重要性
├── 移除重要性 < 1% 的特征
└── 通常可减少 30-50% 特征
第二步: 共线性处理
├── 计算 VIF
├── 在高共线性对中保留重要性更高的
└── 通常可再减少 10-20% 特征
第三步: 验证
├── 使用排列重要性确认
├── 对比简化前后模型性能
└── 确保性能损失 < 5%
稳定性检查
- 时间稳定性: 特征重要性不应随时间剧烈变化
- 交叉验证稳定性: 不同折的重要性排名应一致
- 方法一致性: 不同方法的重要性排名应大致相同
下一节: 06-实战案例.md - 完整的端到端实战项目。