特征重要性分析

目录


1. LightGBM 内置重要性

1.1 两种重要性类型

LightGBM 提供两种特征重要性:

| 类型 | 说明 | 计算方式 | 优点 | 缺点 |
| --- | --- | --- | --- | --- |
| split | 分裂次数 | 特征被用作分裂点的次数 | 直观,表示使用频率 | 偏向高基数特征 |
| gain | 信息增益 | 特征分裂带来的总增益 | 更反映贡献 | 计算稍慢 |
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Generate a synthetic regression dataset (only 8 of the 20 features are informative).
np.random.seed(42)
X, y = make_regression(
    n_samples=5000,
    n_features=20,
    n_informative=8,
    noise=0.1,
    random_state=42
)

# Name the features factor_0 .. factor_19.
feature_names = [f'factor_{i}' for i in range(20)]

# Train a LightGBM regressor on an 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    verbose=-1
)
model.fit(X_train, y_train)

# Built-in importances: 'split' = number of times a feature is used to split,
# 'gain' = total gain contributed by the feature's splits.
importance_split = model.booster_.feature_importance(importance_type='split')
importance_gain = model.booster_.feature_importance(importance_type='gain')

# Collect both importance types into a single DataFrame (row order = column order of X).
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance_split': importance_split,
    'importance_gain': importance_gain
})

# Rank features by each importance type.
importance_df_split = importance_df.sort_values('importance_split', ascending=False)
importance_df_gain = importance_df.sort_values('importance_gain', ascending=False)

print("=== Split 重要性 (Top 10) ===")
print(importance_df_split.head(10)[['feature', 'importance_split']])

print("\n=== Gain 重要性 (Top 10) ===")
print(importance_df_gain.head(10)[['feature', 'importance_gain']])

1.2 可视化内置重要性

import matplotlib.pyplot as plt
 
def plot_feature_importance(importance_df, importance_type='split', top_n=15):
    """Plot the top-N features as a horizontal bar chart.

    Parameters:
        importance_df: DataFrame with a 'feature' column and
            'importance_<type>' columns (as built above).
        importance_type: 'split' or 'gain' — selects which column to plot.
        top_n: number of top features to show.
    """
    # BUG FIX: the column is named f'importance_{importance_type}', not the
    # bare importance_type — sorting by 'split'/'gain' raised KeyError.
    col = f'importance_{importance_type}'
    df_sorted = importance_df.sort_values(col, ascending=False).head(top_n)

    fig, ax = plt.subplots(figsize=(10, 6))

    colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(df_sorted)))
    ax.barh(range(len(df_sorted)), df_sorted[col], color=colors)
    ax.set_yticks(range(len(df_sorted)))
    ax.set_yticklabels(df_sorted['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Feature Importance ({importance_type.capitalize()})')
    ax.invert_yaxis()  # largest importance at the top
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.show()
 
# Visualise both built-in importance types.
plot_feature_importance(importance_df, 'split')
plot_feature_importance(importance_df, 'gain')

1.3 归一化重要性

def normalize_importance(importance_df, importance_type='gain'):
    """Add a percentage column for one importance type and rank by it.

    Returns a copy of *importance_df* with an extra
    'importance_<type>_pct' column (each feature's share of the total,
    in percent), sorted from most to least important.
    """
    col = f'importance_{importance_type}'
    pct_col = f'{col}_pct'
    result = importance_df.copy()
    result[pct_col] = result[col] / result[col].sum() * 100
    return result.sort_values(pct_col, ascending=False)
 
# Normalise the gain importances to percentages and show the top 10.
importance_norm = normalize_importance(importance_df, 'gain')
print("归一化 Gain 重要性 (Top 10):")
print(importance_norm.head(10)[['feature', 'importance_gain_pct']].round(2))

2. 排列重要性

2.1 原理

排列重要性(Permutation Importance)通过打乱特征值来评估其重要性:

  1. 计算基准模型性能
  2. 对某个特征的值进行随机打乱
  3. 重新计算模型性能
  4. 性能下降越多 = 特征越重要

优势:

  • 不依赖于模型内部实现
  • 可以用于任何模型
  • 更能反映特征的真实贡献

2.2 实现排列重要性

from sklearn.metrics import mean_squared_error
 
def permutation_importance(model, X, y, metric=mean_squared_error,
                          n_repeats=5, random_state=42):
    """
    Compute permutation importance for every feature.

    For each feature, its column is shuffled ``n_repeats`` times and the
    increase of ``metric`` over the un-shuffled baseline is recorded; the
    mean increase is that feature's importance.

    Parameters:
        model: fitted model exposing ``predict``
        X: feature matrix (numpy array, samples x features)
        y: true targets
        metric: scoring function where LOWER is better (e.g. MSE)
        n_repeats: number of shuffles per feature
        random_state: seed for the global numpy RNG

    Returns:
        (importance_df, baseline_score):
            importance_df: DataFrame with columns 'feature',
                'importance_mean', 'importance_std', sorted by mean
                importance descending. NOTE: features are labelled
                'feature_<i>' by column position, not by any external names.
            baseline_score: metric on the unpermuted data
    """
    np.random.seed(random_state)

    # Baseline performance on the unpermuted data.
    baseline_score = metric(y, model.predict(X))

    n_features = X.shape[1]
    feature_names = [f'feature_{i}' for i in range(n_features)]

    results = []

    for feature_idx in range(n_features):
        importance_scores = []

        for _ in range(n_repeats):
            # Work on a copy so the caller's X is untouched.
            X_permuted = X.copy()

            # Shuffle only the current feature's column.
            perm_indices = np.random.permutation(len(X))
            X_permuted[:, feature_idx] = X[perm_indices, feature_idx]

            # Performance with this feature destroyed.
            permuted_score = metric(y, model.predict(X_permuted))

            # Importance = performance drop (metric is lower-is-better,
            # so permuted - baseline is positive for useful features).
            importance = permuted_score - baseline_score
            importance_scores.append(importance)

        results.append({
            'feature': feature_names[feature_idx],
            'importance_mean': np.mean(importance_scores),
            'importance_std': np.std(importance_scores)
        })

    importance_df = pd.DataFrame(results)
    importance_df = importance_df.sort_values('importance_mean', ascending=False)

    return importance_df, baseline_score
 
# Permutation importance of the trained model on the held-out test set.
perm_importance, baseline = permutation_importance(
    model, X_test, y_test,
    metric=mean_squared_error,
    n_repeats=10
)

print(f"基准 MSE: {baseline:.4f}\n")
print("排列重要性 (Top 10):")
print(perm_importance.head(10))

2.3 可视化排列重要性

def plot_permutation_importance(perm_importance, top_n=15):
    """Horizontal bar chart of the top-N permutation importances with std error bars."""
    plot_df = perm_importance.head(top_n).sort_values('importance_mean')

    fig, ax = plt.subplots(figsize=(10, 6))

    positions = range(len(plot_df))
    ax.barh(
        positions,
        plot_df['importance_mean'],
        xerr=plot_df['importance_std'],
        color='steelblue',
        alpha=0.8,
        error_kw={'linewidth': 2, 'capsize': 3}
    )

    ax.set_yticks(positions)
    ax.set_yticklabels(plot_df['feature'])
    ax.set_xlabel('Importance (Performance Drop)')
    ax.set_title('Permutation Importance (Error Bars = Std Dev)')
    # Zero line: bars left of it mean shuffling did not hurt performance.
    ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.show()

plot_permutation_importance(perm_importance)

3. SHAP 值分析

3.1 SHAP 原理

SHAP(SHapley Additive exPlanations)基于博弈论的 Shapley 值:

  • 每个特征的贡献 = 所有可能特征组合的边际贡献的加权平均
  • 满足可加性:预测值 = 基准值 + 各特征 SHAP 值

3.2 SHAP 基础使用

# Install shap first: pip install shap
import shap

# Build a TreeExplainer for the trained model and compute SHAP values on the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Summary plot - global feature importance overview.
shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot")
plt.tight_layout()
plt.show()

# Global importance = mean absolute SHAP value per feature.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame({
    'feature': feature_names,
    'shap_importance': mean_abs_shap
}).sort_values('shap_importance', ascending=False)

print("\nSHAP 特征重要性 (Top 10):")
print(shap_importance.head(10))

3.3 SHAP 详解图

# Detailed summary plot.
# BUG FIX: shap.summary_plot has no "layered" plot_type — the layered-violin
# variant is named "layered_violin" (valid values include "dot", "bar",
# "violin", "layered_violin").
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
                  plot_type="layered_violin", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot (Layered)")
plt.tight_layout()
plt.show()

# Bar plot - features ranked by mean |SHAP|.
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
                  plot_type="bar", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Feature Importance (Bar)")
plt.tight_layout()
plt.show()

3.4 SHAP 依赖图

分析单个特征的影响和交互作用。

def plot_shap_dependence(shap_values, X, feature_idx, feature_names):
    """Draw a SHAP dependence plot for a single feature."""
    title_name = feature_names[feature_idx]

    shap.dependence_plot(
        feature_idx,
        shap_values,
        X,
        feature_names=feature_names,
        show=False
    )

    fig = plt.gcf()
    fig.set_size_inches(8, 6)
    plt.title(f"SHAP Dependence Plot: {title_name}")
    plt.tight_layout()
    plt.show()
 
# Plot the dependence plot for the most important feature.
# BUG FIX: the feature names are 'factor_<i>' (see feature_names above), so
# stripping a 'feature_' prefix left a non-numeric string and int() raised
# ValueError. Look the index up via feature_names instead.
top_feature = shap_importance.iloc[0]['feature']
top_feature_idx = feature_names.index(top_feature)

plot_shap_dependence(shap_values, X_test, top_feature_idx, feature_names)

3.5 SHAP 单样本解释

# Force plot - explain a single prediction.
sample_idx = 0

# Model prediction vs. ground truth for this sample.
pred = model.predict(X_test[sample_idx:sample_idx+1])[0]
actual = y_test[sample_idx]

print(f"样本 {sample_idx}:")
print(f"  实际值: {actual:.4f}")
print(f"  预测值: {pred:.4f}")
print(f"  误差:   {abs(pred - actual):.4f}")

# Force plot: shows how each feature pushes the prediction away from the base value.
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx],
    X_test[sample_idx],
    feature_names=feature_names,
    matplotlib=True,
    show=False
)
plt.gcf().set_size_inches(12, 3)
plt.title(f"SHAP Force Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()

# Waterfall plot - a clearer breakdown of the same single-sample explanation.
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values[sample_idx],
        base_values=explainer.expected_value,
        data=X_test[sample_idx],
        feature_names=feature_names
    ),
    show=False
)
plt.gcf().set_size_inches(10, 6)
plt.title(f"SHAP Waterfall Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()

4. 全局 vs 局部解释

4.1 全局解释

目的: 理解模型整体的行为模式

方法:

  • 特征重要性排名
  • SHAP summary plot
  • 特征影响分布
def global_feature_analysis(model, X, y, feature_names, explainer=None):
    """Collect four global importance measures into one DataFrame.

    Columns: built-in split/gain importance, mean |SHAP|, and permutation
    importance — each also normalised to sum to 1 ('<col>_norm').

    Parameters:
        model: fitted LightGBM regressor
        X, y: evaluation data
        feature_names: names aligned with the columns of X
        explainer: optional pre-built shap.TreeExplainer (built if None)
    """
    n_features = X.shape[1]

    # 1. LightGBM built-in importances (already aligned with column order).
    split_importance = model.booster_.feature_importance('split')
    gain_importance = model.booster_.feature_importance('gain')

    # 2. SHAP importance = mean absolute SHAP value per feature.
    if explainer is None:
        explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # 3. Permutation importance.
    # BUG FIX: permutation_importance returns rows SORTED by importance, so
    # taking .values positionally misaligned them with feature_names. Restore
    # column order via its internal 'feature_<i>' labels.
    perm_imp, _ = permutation_importance(model, X, y, n_repeats=5)
    perm_by_name = perm_imp.set_index('feature')['importance_mean']
    perm_aligned = np.array([perm_by_name[f'feature_{i}'] for i in range(n_features)])

    # Combine everything, one row per feature.
    global_df = pd.DataFrame({
        'feature': feature_names,
        'split_importance': split_importance,
        'gain_importance': gain_importance,
        'shap_importance': shap_importance,
        'permutation_importance': perm_aligned
    })

    # Normalise each measure so the columns are directly comparable.
    for col in ['split_importance', 'gain_importance', 'shap_importance', 'permutation_importance']:
        global_df[f'{col}_norm'] = global_df[col] / global_df[col].sum()

    return global_df
 
# Run the combined global analysis on the test set.
global_analysis = global_feature_analysis(model, X_test, y_test, feature_names)

print("全局特征分析 (归一化后):")
print(global_analysis[['feature', 'gain_importance_norm',
                       'shap_importance_norm', 'permutation_importance_norm']]
      .sort_values('gain_importance_norm', ascending=False)
      .head(10)
      .round(4))

4.2 局部解释

目的: 理解单个预测的决策过程

def local_explanation(model, explainer, X, sample_idx, feature_names):
    """Explain one prediction: print base value, prediction and per-feature SHAP contributions."""
    row = X[sample_idx:sample_idx+1]

    # Prediction for this single sample.
    pred = model.predict(row)[0]

    # Per-feature SHAP contributions and the explainer's base value.
    shap_values = explainer.shap_values(row)[0]
    base_value = explainer.expected_value

    # One row per feature, strongest contributions (by |SHAP|) first.
    explanation = pd.DataFrame({
        'feature': feature_names,
        'value': X[sample_idx],
        'shap_value': shap_values
    })
    explanation = explanation.sort_values('shap_value', key=abs, ascending=False)

    print(f"=== 样本 {sample_idx} 的局部解释 ===")
    print(f"基准值: {base_value:.4f}")
    print(f"预测值: {pred:.4f}")
    print(f"SHAP 总和: {base_value + shap_values.sum():.4f}")
    print(f"\n特征贡献 (Top 10):")
    print(explanation.head(10).to_string(index=False))

    return explanation
 
# Explain the first test sample.
local_expl = local_explanation(model, explainer, X_test, 0, feature_names)

5. 特征重要性稳定性

5.1 时间稳定性分析

def temporal_feature_importance(model, X_list, y_list, feature_names):
    """
    Track how feature importance changes across time periods.

    A clone of *model* is re-trained on each period's data, gain importances
    are collected, and the 5 features with the highest mean importance are
    plotted over time.

    Parameters:
        model: estimator template (cloned and re-trained per period)
        X_list: per-period feature matrices
        y_list: per-period target vectors
        feature_names: feature names (NOTE(review): currently unused —
            columns are labelled 'feat_<i>' by position)

    Returns:
        DataFrame with one row per period and one 'feat_<i>' column per feature.
    """
    from sklearn.base import clone

    importance_over_time = []

    for period, (X_period, y_period) in enumerate(zip(X_list, y_list)):
        # Re-train a fresh copy of the model on this period's data.
        model_period = clone(model)
        model_period.fit(X_period, y_period)

        # Gain importance, aligned with column order.
        gain_importance = model_period.booster_.feature_importance('gain')

        importance_over_time.append({
            'period': period,
            **{f'feat_{i}': imp for i, imp in enumerate(gain_importance)}
        })

    importance_df = pd.DataFrame(importance_over_time)

    # Plot only the 5 features with the highest average importance.
    top_features = importance_df.drop('period', axis=1).mean().sort_values(ascending=False).head(5).index

    fig, ax = plt.subplots(figsize=(12, 5))

    for feat in top_features:
        ax.plot(importance_df['period'], importance_df[feat], marker='o', label=feat)

    ax.set_xlabel('Period')
    ax.set_ylabel('Importance (Gain)')
    ax.set_title('Feature Importance Over Time')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return importance_df
 
# Simulate data from different periods using overlapping rolling windows.
n_periods = 5
X_list = []
y_list = []

for i in range(n_periods):
    start = i * 200
    end = start + 500
    X_list.append(X[start:end])
    y_list.append(y[start:end])

# Run the temporal stability analysis.
temporal_imp = temporal_feature_importance(model, X_list, y_list, feature_names)

5.2 交叉验证稳定性

def cv_feature_importance(model, X, y, cv=5, random_state=42, names=None):
    """Gain importance per CV fold, to check how stable the ranking is.

    Parameters:
        model: estimator template; a clone is trained on each fold
        X, y: training data (numpy arrays)
        cv: number of folds
        random_state: unused — folds are deliberately NOT shuffled to
            preserve temporal order, so the seed has no effect; kept for
            API compatibility
        names: feature names aligned with X's columns; defaults to the
            module-level ``feature_names`` (the original implicitly relied
            on that global; the parameter makes the dependency explicit)

    Returns:
        (importance_df, importance_stats): per-fold importances, and
        mean/std/coefficient-of-variation per feature sorted by mean.
    """
    from sklearn.model_selection import KFold
    from sklearn.base import clone

    if names is None:
        names = feature_names  # module-level default, matching original behaviour

    kf = KFold(n_splits=cv, shuffle=False)  # no shuffle: keep temporal order
    importance_list = []

    for fold, (train_idx, _) in enumerate(kf.split(X)):
        # Re-train a fresh clone on this fold's training part.
        model_fold = clone(model)
        model_fold.fit(X[train_idx], y[train_idx])

        gain_importance = model_fold.booster_.feature_importance('gain')

        importance_list.append({
            'fold': fold,
            **{f'feat_{i}': imp for i, imp in enumerate(gain_importance)}
        })

    importance_df = pd.DataFrame(importance_list)

    # Per-feature statistics across folds; 'cv' = std/mean (relative spread).
    importance_stats = pd.DataFrame({
        'feature': names,
        'mean': importance_df.drop('fold', axis=1).mean().values,
        'std': importance_df.drop('fold', axis=1).std().values,
        'cv': importance_df.drop('fold', axis=1).std().values / importance_df.drop('fold', axis=1).mean().values
    }).sort_values('mean', ascending=False)

    print("特征重要性交叉验证统计 (Top 10):")
    print(importance_stats.head(10).round(4))

    return importance_df, importance_stats
 
# Cross-validated importance on the training set.
cv_imp, cv_stats = cv_feature_importance(model, X_train, y_train, cv=5)

# Plot the 5 most important features across folds.
top_features = cv_stats.head(5)['feature'].values

fig, ax = plt.subplots(figsize=(12, 5))

for feat in top_features:
    # Map the display name back to the positional 'feat_<i>' column.
    feat_col = f'feat_{feature_names.index(feat)}'
    ax.plot(range(1, 6), cv_imp[feat_col], marker='o', label=feat)

ax.set_xlabel('Fold')
ax.set_ylabel('Importance (Gain)')
ax.set_title('Feature Importance Across CV Folds')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

6. 多重共线性检测

6.1 VIF(方差膨胀因子)

检测特征间的共线性。

VIF 的定义为 $\mathrm{VIF}_i = \dfrac{1}{1 - R_i^2}$,其中 $R_i^2$ 是用其他所有特征对特征 $i$ 做线性回归得到的决定系数。

| VIF 值 | 共线性程度 |
| --- | --- |
| < 5 | 可接受 |
| 5-10 | 中等共线性 |
| > 10 | 严重共线性 |
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
 
def calculate_vif(X, feature_names):
    """
    Compute the variance inflation factor (VIF) for every feature.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing feature i
    on all remaining features. Higher values mean stronger collinearity.

    Parameters:
        X: feature matrix (samples x features)
        feature_names: names aligned with X's columns

    Returns:
        DataFrame with 'feature' and 'VIF' columns, sorted by VIF descending.
    """
    n_features = X.shape[1]
    rows = []

    for target in range(n_features):
        predictors = [col for col in range(n_features) if col != target]

        if not predictors:
            # A lone feature cannot be regressed on anything else.
            vif_value = np.inf
        else:
            reg = LinearRegression()
            reg.fit(X[:, predictors], X[:, target])
            r2 = r2_score(X[:, target], reg.predict(X[:, predictors]))
            # A perfect fit (r2 == 1) means infinite inflation.
            vif_value = np.inf if r2 >= 1 else 1 / (1 - r2)

        rows.append({
            'feature': feature_names[target],
            'VIF': vif_value
        })

    return pd.DataFrame(rows).sort_values('VIF', ascending=False)
 
# VIF for every training feature.
vif_df = calculate_vif(X_train, feature_names)

print("方差膨胀因子 (VIF):")
print(vif_df.head(15).round(2))

# Flag features with severe collinearity (VIF > 10).
vif_df['high_collinearity'] = vif_df['VIF'] > 10
high_vif = vif_df[vif_df['high_collinearity']]

if len(high_vif) > 0:
    print(f"\n发现 {len(high_vif)} 个高共线性特征 (VIF > 10):")
    print(high_vif[['feature', 'VIF']])

6.2 相关系数矩阵

def correlation_analysis(X, feature_names, threshold=0.8):
    """
    Find pairs of features whose absolute Pearson correlation exceeds *threshold*.

    Parameters:
        X: feature matrix (samples x features)
        feature_names: names aligned with X's columns
        threshold: absolute-correlation cutoff

    Returns:
        (corr_df, high_corr_df): the full correlation matrix as a DataFrame,
        and a DataFrame of high-correlation pairs ('feature_1', 'feature_2',
        'correlation') sorted descending — empty (but with the expected
        columns) when no pair crosses the threshold.
    """
    # np.corrcoef treats rows as variables, hence the transpose.
    corr_matrix = np.corrcoef(X.T)
    corr_df = pd.DataFrame(corr_matrix, index=feature_names, columns=feature_names)

    # Collect every pair above the threshold (upper triangle only).
    high_corr_pairs = []

    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            corr_val = abs(corr_df.iloc[i, j])
            if corr_val > threshold:
                high_corr_pairs.append({
                    'feature_1': feature_names[i],
                    'feature_2': feature_names[j],
                    'correlation': corr_val
                })

    # BUG FIX: sort_values('correlation') raised KeyError on an empty frame
    # (no columns); build the frame with explicit columns instead.
    high_corr_df = pd.DataFrame(
        high_corr_pairs, columns=['feature_1', 'feature_2', 'correlation']
    ).sort_values('correlation', ascending=False)

    print(f"相关性 > {threshold} 的特征对:")
    print(high_corr_df if len(high_corr_df) > 0 else "无")

    return corr_df, high_corr_df
 
# Correlation analysis on the training features.
corr_matrix, high_corr = correlation_analysis(X_train, feature_names, threshold=0.8)

# Heatmap of the full correlation matrix.
import seaborn as sns

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
            xticklabels=feature_names, yticklabels=feature_names, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

7. 冗余特征剔除

7.1 基于重要性和共线性剔除

def remove_redundant_features(model, X, y, feature_names,
                             importance_threshold=0.01,
                             vif_threshold=10,
                             corr_threshold=0.9):
    """
    Select indices of redundant features to drop.

    Strategy:
    1. Drop features whose normalised gain importance is below
       ``importance_threshold``.
    2. For each highly correlated pair (|corr| > ``corr_threshold``), keep
       only the feature with the higher importance.
    3. For high-VIF features (> ``vif_threshold``), drop them only when a
       correlated (|corr| > 0.7) and more important substitute survives.

    NOTE: rules run in order and each rule skips features already marked,
    so the outcome depends on feature order.

    Returns:
        (keep_indices, remove_indices): column indices to keep / remove.
    """
    # 1. Normalised gain importance, aligned with column order.
    importance = model.booster_.feature_importance('gain')
    importance_norm = importance / importance.sum()

    # 2. VIF per feature (uses calculate_vif defined earlier in this file).
    vif_df = calculate_vif(X, feature_names)

    # 3. Pairwise correlations (rows are features after the transpose).
    corr_matrix = np.corrcoef(X.T)

    # Indices marked for removal.
    features_to_remove = set()

    # Rule 1: low importance.
    for i, (feat, imp) in enumerate(zip(feature_names, importance_norm)):
        if imp < importance_threshold:
            features_to_remove.add(i)

    # Rule 2: in each highly correlated pair, keep the more important feature.
    for i in range(len(feature_names)):
        if i in features_to_remove:
            continue
        for j in range(i + 1, len(feature_names)):
            if j in features_to_remove:
                continue
            if abs(corr_matrix[i, j]) > corr_threshold:
                # Remove whichever of the pair matters less.
                if importance[i] < importance[j]:
                    features_to_remove.add(i)
                else:
                    features_to_remove.add(j)

    # Rule 3: high VIF — drop only if a correlated, more important
    # replacement feature is being kept.
    for _, row in vif_df[vif_df['VIF'] > vif_threshold].iterrows():
        feat_idx = feature_names.index(row['feature'])
        if feat_idx not in features_to_remove:
            # Look for a surviving feature that is correlated and stronger.
            for j in range(len(feature_names)):
                if j != feat_idx and j not in features_to_remove:
                    if abs(corr_matrix[feat_idx, j]) > 0.7:
                        if importance[j] > importance[feat_idx]:
                            features_to_remove.add(feat_idx)
                            break

    # Summarise the result.
    keep_indices = [i for i in range(len(feature_names)) if i not in features_to_remove]
    remove_indices = list(features_to_remove)

    print(f"原始特征数: {len(feature_names)}")
    print(f"保留特征数: {len(keep_indices)}")
    print(f"移除特征数: {len(remove_indices)}")

    if len(remove_indices) > 0:
        print("\n移除的特征:")
        for idx in sorted(remove_indices):
            print(f"  {feature_names[idx]}: 重要性={importance_norm[idx]:.4f}")

    return keep_indices, remove_indices
 
# Drop redundant features based on importance / VIF / correlation.
keep_idx, remove_idx = remove_redundant_features(
    model, X_train, y_train, feature_names,
    importance_threshold=0.01,
    vif_threshold=10,
    corr_threshold=0.9
)

# Re-train on the kept feature subset only.
X_train_reduced = X_train[:, keep_idx]
X_test_reduced = X_test[:, keep_idx]
reduced_names = [feature_names[i] for i in keep_idx]

model_reduced = lgb.LGBMRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    verbose=-1
)
model_reduced.fit(X_train_reduced, y_train)

# Compare test performance of the full vs. reduced model.
from sklearn.metrics import mean_squared_error

mse_original = mean_squared_error(y_test, model.predict(X_test))
mse_reduced = mean_squared_error(y_test, model_reduced.predict(X_test_reduced))

print(f"\n性能对比:")
print(f"  原始模型 MSE: {mse_original:.4f}")
print(f"  简化模型 MSE: {mse_reduced:.4f}")
print(f"  变化: {(mse_reduced - mse_original) / mse_original * 100:+.2f}%")

8. 完整分析报告模板

8.1 综合特征分析报告

def generate_feature_analysis_report(model, X_train, X_test, y_train, y_test,
                                    feature_names, explainer=None):
    """
    Print a full feature-analysis report and return its intermediate tables.

    Sections: gain importance, SHAP importance, permutation importance,
    VIF collinearity, high-correlation pairs, and a combined ranking.

    Returns:
        dict with keys 'importance_df', 'shap_df', 'perm_importance',
        'vif_df', 'high_corr', 'rank_df'.
    """
    print("=" * 60)
    print("                  特征分析报告")
    print("=" * 60)

    # 1. Built-in gain importance.
    print("\n【1. 特征重要性排名】\n")

    importance_gain = model.booster_.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'gain_importance': importance_gain,
        'gain_pct': importance_gain / importance_gain.sum() * 100
    }).sort_values('gain_importance', ascending=False)

    print(importance_df.head(15).to_string(index=False))

    # 2. SHAP importance (mean |SHAP| on the test set).
    print("\n【2. SHAP 特征重要性】\n")

    if explainer is None:
        explainer = shap.TreeExplainer(model)

    shap_values = explainer.shap_values(X_test)
    shap_importance = np.abs(shap_values).mean(axis=0)

    shap_df = pd.DataFrame({
        'feature': feature_names,
        'shap_importance': shap_importance,
        'shap_pct': shap_importance / shap_importance.sum() * 100
    }).sort_values('shap_importance', ascending=False)

    print(shap_df.head(15).to_string(index=False))

    # 3. Permutation importance.
    print("\n【3. 排列重要性】\n")

    perm_imp, baseline = permutation_importance(
        model, X_test, y_test,
        metric=mean_squared_error,
        n_repeats=10
    )

    print(f"基准 MSE: {baseline:.4f}")
    print("\nTop 15 重要特征:")
    print(perm_imp.head(15)[['feature', 'importance_mean']].to_string(index=False))

    # 4. Collinearity (VIF).
    print("\n【4. 共线性分析】\n")

    vif_df = calculate_vif(X_train, feature_names)
    high_vif = vif_df[vif_df['VIF'] > 10]

    if len(high_vif) > 0:
        print(f"发现 {len(high_vif)} 个高 VIF 特征:")
        print(high_vif[['feature', 'VIF']].to_string(index=False))
    else:
        print("未发现严重的共线性问题")

    # 5. Highly correlated pairs.
    print("\n【5. 高相关性特征对】\n")

    _, high_corr = correlation_analysis(X_train, feature_names, threshold=0.85)

    # 6. Combined ranking and recommendations.
    print("\n【6. 综合建议】\n")

    # BUG FIX: the DataFrames above are SORTED by importance, so taking
    # .rank().values positionally misaligned the ranks with feature_names.
    # Map each rank back to the feature it belongs to instead. Note that
    # permutation_importance names its features 'feature_<i>' by position.
    gain_rank = importance_df.set_index('feature')['gain_importance'].rank(ascending=False)
    shap_rank = shap_df.set_index('feature')['shap_importance'].rank(ascending=False)
    perm_rank_by_name = perm_imp.set_index('feature')['importance_mean'].rank(ascending=False)

    rank_df = pd.DataFrame({
        'feature': feature_names,
        'gain_rank': [gain_rank[f] for f in feature_names],
        'shap_rank': [shap_rank[f] for f in feature_names],
        'perm_rank': [perm_rank_by_name[f'feature_{i}'] for i in range(len(feature_names))]
    })
    rank_df['avg_rank'] = rank_df[['gain_rank', 'shap_rank', 'perm_rank']].mean(axis=1)
    rank_df = rank_df.sort_values('avg_rank')

    print("综合重要性排名 (Top 10):")
    print(rank_df.head(10)[['feature', 'gain_rank', 'shap_rank', 'perm_rank', 'avg_rank']].to_string(index=False))

    # Suggested core features.
    top_features = rank_df.head(10)['feature'].tolist()
    print(f"\n建议保留的核心特征 (前10): {', '.join(top_features)}")

    # Candidates for removal: consistently low average rank.
    suspicious = rank_df[rank_df['avg_rank'] > len(feature_names) * 0.8]['feature'].tolist()
    if len(suspicious) > 0:
        print(f"\n可能需要移除的特征: {', '.join(suspicious)}")

    print("\n" + "=" * 60)

    return {
        'importance_df': importance_df,
        'shap_df': shap_df,
        'perm_importance': perm_imp,
        'vif_df': vif_df,
        'high_corr': high_corr,
        'rank_df': rank_df
    }
 
# Generate the full report for the current model.
report = generate_feature_analysis_report(
    model, X_train, X_test, y_train, y_test, feature_names
)

8.2 交互式分析工具

class FeatureAnalyzer:
    """Interactive helper bundling the feature-analysis utilities for one model."""

    def __init__(self, model, X_train, X_test, y_train, y_test, feature_names):
        self.model = model
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        # SHAP values are computed once up front and reused by every method.
        self.explainer = shap.TreeExplainer(model)
        self.shap_values = self.explainer.shap_values(X_test)

    def summary(self):
        """Print the full analysis report (see generate_feature_analysis_report)."""
        return generate_feature_analysis_report(
            self.model, self.X_train, self.X_test,
            self.y_train, self.y_test, self.feature_names,
            self.explainer
        )

    def plot_importance(self, method='gain', top_n=15):
        """Bar chart of feature importance; *method* is 'gain', 'split' or 'shap'."""
        if method in ('gain', 'split'):
            # Both built-in types come straight from the booster.
            importance = self.model.booster_.feature_importance(method)
        elif method == 'shap':
            importance = np.abs(self.shap_values).mean(axis=0)
        else:
            raise ValueError(f"Unknown method: {method}")

        plot_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance').tail(top_n)

        plt.figure(figsize=(10, 6))
        plt.barh(range(len(plot_df)), plot_df['importance'], color='steelblue')
        plt.yticks(range(len(plot_df)), plot_df['feature'])
        plt.xlabel('Importance')
        plt.title(f'Feature Importance ({method.capitalize()})')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()

    def explain_sample(self, sample_idx):
        """Print and return the SHAP explanation for one test sample."""
        return local_explanation(
            self.model, self.explainer, self.X_test,
            sample_idx, self.feature_names
        )

    def shap_summary(self):
        """SHAP summary plot over the test set."""
        shap.summary_plot(
            self.shap_values, self.X_test,
            feature_names=self.feature_names
        )

    def shap_dependence(self, feature_idx):
        """SHAP dependence plot for one feature."""
        shap.dependence_plot(
            feature_idx, self.shap_values, self.X_test,
            feature_names=self.feature_names
        )
 
# Usage examples.
analyzer = FeatureAnalyzer(model, X_train, X_test, y_train, y_test, feature_names)

# Print the full summary report.
# analyzer.summary()

# Plot SHAP-based importance.
# analyzer.plot_importance('shap')

# Explain a single sample.
# analyzer.explain_sample(0)

核心知识点总结

特征重要性方法对比

| 方法 | 优点 | 缺点 | 适用场景 |
| --- | --- | --- | --- |
| Split 重要性 | 计算快,直观 | 偏向高基数特征 | 快速筛选 |
| Gain 重要性 | 反映真实贡献 | 略慢 | 主要参考 |
| 排列重要性 | 模型无关,可靠 | 计算慢 | 验证重要性 |
| SHAP 值 | 理论严谨,可解释 | 计算慢 | 深入分析 |

特征选择策略

第一步: 快速筛选
├── 使用 Gain 重要性
├── 移除重要性 < 1% 的特征
└── 通常可减少 30-50% 特征

第二步: 共线性处理
├── 计算 VIF
├── 在高共线性对中保留重要性更高的
└── 通常可再减少 10-20% 特征

第三步: 验证
├── 使用排列重要性确认
├── 对比简化前后模型性能
└── 确保性能损失 < 5%

稳定性检查

  1. 时间稳定性: 特征重要性不应随时间剧烈变化
  2. 交叉验证稳定性: 不同折的重要性排名应一致
  3. 方法一致性: 不同方法的重要性排名应大致相同

下一节: 06-实战案例.md - 完整的端到端实战项目。