特征重要性分析
目录
1. LightGBM 内置重要性
1.1 两种重要性类型
LightGBM 提供两种特征重要性:
| 类型 | 说明 | 计算方式 | 优点 | 缺点 |
|---|---|---|---|---|
| split | 分裂次数 | 特征被用作分裂点的次数 | 直观,表示使用频率 | 偏向高基数特征 |
| gain | 信息增益 | 特征分裂带来的总增益 | 更反映贡献 | 计算稍慢 |
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# 生成示例数据
np.random.seed(42)
X, y = make_regression(
n_samples=5000,
n_features=20,
n_informative=8,
noise=0.1,
random_state=42
)
# 特征命名
feature_names = [f'factor_{i}' for i in range(20)]
# 训练模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = lgb.LGBMRegressor(
n_estimators=200,
max_depth=6,
learning_rate=0.05,
random_state=42,
verbose=-1
)
model.fit(X_train, y_train)
# 获取特征重要性
importance_split = model.booster_.feature_importance(importance_type='split')
importance_gain = model.booster_.feature_importance(importance_type='gain')
# 组织成 DataFrame
importance_df = pd.DataFrame({
'feature': feature_names,
'importance_split': importance_split,
'importance_gain': importance_gain
})
# 排序
importance_df_split = importance_df.sort_values('importance_split', ascending=False)
importance_df_gain = importance_df.sort_values('importance_gain', ascending=False)
print("=== Split 重要性 (Top 10) ===")
print(importance_df_split.head(10)[['feature', 'importance_split']])
print("\n=== Gain 重要性 (Top 10) ===")
print(importance_df_gain.head(10)[['feature', 'importance_gain']])1.2 可视化内置重要性
import matplotlib.pyplot as plt
def plot_feature_importance(importance_df, importance_type='split', top_n=15):
"""绘制特征重要性"""
df_sorted = importance_df.sort_values(importance_type, ascending=False).head(top_n)
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(df_sorted)))
ax.barh(range(len(df_sorted)), df_sorted[f'importance_{importance_type}'], color=colors)
ax.set_yticks(range(len(df_sorted)))
ax.set_yticklabels(df_sorted['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Feature Importance ({importance_type.capitalize()})')
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
plot_feature_importance(importance_df, 'split')
plot_feature_importance(importance_df, 'gain')1.3 归一化重要性
def normalize_importance(importance_df, importance_type='gain'):
    """Express feature importances as percentages of their total.

    Returns a copy of *importance_df* with an added
    'importance_<type>_pct' column, sorted by that column descending.
    """
    src_col = f'importance_{importance_type}'
    pct_col = f'{src_col}_pct'
    out = importance_df.copy()
    out[pct_col] = out[src_col] / out[src_col].sum() * 100
    return out.sort_values(pct_col, ascending=False)
# 归一化
importance_norm = normalize_importance(importance_df, 'gain')
print("归一化 Gain 重要性 (Top 10):")
print(importance_norm.head(10)[['feature', 'importance_gain_pct']].round(2))2. 排列重要性
2.1 原理
排列重要性(Permutation Importance)通过打乱特征值来评估其重要性:
- 计算基准模型性能
- 对某个特征的值进行随机打乱
- 重新计算模型性能
- 性能下降越多 = 特征越重要
优势:
- 不依赖于模型内部实现
- 可以用于任何模型
- 更能反映特征的真实贡献
2.2 实现排列重要性
from sklearn.metrics import mean_squared_error
def permutation_importance(model, X, y, metric=None,
                           n_repeats=5, random_state=42,
                           feature_names=None):
    """Model-agnostic permutation importance.

    Shuffles one feature column at a time and measures how much the
    score degrades relative to the un-permuted baseline.

    Parameters:
        model: fitted model exposing .predict(X)
        X: 2-D feature array (n_samples, n_features)
        y: true targets
        metric: score function metric(y_true, y_pred), lower is better;
            defaults to mean squared error
        n_repeats: number of shuffles per feature (results are averaged)
        random_state: seed for reproducible permutations
        feature_names: optional labels; defaults to 'feature_<i>'

    Returns:
        (importance_df, baseline_score): DataFrame with columns
        'feature', 'importance_mean', 'importance_std', sorted by mean
        importance descending, plus the baseline metric value.
        (The original docstring claimed a single return value.)
    """
    if metric is None:
        # Default to MSE so this function has no hard sklearn dependency.
        metric = lambda y_true, y_pred: float(
            np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
    # Local RandomState: reproducible without mutating the global RNG.
    rng = np.random.RandomState(random_state)
    # Baseline performance on the untouched data.
    baseline_score = metric(y, model.predict(X))
    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(n_features)]
    results = []
    for feature_idx in range(n_features):
        importance_scores = []
        for _ in range(n_repeats):
            # Copy the data and shuffle only the current feature column.
            X_permuted = X.copy()
            perm_indices = rng.permutation(len(X))
            X_permuted[:, feature_idx] = X[perm_indices, feature_idx]
            permuted_score = metric(y, model.predict(X_permuted))
            # Importance = performance drop. The metric is lower-is-better,
            # so permuted - baseline >= 0 for genuinely useful features.
            importance_scores.append(permuted_score - baseline_score)
        results.append({
            'feature': feature_names[feature_idx],
            'importance_mean': np.mean(importance_scores),
            'importance_std': np.std(importance_scores)
        })
    importance_df = pd.DataFrame(results).sort_values('importance_mean',
                                                      ascending=False)
    return importance_df, baseline_score
# 计算排列重要性
perm_importance, baseline = permutation_importance(
model, X_test, y_test,
metric=mean_squared_error,
n_repeats=10
)
print(f"基准 MSE: {baseline:.4f}\n")
print("排列重要性 (Top 10):")
print(perm_importance.head(10))2.3 可视化排列重要性
def plot_permutation_importance(perm_importance, top_n=15):
"""绘制排列重要性"""
df = perm_importance.head(top_n).sort_values('importance_mean')
fig, ax = plt.subplots(figsize=(10, 6))
y_positions = range(len(df))
ax.barh(
y_positions,
df['importance_mean'],
xerr=df['importance_std'],
color='steelblue',
alpha=0.8,
error_kw={'linewidth': 2, 'capsize': 3}
)
ax.set_yticks(y_positions)
ax.set_yticklabels(df['feature'])
ax.set_xlabel('Importance (Performance Drop)')
ax.set_title('Permutation Importance (Error Bars = Std Dev)')
ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
plot_permutation_importance(perm_importance)3. SHAP 值分析
3.1 SHAP 原理
SHAP(SHapley Additive exPlanations)基于博弈论的 Shapley 值:
- 每个特征的贡献 = 所有可能特征组合的边际贡献的加权平均
- 满足可加性:预测值 = 基准值 + 各特征 SHAP 值
3.2 SHAP 基础使用
# 安装 shap: pip install shap
import shap
# 计算 SHAP 值
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Summary plot - 全局特征重要性
shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot")
plt.tight_layout()
plt.show()
# 特征重要性排序
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame({
'feature': feature_names,
'shap_importance': mean_abs_shap
}).sort_values('shap_importance', ascending=False)
print("\nSHAP 特征重要性 (Top 10):")
print(shap_importance.head(10))3.3 SHAP 详解图
# Detailed summary plot
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
plot_type="layered", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Summary Plot (Layered)")
plt.tight_layout()
plt.show()
# Bar plot - 按重要性排序
shap.summary_plot(shap_values, X_test, feature_names=feature_names,
plot_type="bar", show=False)
plt.gcf().set_size_inches(10, 6)
plt.title("SHAP Feature Importance (Bar)")
plt.tight_layout()
plt.show()3.4 SHAP 依赖图
分析单个特征的影响和交互作用。
def plot_shap_dependence(shap_values, X, feature_idx, feature_names):
"""绘制 SHAP 依赖图"""
feature_name = feature_names[feature_idx]
shap.dependence_plot(
feature_idx,
shap_values,
X,
feature_names=feature_names,
show=False
)
plt.gcf().set_size_inches(8, 6)
plt.title(f"SHAP Dependence Plot: {feature_name}")
plt.tight_layout()
plt.show()
# 绘制最重要特征的依赖图
top_feature_idx = shap_importance.iloc[0]['feature'].replace('feature_', '')
top_feature_idx = int(top_feature_idx)
plot_shap_dependence(shap_values, X_test, top_feature_idx, feature_names)3.5 SHAP 单样本解释
# Force plot - 单样本解释
sample_idx = 0
# 预测值
pred = model.predict(X_test[sample_idx:sample_idx+1])[0]
actual = y_test[sample_idx]
print(f"样本 {sample_idx}:")
print(f" 实际值: {actual:.4f}")
print(f" 预测值: {pred:.4f}")
print(f" 误差: {abs(pred - actual):.4f}")
# Force plot
shap.force_plot(
explainer.expected_value,
shap_values[sample_idx],
X_test[sample_idx],
feature_names=feature_names,
matplotlib=True,
show=False
)
plt.gcf().set_size_inches(12, 3)
plt.title(f"SHAP Force Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()
# Waterfall plot - 更清晰的单样本解释
shap.waterfall_plot(
shap.Explanation(
values=shap_values[sample_idx],
base_values=explainer.expected_value,
data=X_test[sample_idx],
feature_names=feature_names
),
show=False
)
plt.gcf().set_size_inches(10, 6)
plt.title(f"SHAP Waterfall Plot - Sample {sample_idx}")
plt.tight_layout()
plt.show()4. 全局 vs 局部解释
4.1 全局解释
目的: 理解模型整体的行为模式
方法:
- 特征重要性排名
- SHAP summary plot
- 特征影响分布
def global_feature_analysis(model, X, y, feature_names, explainer=None):
"""全局特征分析"""
n_features = X.shape[1]
# 1. 内置重要性
split_importance = model.booster_.feature_importance('split')
gain_importance = model.booster_.feature_importance('gain')
# 2. SHAP 重要性
if explainer is None:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap_importance = np.abs(shap_values).mean(axis=0)
# 3. 排列重要性
perm_imp, _ = permutation_importance(model, X, y, n_repeats=5)
# 汇总
global_df = pd.DataFrame({
'feature': feature_names,
'split_importance': split_importance,
'gain_importance': gain_importance,
'shap_importance': shap_importance,
'permutation_importance': perm_imp['importance_mean'].values
})
# 归一化
for col in ['split_importance', 'gain_importance', 'shap_importance', 'permutation_importance']:
global_df[f'{col}_norm'] = global_df[col] / global_df[col].sum()
return global_df
# 全局分析
global_analysis = global_feature_analysis(model, X_test, y_test, feature_names)
print("全局特征分析 (归一化后):")
print(global_analysis[['feature', 'gain_importance_norm',
'shap_importance_norm', 'permutation_importance_norm']]
.sort_values('gain_importance_norm', ascending=False)
.head(10)
.round(4))4.2 局部解释
目的: 理解单个预测的决策过程
def local_explanation(model, explainer, X, sample_idx, feature_names):
    """Explain one prediction: per-feature SHAP contributions, largest first.

    Prints the base value, the prediction, the additivity check
    (base value + sum of SHAP values), and the top-10 contributions;
    returns the full per-feature DataFrame.
    """
    row = X[sample_idx:sample_idx + 1]
    # Prediction for this sample.
    pred = model.predict(row)[0]
    # Per-feature SHAP contributions for this sample.
    contribs = explainer.shap_values(row)[0]
    base_value = explainer.expected_value
    # One row per feature, ordered by absolute contribution.
    explanation = pd.DataFrame({
        'feature': feature_names,
        'value': X[sample_idx],
        'shap_value': contribs,
    })
    explanation = explanation.sort_values('shap_value', key=abs, ascending=False)
    print(f"=== 样本 {sample_idx} 的局部解释 ===")
    print(f"基准值: {base_value:.4f}")
    print(f"预测值: {pred:.4f}")
    print(f"SHAP 总和: {base_value + contribs.sum():.4f}")
    print(f"\n特征贡献 (Top 10):")
    print(explanation.head(10).to_string(index=False))
    return explanation
# 局部解释
local_expl = local_explanation(model, explainer, X_test, 0, feature_names)5. 特征重要性稳定性
5.1 时间稳定性分析
def temporal_feature_importance(model, X_list, y_list, feature_names):
    """Track how gain importance shifts across consecutive data periods.

    Parameters:
        model: estimator template (cloned and refit for each period)
        X_list: per-period feature matrices
        y_list: per-period targets
        feature_names: feature labels (kept for interface symmetry)

    Returns:
        DataFrame with one row per period, columns 'period' plus
        'feat_0', 'feat_1', ... holding the per-period gain importances.
    """
    from sklearn.base import clone

    records = []
    for period, (X_period, y_period) in enumerate(zip(X_list, y_list)):
        # Refit a fresh copy of the model on this period only.
        fitted = clone(model)
        fitted.fit(X_period, y_period)
        gains = fitted.booster_.feature_importance('gain')
        row = {'period': period}
        row.update({f'feat_{i}': g for i, g in enumerate(gains)})
        records.append(row)
    importance_df = pd.DataFrame(records)

    # Plot the 5 features with the highest average importance.
    top_features = (importance_df.drop('period', axis=1)
                    .mean().sort_values(ascending=False).head(5).index)
    fig, ax = plt.subplots(figsize=(12, 5))
    for feat in top_features:
        ax.plot(importance_df['period'], importance_df[feat], marker='o', label=feat)
    ax.set_xlabel('Period')
    ax.set_ylabel('Importance (Gain)')
    ax.set_title('Feature Importance Over Time')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return importance_df
# 模拟不同时期的数据
n_periods = 5
X_list = []
y_list = []
for i in range(n_periods):
start = i * 200
end = start + 500
X_list.append(X[start:end])
y_list.append(y[start:end])
# 分析
temporal_imp = temporal_feature_importance(model, X_list, y_list, feature_names)5.2 交叉验证稳定性
def cv_feature_importance(model, X, y, cv=5, random_state=42, names=None):
    """Gain importance per CV fold, plus stability statistics.

    Parameters:
        model: estimator template (cloned and refit per fold)
        X, y: training data (indexed positionally)
        cv: number of folds
        random_state: kept for interface compatibility; unused because
            KFold runs with shuffle=False to preserve temporal order
        names: feature labels; defaults to the module-level
            ``feature_names`` (the original implicitly read that global)

    Returns:
        (importance_df, importance_stats): per-fold importances and
        per-feature mean/std/coefficient-of-variation, sorted by mean.
    """
    from sklearn.model_selection import KFold
    from sklearn.base import clone
    if names is None:
        names = feature_names  # module-level default, as in the original usage
    kf = KFold(n_splits=cv, shuffle=False)  # no shuffle: keep time order
    importance_list = []
    for fold, (train_idx, _) in enumerate(kf.split(X)):
        model_fold = clone(model)
        model_fold.fit(X[train_idx], y[train_idx])
        gain_importance = model_fold.booster_.feature_importance('gain')
        importance_list.append({
            'fold': fold,
            **{f'feat_{i}': imp for i, imp in enumerate(gain_importance)}
        })
    importance_df = pd.DataFrame(importance_list)
    # Per-feature stability statistics across folds.
    fold_values = importance_df.drop('fold', axis=1)
    importance_stats = pd.DataFrame({
        'feature': names,
        'mean': fold_values.mean().values,
        'std': fold_values.std().values,
        # Coefficient of variation: high values flag unstable features.
        'cv': fold_values.std().values / fold_values.mean().values
    }).sort_values('mean', ascending=False)
    print("特征重要性交叉验证统计 (Top 10):")
    print(importance_stats.head(10).round(4))
    return importance_df, importance_stats
# 分析
cv_imp, cv_stats = cv_feature_importance(model, X_train, y_train, cv=5)
# 可视化
top_features = cv_stats.head(5)['feature'].values
fig, ax = plt.subplots(figsize=(12, 5))
for feat in top_features:
feat_col = f'feat_{feature_names.index(feat)}'
ax.plot(range(1, 6), cv_imp[feat_col], marker='o', label=feat)
ax.set_xlabel('Fold')
ax.set_ylabel('Importance (Gain)')
ax.set_title('Feature Importance Across CV Folds')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()6. 多重共线性检测
6.1 VIF(方差膨胀因子)
检测特征间的共线性。
$\mathrm{VIF}_i = \dfrac{1}{1 - R_i^2}$,其中 $R_i^2$ 是用其他特征预测特征 $i$ 所得回归的判定系数 $R^2$。
| VIF 值 | 共线性程度 |
|---|---|
| < 5 | 可接受 |
| 5-10 | 中等共线性 |
| > 10 | 严重共线性 |
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
def calculate_vif(X, feature_names):
    """Variance inflation factor (VIF) for every feature.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 is the coefficient of
    determination from regressing feature i on all other features
    (ordinary least squares with an intercept).

    Parameters:
        X: 2-D feature matrix (n_samples, n_features)
        feature_names: labels, one per column

    Returns:
        DataFrame with 'feature' and 'VIF' columns, sorted by VIF
        descending. Perfectly collinear or constant features get inf.
    """
    X = np.asarray(X, dtype=float)
    n_samples, n_features = X.shape
    vif_data = []
    for i in range(n_features):
        others = [j for j in range(n_features) if j != i]
        if not others:
            # A single feature cannot be explained by the (empty) rest.
            vif = np.inf
        else:
            # OLS with intercept via least squares — same fit as
            # sklearn's LinearRegression, but dependency-free.
            design = np.column_stack([X[:, others], np.ones(n_samples)])
            target = X[:, i]
            coef, *_ = np.linalg.lstsq(design, target, rcond=None)
            resid = target - design @ coef
            ss_res = float(np.sum(resid ** 2))
            ss_tot = float(np.sum((target - target.mean()) ** 2))
            # Constant column: define R^2 = 1 so the VIF goes to inf.
            r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else 1.0
            vif = 1 / (1 - r2) if r2 < 1 else np.inf
        vif_data.append({
            'feature': feature_names[i],
            'VIF': vif
        })
    return pd.DataFrame(vif_data).sort_values('VIF', ascending=False)
# 计算 VIF
vif_df = calculate_vif(X_train, feature_names)
print("方差膨胀因子 (VIF):")
print(vif_df.head(15).round(2))
# 标记高 VIF 特征
vif_df['high_collinearity'] = vif_df['VIF'] > 10
high_vif = vif_df[vif_df['high_collinearity']]
if len(high_vif) > 0:
print(f"\n发现 {len(high_vif)} 个高共线性特征 (VIF > 10):")
print(high_vif[['feature', 'VIF']])6.2 相关系数矩阵
def correlation_analysis(X, feature_names, threshold=0.8):
    """Find pairs of features whose absolute correlation exceeds *threshold*.

    Parameters:
        X: 2-D feature matrix (n_samples, n_features)
        feature_names: labels, one per column
        threshold: absolute-correlation cutoff for reporting a pair

    Returns:
        (corr_df, high_corr_df): the full correlation matrix as a labeled
        DataFrame, and a DataFrame of high-correlation pairs
        ('feature_1', 'feature_2', 'correlation') sorted descending.
    """
    corr_matrix = np.corrcoef(X.T)
    corr_df = pd.DataFrame(corr_matrix, index=feature_names, columns=feature_names)
    # Scan the upper triangle only, so each unordered pair appears once.
    high_corr_pairs = []
    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            corr_val = abs(corr_df.iloc[i, j])
            if corr_val > threshold:
                high_corr_pairs.append({
                    'feature_1': feature_names[i],
                    'feature_2': feature_names[j],
                    'correlation': corr_val
                })
    # BUG FIX: sort_values('correlation') raised KeyError when no pair
    # exceeded the threshold (an empty DataFrame has no columns).
    # Building the frame with explicit columns makes the empty case safe.
    high_corr_df = pd.DataFrame(
        high_corr_pairs, columns=['feature_1', 'feature_2', 'correlation']
    ).sort_values('correlation', ascending=False)
    print(f"相关性 > {threshold} 的特征对:")
    print(high_corr_df if len(high_corr_df) > 0 else "无")
    return corr_df, high_corr_df
# 相关性分析
corr_matrix, high_corr = correlation_analysis(X_train, feature_names, threshold=0.8)
# 可视化相关系数矩阵
import seaborn as sns
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
xticklabels=feature_names, yticklabels=feature_names, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()7. 冗余特征剔除
7.1 基于重要性和共线性剔除
def remove_redundant_features(model, X, y, feature_names,
                              importance_threshold=0.01,
                              vif_threshold=10,
                              corr_threshold=0.9):
    """Drop redundant features by combining importance, correlation and VIF.

    Strategy:
      1. Remove features whose normalized gain importance is below
         ``importance_threshold``.
      2. Within each highly correlated pair (|corr| > ``corr_threshold``),
         keep only the more important feature.
      3. For features with VIF > ``vif_threshold``, remove them only if a
         moderately correlated (|corr| > 0.7) and more important
         substitute survives.

    Parameters:
        model: fitted LightGBM model (supplies the gain importances)
        X: feature matrix used for VIF and correlation (y is unused here)
        y: targets (kept for interface symmetry; not read)
        feature_names: labels, one per column of X
        importance_threshold: minimum normalized importance to keep
        vif_threshold: VIF above which a feature is considered collinear
        corr_threshold: absolute correlation defining a redundant pair

    Returns:
        (keep_indices, remove_indices): column indices kept / removed.
    """
    # 1. Normalized gain importance per feature.
    importance = model.booster_.feature_importance('gain')
    importance_norm = importance / importance.sum()
    # 2. VIF per feature (delegates to the sibling calculate_vif helper).
    vif_df = calculate_vif(X, feature_names)
    # 3. Pairwise correlation matrix.
    corr_matrix = np.corrcoef(X.T)
    # Column indices scheduled for removal.
    features_to_remove = set()
    # Rule 1: drop low-importance features.
    for i, (feat, imp) in enumerate(zip(feature_names, importance_norm)):
        if imp < importance_threshold:
            features_to_remove.add(i)
    # Rule 2: in each high-correlation pair, keep the more important one.
    # NOTE(review): iteration order matters — once a feature is removed it
    # is skipped in later pairs, so results can depend on column order.
    for i in range(len(feature_names)):
        if i in features_to_remove:
            continue
        for j in range(i + 1, len(feature_names)):
            if j in features_to_remove:
                continue
            if abs(corr_matrix[i, j]) > corr_threshold:
                # Remove whichever of the pair has the lower raw importance.
                if importance[i] < importance[j]:
                    features_to_remove.add(i)
                else:
                    features_to_remove.add(j)
    # Rule 3: a high-VIF feature is removed only when some surviving
    # feature is both correlated with it (> 0.7) and more important.
    for _, row in vif_df[vif_df['VIF'] > vif_threshold].iterrows():
        feat_idx = feature_names.index(row['feature'])
        if feat_idx not in features_to_remove:
            # Look for a higher-importance correlated substitute.
            for j in range(len(feature_names)):
                if j != feat_idx and j not in features_to_remove:
                    if abs(corr_matrix[feat_idx, j]) > 0.7:
                        if importance[j] > importance[feat_idx]:
                            features_to_remove.add(feat_idx)
                            break
    # Summarize the selection.
    keep_indices = [i for i in range(len(feature_names)) if i not in features_to_remove]
    remove_indices = list(features_to_remove)
    print(f"原始特征数: {len(feature_names)}")
    print(f"保留特征数: {len(keep_indices)}")
    print(f"移除特征数: {len(remove_indices)}")
    if len(remove_indices) > 0:
        print("\n移除的特征:")
        for idx in sorted(remove_indices):
            print(f" {feature_names[idx]}: 重要性={importance_norm[idx]:.4f}")
    return keep_indices, remove_indices
# 剔除冗余特征
keep_idx, remove_idx = remove_redundant_features(
model, X_train, y_train, feature_names,
importance_threshold=0.01,
vif_threshold=10,
corr_threshold=0.9
)
# 用保留的特征重新训练
X_train_reduced = X_train[:, keep_idx]
X_test_reduced = X_test[:, keep_idx]
reduced_names = [feature_names[i] for i in keep_idx]
model_reduced = lgb.LGBMRegressor(
n_estimators=200,
max_depth=6,
learning_rate=0.05,
random_state=42,
verbose=-1
)
model_reduced.fit(X_train_reduced, y_train)
# 对比性能
from sklearn.metrics import mean_squared_error
mse_original = mean_squared_error(y_test, model.predict(X_test))
mse_reduced = mean_squared_error(y_test, model_reduced.predict(X_test_reduced))
print(f"\n性能对比:")
print(f" 原始模型 MSE: {mse_original:.4f}")
print(f" 简化模型 MSE: {mse_reduced:.4f}")
print(f" 变化: {(mse_reduced - mse_original) / mse_original * 100:+.2f}%")8. 完整分析报告模板
8.1 综合特征分析报告
def generate_feature_analysis_report(model, X_train, X_test, y_train, y_test,
                                     feature_names, explainer=None):
    """Print a full feature-analysis report and return its tables.

    Sections: gain importance, SHAP importance, permutation importance,
    VIF collinearity, high-correlation pairs, and a combined ranking.

    Returns:
        dict with all intermediate DataFrames ('importance_df',
        'shap_df', 'perm_importance', 'vif_df', 'high_corr', 'rank_df').
    """
    print("=" * 60)
    print(" 特征分析报告")
    print("=" * 60)
    # 1. Built-in gain importance.
    print("\n【1. 特征重要性排名】\n")
    importance_gain = model.booster_.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'gain_importance': importance_gain,
        'gain_pct': importance_gain / importance_gain.sum() * 100
    }).sort_values('gain_importance', ascending=False)
    print(importance_df.head(15).to_string(index=False))
    # 2. SHAP importance (mean |SHAP| on the test set).
    print("\n【2. SHAP 特征重要性】\n")
    if explainer is None:
        explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap_imp = np.abs(shap_values).mean(axis=0)
    shap_df = pd.DataFrame({
        'feature': feature_names,
        'shap_importance': shap_imp,
        'shap_pct': shap_imp / shap_imp.sum() * 100
    }).sort_values('shap_importance', ascending=False)
    print(shap_df.head(15).to_string(index=False))
    # 3. Permutation importance.
    print("\n【3. 排列重要性】\n")
    perm_imp, baseline = permutation_importance(
        model, X_test, y_test,
        metric=mean_squared_error,
        n_repeats=10
    )
    print(f"基准 MSE: {baseline:.4f}")
    print("\nTop 15 重要特征:")
    print(perm_imp.head(15)[['feature', 'importance_mean']].to_string(index=False))
    # 4. Collinearity (VIF).
    print("\n【4. 共线性分析】\n")
    vif_df = calculate_vif(X_train, feature_names)
    high_vif = vif_df[vif_df['VIF'] > 10]
    if len(high_vif) > 0:
        print(f"发现 {len(high_vif)} 个高 VIF 特征:")
        print(high_vif[['feature', 'VIF']].to_string(index=False))
    else:
        print("未发现严重的共线性问题")
    # 5. Highly correlated pairs.
    print("\n【5. 高相关性特征对】\n")
    _, high_corr = correlation_analysis(X_train, feature_names, threshold=0.85)
    # 6. Combined ranking across all three methods.
    print("\n【6. 综合建议】\n")
    # BUG FIX: the ranks were previously computed from DataFrames already
    # sorted by importance, so the rank values no longer lined up with the
    # original feature order. Rank the unsorted arrays instead, and map
    # the permutation results ('feature_<i>') back to index order.
    perm_ordered = perm_imp.copy()
    perm_ordered['_idx'] = (perm_ordered['feature']
                            .str.rsplit('_', n=1).str[-1].astype(int))
    perm_ordered = perm_ordered.sort_values('_idx')
    rank_df = pd.DataFrame({
        'feature': feature_names,
        'gain_rank': pd.Series(importance_gain).rank(ascending=False).values,
        'shap_rank': pd.Series(shap_imp).rank(ascending=False).values,
        'perm_rank': perm_ordered['importance_mean'].rank(ascending=False).values
    })
    rank_df['avg_rank'] = rank_df[['gain_rank', 'shap_rank', 'perm_rank']].mean(axis=1)
    rank_df = rank_df.sort_values('avg_rank')
    print("综合重要性排名 (Top 10):")
    print(rank_df.head(10)[['feature', 'gain_rank', 'shap_rank', 'perm_rank', 'avg_rank']].to_string(index=False))
    # Recommended core features.
    top_features = rank_df.head(10)['feature'].tolist()
    print(f"\n建议保留的核心特征 (前10): {', '.join(top_features)}")
    # Candidates for removal: consistently low-ranked by all methods.
    suspicious = rank_df[rank_df['avg_rank'] > len(feature_names) * 0.8]['feature'].tolist()
    if len(suspicious) > 0:
        print(f"\n可能需要移除的特征: {', '.join(suspicious)}")
    print("\n" + "=" * 60)
    return {
        'importance_df': importance_df,
        'shap_df': shap_df,
        'perm_importance': perm_imp,
        'vif_df': vif_df,
        'high_corr': high_corr,
        'rank_df': rank_df
    }
# 生成报告
report = generate_feature_analysis_report(
model, X_train, X_test, y_train, y_test, feature_names
)8.2 交互式分析工具
class FeatureAnalyzer:
    """Interactive helper bundling the feature-analysis tools.

    Wraps a fitted LightGBM model and its train/test split, precomputes
    a SHAP TreeExplainer plus the test-set SHAP values, and exposes
    shortcuts for the full report, importance plots and per-sample
    explanations (delegating to the module-level helper functions).
    """

    def __init__(self, model, X_train, X_test, y_train, y_test, feature_names):
        # Keep references to the model and the data splits.
        self.model = model
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        # SHAP values are computed once up front for the test set.
        self.explainer = shap.TreeExplainer(model)
        self.shap_values = self.explainer.shap_values(X_test)

    def summary(self):
        """Print the full analysis report (module-level helper) and return its tables."""
        return generate_feature_analysis_report(
            self.model, self.X_train, self.X_test,
            self.y_train, self.y_test, self.feature_names,
            self.explainer
        )

    def plot_importance(self, method='gain', top_n=15):
        """Bar-plot the top-N features by 'gain', 'split' or 'shap' importance.

        Raises:
            ValueError: if *method* is not one of the three supported names.
        """
        if method == 'gain':
            importance = self.model.booster_.feature_importance('gain')
        elif method == 'split':
            importance = self.model.booster_.feature_importance('split')
        elif method == 'shap':
            importance = np.abs(self.shap_values).mean(axis=0)
        else:
            raise ValueError(f"Unknown method: {method}")
        # Sort ascending and take the tail: combined with invert_yaxis
        # below, the largest bar ends up at the top of the chart.
        df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance').tail(top_n)
        plt.figure(figsize=(10, 6))
        plt.barh(range(len(df)), df['importance'], color='steelblue')
        plt.yticks(range(len(df)), df['feature'])
        plt.xlabel('Importance')
        plt.title(f'Feature Importance ({method.capitalize()})')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()

    def explain_sample(self, sample_idx):
        """Explain one test-set prediction via the local_explanation helper."""
        return local_explanation(
            self.model, self.explainer, self.X_test,
            sample_idx, self.feature_names
        )

    def shap_summary(self):
        """SHAP summary (beeswarm) plot for the test set."""
        shap.summary_plot(
            self.shap_values, self.X_test,
            feature_names=self.feature_names
        )

    def shap_dependence(self, feature_idx):
        """SHAP dependence plot for one feature of the test set."""
        shap.dependence_plot(
            feature_idx, self.shap_values, self.X_test,
            feature_names=self.feature_names
        )
# Usage example
analyzer = FeatureAnalyzer(model, X_train, X_test, y_train, y_test, feature_names)
# Print the summary report
# analyzer.summary()
# Plot importance
# analyzer.plot_importance('shap')
# Explain a single sample
# analyzer.explain_sample(0)

核心知识点总结
特征重要性方法对比
| 方法 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| Split 重要性 | 计算快,直观 | 偏向高基数特征 | 快速筛选 |
| Gain 重要性 | 反映真实贡献 | 略慢 | 主要参考 |
| 排列重要性 | 模型无关,可靠 | 计算慢 | 验证重要性 |
| SHAP 值 | 理论严谨,可解释 | 计算慢 | 深入分析 |
特征选择策略
第一步: 快速筛选
├── 使用 Gain 重要性
├── 移除重要性 < 1% 的特征
└── 通常可减少 30-50% 特征
第二步: 共线性处理
├── 计算 VIF
├── 在高共线性对中保留重要性更高的
└── 通常可再减少 10-20% 特征
第三步: 验证
├── 使用排列重要性确认
├── 对比简化前后模型性能
└── 确保性能损失 < 5%
稳定性检查
- 时间稳定性: 特征重要性不应随时间剧烈变化
- 交叉验证稳定性: 不同折的重要性排名应一致
- 方法一致性: 不同方法的重要性排名应大致相同
下一节: 06-实战案例.md - 完整的端到端实战项目。