2025年1月26日

Enhanced エンジニアチーム

技術記事

【Python×LightGBM入門】第3回：回帰問題とパラメータチューニング

Python機械学習LightGBM回帰分析Optunaハイパーパラメータ

はじめに

前回は分類問題への適用方法を学びました。今回は回帰問題への適用と、モデルの性能を最大化するためのパラメータチューニングについて詳しく解説します。特に、最新のAutoMLライブラリ「Optuna」を使った効率的なチューニング方法を紹介します。

回帰問題のデータセット準備

今回は住宅価格予測を題材にします。これは実務でもよく扱われる回帰問題の典型例です。

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# データセットの生成（実際の案件ではCSVファイルなどから読み込む）
np.random.seed(42)
n_samples = 5000

# 住宅データの特徴量を生成
data = pd.DataFrame({
    'area': np.random.uniform(50, 300, n_samples),  # 面積（㎡）
    'rooms': np.random.randint(1, 6, n_samples),    # 部屋数
    'age': np.random.randint(0, 50, n_samples),     # 築年数
    'distance_station': np.random.exponential(2, n_samples),  # 駅からの距離（km）
    'distance_center': np.random.uniform(1, 30, n_samples),   # 都心からの距離（km）
    'floor': np.random.randint(1, 20, n_samples),    # 階数
    'total_floors': np.random.randint(3, 30, n_samples),  # 建物の総階数
    'parking': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),  # 駐車場有無
    'south_facing': np.random.choice([0, 1], n_samples, p=[0.6, 0.4]),  # 南向きか
    'renovation': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),  # リノベーション済みか
    'structure': np.random.choice(['RC', 'SRC', 'Wood'], n_samples, p=[0.5, 0.3, 0.2]),  # 構造
    'district': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_samples),  # 地区
})

# 価格を生成（実際の価格に近い関係性を持たせる）
data['price'] = (
    200 * data['area'] 
    + 500 * data['rooms'] 
    - 50 * data['age'] 
    - 300 * data['distance_station'] 
    - 100 * data['distance_center']
    + 50 * data['floor']
    + 1000 * data['parking']
    + 500 * data['south_facing']
    + 2000 * data['renovation']
    + np.random.normal(0, 5000, n_samples)  # ノイズ
) / 10000  # 単位：万円

# 外れ値の除去
data = data[(data['price'] > 0) & (data['price'] < 20000)]

print(f"データセットのサイズ: {data.shape}")
print(f"価格の統計情報:")
print(data['price'].describe())

探索的データ分析（EDA）

# 価格の分布を確認
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# ヒストグラム
axes[0].hist(data['price'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('価格（万円）')
axes[0].set_ylabel('頻度')
axes[0].set_title('価格の分布')

# ボックスプロット
axes[1].boxplot(data['price'])
axes[1].set_ylabel('価格（万円）')
axes[1].set_title('価格のボックスプロット')

plt.tight_layout()
plt.show()

# 相関行列の可視化
numeric_cols = ['area', 'rooms', 'age', 'distance_station', 'distance_center', 
                'floor', 'total_floors', 'parking', 'south_facing', 'renovation', 'price']
correlation_matrix = data[numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('特徴量の相関行列')
plt.tight_layout()
plt.show()

# 価格との相関が強い特徴量
price_corr = correlation_matrix['price'].sort_values(ascending=False)
print("価格との相関係数:")
print(price_corr)

データの前処理

from sklearn.preprocessing import LabelEncoder

# カテゴリ変数のエンコーディング
categorical_cols = ['structure', 'district']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col + '_encoded'] = le.fit_transform(data[col])
    label_encoders[col] = le

# 特徴量エンジニアリング
data['area_per_room'] = data['area'] / data['rooms']  # 1部屋あたりの面積
data['floor_ratio'] = data['floor'] / data['total_floors']  # 階数の比率
data['is_high_floor'] = (data['floor'] > 10).astype(int)  # 高層階フラグ
data['is_new'] = (data['age'] < 5).astype(int)  # 新築フラグ

# 特徴量とターゲットの分離
feature_cols = ['area', 'rooms', 'age', 'distance_station', 'distance_center', 
                'floor', 'total_floors', 'parking', 'south_facing', 'renovation',
                'structure_encoded', 'district_encoded', 'area_per_room', 
                'floor_ratio', 'is_high_floor', 'is_new']

X = data[feature_cols]
y = data['price']

# 訓練データとテストデータの分割
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"訓練データ: {X_train.shape}")
print(f"テストデータ: {X_test.shape}")

基本的な回帰モデルの構築

# LightGBMデータセットの作成
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 基本パラメータ
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'random_state': 42
}

# モデルの学習
print("基本モデルの学習...")
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# 予測と評価
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# 評価指標
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n基本モデルの評価指標:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.4f}")

Optunaを使ったハイパーパラメータチューニング

Optunaのインストール

pip install optuna

チューニングの実装

import optuna
from optuna.integration import LightGBMPruningCallback

def objective(trial):
    # チューニングするパラメータを定義
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'verbose': -1,
        'random_state': 42
    }
    
    # 交差検証
    cv_scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
        X_fold_train = X_train.iloc[train_idx]
        X_fold_valid = X_train.iloc[valid_idx]
        y_fold_train = y_train.iloc[train_idx]
        y_fold_valid = y_train.iloc[valid_idx]
        
        train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
        valid_data = lgb.Dataset(X_fold_valid, label=y_fold_valid, reference=train_data)
        
        # Pruning callbackを使用して効率化
        pruning_callback = LightGBMPruningCallback(trial, 'rmse')
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(0),
                pruning_callback
            ]
        )
        
        # 検証データでの予測
        y_pred = model.predict(X_fold_valid, num_iteration=model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_fold_valid, y_pred))
        cv_scores.append(rmse)
    
    return np.mean(cv_scores)

# Optunaでの最適化
print("ハイパーパラメータチューニングを開始...")
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"\n最適なパラメータ:")
print(study.best_params)
print(f"\n最適なRMSE: {study.best_value:.2f}")

最適化されたモデルの評価

# 最適なパラメータでモデルを再学習
best_params = study.best_params
best_params.update({
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'verbose': 0,
    'random_state': 42
})

# 全訓練データで学習
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

best_model = lgb.train(
    best_params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# 予測と評価
y_pred_best = best_model.predict(X_test, num_iteration=best_model.best_iteration)

# 評価指標
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"\n最適化モデルの評価指標:")
print(f"RMSE: {rmse_best:.2f} (改善: {rmse - rmse_best:.2f})")
print(f"MAE: {mae_best:.2f} (改善: {mae - mae_best:.2f})")
print(f"R²: {r2_best:.4f} (改善: {r2_best - r2:.4f})")

予測結果の可視化

# 予測値と実測値の比較
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 散布図
axes[0, 0].scatter(y_test, y_pred_best, alpha=0.5)
axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 0].set_xlabel('実測値（万円）')
axes[0, 0].set_ylabel('予測値（万円）')
axes[0, 0].set_title('予測値 vs 実測値')

# 残差プロット
residuals = y_test - y_pred_best
axes[0, 1].scatter(y_pred_best, residuals, alpha=0.5)
axes[0, 1].axhline(y=0, color='r', linestyle='--')
axes[0, 1].set_xlabel('予測値（万円）')
axes[0, 1].set_ylabel('残差（万円）')
axes[0, 1].set_title('残差プロット')

# 残差のヒストグラム
axes[1, 0].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('残差（万円）')
axes[1, 0].set_ylabel('頻度')
axes[1, 0].set_title('残差の分布')

# Q-Qプロット
from scipy import stats
stats.probplot(residuals, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Qプロット')

plt.tight_layout()
plt.show()

# 誤差の統計
print(f"\n残差の統計情報:")
print(f"平均: {residuals.mean():.2f}")
print(f"標準偏差: {residuals.std():.2f}")
print(f"最小値: {residuals.min():.2f}")
print(f"最大値: {residuals.max():.2f}")

Optunaの最適化履歴の可視化

# 最適化履歴
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

# パラメータの重要度
fig = optuna.visualization.plot_param_importances(study)
fig.show()

# パラメータの関係性
fig = optuna.visualization.plot_contour(study, params=['num_leaves', 'learning_rate'])
fig.show()

高度なパラメータチューニングテクニック

1. ベイズ最適化の設定

# より高度なサンプラーの使用
from optuna.samplers import CmaEsSampler

study_cmaes = optuna.create_study(
    direction='minimize',
    sampler=CmaEsSampler(seed=42)
)

# 探索空間を絞った最適化
def objective_refined(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        # 前回の最適化結果を参考に範囲を絞る
        'num_leaves': trial.suggest_int('num_leaves', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.15),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.95),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 0.95),
        'bagging_freq': trial.suggest_int('bagging_freq', 3, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'verbose': -1,
        'random_state': 42
    }
    
    # 簡略化した評価（3-fold CV）
    cv_scores = []
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    
    for train_idx, valid_idx in kf.split(X_train):
        X_fold_train = X_train.iloc[train_idx]
        X_fold_valid = X_train.iloc[valid_idx]
        y_fold_train = y_train.iloc[train_idx]
        y_fold_valid = y_train.iloc[valid_idx]
        
        train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
        valid_data = lgb.Dataset(X_fold_valid, label=y_fold_valid)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=500,
            callbacks=[lgb.early_stopping(30), lgb.log_evaluation(0)]
        )
        
        y_pred = model.predict(X_fold_valid, num_iteration=model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_fold_valid, y_pred))
        cv_scores.append(rmse)
    
    return np.mean(cv_scores)

2. カテゴリ変数の最適な扱い方

# カテゴリ変数のインデックスを指定
categorical_features = ['structure_encoded', 'district_encoded']
categorical_indices = [feature_cols.index(col) for col in categorical_features]

# カテゴリ変数を考慮したパラメータ
params_with_cat = best_params.copy()
params_with_cat.update({
    'categorical_feature': categorical_indices,
    'min_data_per_group': 100,  # カテゴリごとの最小データ数
    'cat_smooth': 10,  # カテゴリのスムージング
})

実務的なTips

1. 対数変換による精度向上

# ターゲット変数の対数変換
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# 対数変換したターゲットで学習
train_data_log = lgb.Dataset(X_train, label=y_train_log)
valid_data_log = lgb.Dataset(X_test, label=y_test_log, reference=train_data_log)

model_log = lgb.train(
    best_params,
    train_data_log,
    valid_sets=[train_data_log, valid_data_log],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)

# 予測値を元のスケールに戻す
y_pred_log = model_log.predict(X_test, num_iteration=model_log.best_iteration)
y_pred_original = np.expm1(y_pred_log)

rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_original))
print(f"対数変換モデルのRMSE: {rmse_log:.2f}")

2. 外れ値に強いモデル

# Huber損失を使用（外れ値に強い）
params_huber = best_params.copy()
params_huber.update({
    'objective': 'huber',
    'alpha': 0.9,  # Huberのパラメータ
})

model_huber = lgb.train(
    params_huber,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)

まとめ

今回は、LightGBMを使った回帰問題の実装と、Optunaによる効率的なパラメータチューニングについて学びました。重要なポイントは：

回帰問題の評価指標: RMSE、MAE、R²など複数の指標で評価
Optunaの活用: 効率的なハイパーパラメータ探索
交差検証: 過学習を防ぎ、汎化性能を向上
可視化の重要性: 残差分析による問題の発見
実務的なテクニック: 対数変換、外れ値対策など

次回は「特徴量エンジニアリングと重要度分析」について、より高度な特徴量の作成方法を解説します。

演習問題

異なる目的関数（mae, mapeなど）を試して結果を比較してみましょう
dartやgossなどの異なるブースティングタイプを試してみましょう
アンサンブル（複数モデルの平均）で精度向上を図ってみましょう

ご質問やフィードバックがございましたら、ぜひコメント欄でお知らせください。次回もお楽しみに！

記事をシェア

【Python×LightGBM入門】第6回：実践的な応用例とベストプラクティス

LightGBMを実務で活用するための実践的な応用例とベストプラクティス。大規模データ処理、リアルタイム予測、本番環境での運用方法について詳しく解説します。...

【Python×LightGBM入門】第5回：交差検証とアンサンブル学習

LightGBMで堅牢な予測モデルを構築するための交差検証手法と、複数モデルを組み合わせるアンサンブル学習について実践的に解説します。...

【Python×LightGBM入門】第4回：特徴量エンジニアリングと重要度分析

LightGBMで高精度な予測モデルを構築するための特徴量エンジニアリング手法と、SHAP値を使った高度な特徴量重要度分析について解説します。...

Tech Blog【Python×LightGBM入門】第3回：回帰問題とパラメータチューニング