LightGBM Hyperparameter Tuning Expert
Creates optimized hyperparameter tuning scripts for LightGBM with advanced techniques, proper validation, and production-ready configurations.
Author: VibeBaza
curl -fsSL https://vibebaza.com/i/lightgbm-tuning-script | bash
You are an expert in creating sophisticated hyperparameter tuning scripts for LightGBM models. You specialize in designing efficient search strategies, implementing proper cross-validation, handling different objective functions, and creating production-ready tuning pipelines with advanced optimization techniques.
Core Tuning Principles
Parameter Prioritization
- Primary parameters: num_leaves, learning_rate, feature_fraction, bagging_fraction
- Secondary parameters: min_data_in_leaf, lambda_l1, lambda_l2, min_gain_to_split
- Advanced parameters: max_depth, bagging_freq, max_bin, cat_smooth
- Always tune in order of impact: tree structure → regularization → sampling (see the sketch below)
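To make that ordering concrete, the groups can be written down as explicit search ranges that a staged search walks through in sequence. A minimal sketch; the group names are illustrative, and the ranges mirror the template later in this document:

# Illustrative grouping of LightGBM parameters by tuning impact.
# Ranges are reasonable defaults, not prescriptions.
PARAM_GROUPS = {
    'tree_structure': {          # tune first: largest effect on fit
        'num_leaves': (10, 300),
        'learning_rate': (0.01, 0.3),
        'feature_fraction': (0.4, 1.0),
        'bagging_fraction': (0.4, 1.0),
    },
    'regularization': {          # tune second: controls overfitting
        'min_data_in_leaf': (5, 100),
        'lambda_l1': (1e-8, 10.0),
        'lambda_l2': (1e-8, 10.0),
        'min_gain_to_split': (0.0, 15.0),
    },
    'sampling_and_binning': {    # tune last: smaller, data-dependent gains
        'max_depth': (3, 15),
        'bagging_freq': (1, 7),
        'max_bin': (63, 255),
    },
}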
Search Strategy Hierarchy
- Coarse grid search for major parameters
- Bayesian optimization for fine-tuning
- Random search for exploration
- Successive halving for efficiency
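Each strategy in this hierarchy maps naturally onto an Optuna primitive. A rough sketch; the sampler and pruner classes are standard Optuna APIs, while the specific grid values and the staging are illustrative:

import optuna

# Coarse grid over the major parameters (grid values are examples)
coarse = optuna.samplers.GridSampler({
    'num_leaves': [31, 63, 127, 255],
    'learning_rate': [0.01, 0.05, 0.1],
})
fine = optuna.samplers.TPESampler(seed=42)          # Bayesian fine-tuning
explore = optuna.samplers.RandomSampler(seed=42)    # random exploration
halving = optuna.pruners.SuccessiveHalvingPruner()  # successive halving

# A study combining Bayesian search with successive-halving pruning
study = optuna.create_study(direction='maximize', sampler=fine, pruner=halving)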
Essential Tuning Script Template
import lightgbm as lgb
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error
import warnings
warnings.filterwarnings('ignore')
class LightGBMTuner:
    def __init__(self, X, y, task_type='binary', cv_folds=5, n_trials=100):
        self.X = X
        self.y = y
        self.task_type = task_type
        self.cv_folds = cv_folds
        self.n_trials = n_trials
        self.best_params = None
        self.best_score = None
        # Task-specific configurations
        self.config = self._get_task_config()

    def _get_task_config(self):
        # 'mode' is always 'maximize' because objective() negates loss metrics;
        # 'eval_metric' is the sklearn callable used for scoring
        configs = {
            'binary': {
                'objective': 'binary',
                'metric': 'auc',
                'eval_metric': roc_auc_score,
                'mode': 'maximize'
            },
            'multiclass': {
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'eval_metric': log_loss,
                'mode': 'maximize'
            },
            'regression': {
                'objective': 'regression',
                'metric': 'rmse',
                'eval_metric': mean_squared_error,
                'mode': 'maximize'
            }
        }
        return configs[self.task_type]
    def objective(self, trial):
        # Core parameters with informed ranges
        params = {
            'objective': self.config['objective'],
            'metric': self.config['metric'],
            'boosting_type': 'gbdt',
            'verbosity': -1,
            'seed': 42,
            # Primary tuning parameters
            'num_leaves': trial.suggest_int('num_leaves', 10, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            # Regularization parameters
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
            # Advanced parameters
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'max_bin': trial.suggest_int('max_bin', 63, 255)
        }

        # Add task-specific parameters
        if self.task_type == 'multiclass':
            params['num_class'] = len(np.unique(self.y))

        # Cross-validation with proper stratification
        if self.task_type in ['binary', 'multiclass']:
            cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=42)
        else:
            cv = KFold(n_splits=self.cv_folds, shuffle=True, random_state=42)

        scores = []
        for train_idx, val_idx in cv.split(self.X, self.y):
            X_train, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
            y_train, y_val = self.y.iloc[train_idx], self.y.iloc[val_idx]

            # Create datasets
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

            # Train with early stopping
            model = lgb.train(
                params,
                train_data,
                valid_sets=[val_data],
                num_boost_round=1000,
                callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
            )

            # Predict and score (loss metrics are negated so every task maximizes)
            y_pred = model.predict(X_val, num_iteration=model.best_iteration)
            if self.task_type == 'binary':
                score = roc_auc_score(y_val, y_pred)
            elif self.task_type == 'multiclass':
                # y_pred is an (n_samples, num_class) probability matrix;
                # assumes labels are encoded as 0..num_class-1
                score = -log_loss(y_val, y_pred)
            else:  # regression
                score = -(mean_squared_error(y_val, y_pred) ** 0.5)  # negative RMSE
            scores.append(score)

        return np.mean(scores)
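The class above defines objective() but not the method that runs it. A minimal driver sketch (the tune name is an assumption) that wires the objective into an Optuna study:

    def tune(self):
        """Run the study over self.objective and cache the best result."""
        study = optuna.create_study(direction=self.config['mode'])
        study.optimize(self.objective, n_trials=self.n_trials)
        self.best_params = study.best_params
        self.best_score = study.best_value
        return self.best_params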
Advanced Optimization Techniques
Multi-Stage Tuning Pipeline
def multi_stage_tuning(self):
    """Progressive tuning with increasing complexity."""
    # Stage 1: Core parameters
    study1 = optuna.create_study(direction='maximize')
    study1.optimize(self._stage1_objective, n_trials=50)

    # Stage 2: Regularization (using best from stage 1)
    self.base_params = study1.best_params
    study2 = optuna.create_study(direction='maximize')
    study2.optimize(self._stage2_objective, n_trials=30)

    # Stage 3: Fine-tuning
    self.reg_params = study2.best_params
    study3 = optuna.create_study(direction='maximize')
    study3.optimize(self._stage3_objective, n_trials=20)

    return {**self.base_params, **self.reg_params, **study3.best_params}

def _stage1_objective(self, trial):
    """Focus on tree structure parameters."""
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }
    return self._evaluate_params(params)
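_evaluate_params (and the analogous _stage2_objective and _stage3_objective) are referenced but not shown. A plausible implementation of the shared evaluator, sketched here as an assumption: it merges the stage's parameters over fixed defaults plus any earlier-stage winners, then scores them with lgb.cv. Result-key naming follows LightGBM 4.x, where means end in '-mean':

def _evaluate_params(self, stage_params):
    """Sketch: score a partial parameter dict with lgb.cv."""
    params = {
        'objective': self.config['objective'],
        'metric': self.config['metric'],
        'verbosity': -1,
        'seed': 42,
        **getattr(self, 'base_params', {}),   # stage-1 winners, if any
        **getattr(self, 'reg_params', {}),    # stage-2 winners, if any
        **stage_params,
    }
    if self.task_type == 'multiclass':
        params['num_class'] = len(np.unique(self.y))
    cv_result = lgb.cv(
        params,
        lgb.Dataset(self.X, label=self.y),
        num_boost_round=1000,
        nfold=self.cv_folds,
        stratified=self.task_type != 'regression',
        callbacks=[lgb.early_stopping(50)],
    )
    mean_scores = next(v for k, v in cv_result.items() if k.endswith('-mean'))
    best = mean_scores[-1]
    # AUC is already higher-is-better; negate loss metrics so every task maximizes
    return best if self.config['metric'] == 'auc' else -best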
Production-Ready Configuration
Efficient Memory and Speed Optimization
def create_production_config(self, params):
    """Convert tuned parameters to production settings."""
    production_params = params.copy()

    # Memory optimization
    production_params.update({
        'force_col_wise': True,
        'histogram_pool_size': 1024,  # MB cap for the histogram pool
        'max_bin': min(params.get('max_bin', 255), 255),
        'bin_construct_sample_cnt': 200000
    })

    # Speed optimization on large data: col-wise and row-wise histogram
    # building are mutually exclusive, so switch modes rather than add one
    if self.X.shape[0] > 100000:
        production_params['force_col_wise'] = False
        production_params['force_row_wise'] = True

    # Deterministic results
    production_params.update({
        'deterministic': True,
        'seed': 42,
        'bagging_seed': 42,
        'feature_fraction_seed': 42
    })
    return production_params
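Production parameters are typically persisted alongside the model artifact so training and serving stay in sync. A minimal sketch using the standard library; the file names and the tuner/model variables are illustrative:

import json

# Persist the tuned production parameters next to the model artifact
production_params = tuner.create_production_config(best_params)
with open('lgbm_params.json', 'w') as f:
    json.dump(production_params, f, indent=2)

model.save_model('lgbm_model.txt')  # LightGBM's native text format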
Categorical Feature Handling
def tune_with_categorical_features(self, categorical_features):
    """Specialized tuning for datasets with categorical features."""
    def objective_with_cat(trial):
        params = self._base_params(trial)
        # Categorical-specific parameters
        params.update({
            'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
            'cat_l2': trial.suggest_float('cat_l2', 1.0, 100.0),
            'max_cat_threshold': trial.suggest_int('max_cat_threshold', 16, 64)
        })
        return self._evaluate_with_categorical(params, categorical_features)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective_with_cat, n_trials=self.n_trials)
    return study.best_params
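_evaluate_with_categorical is referenced above but not defined. The essential point is passing categorical_feature to lgb.Dataset so LightGBM applies its native categorical split handling; a sketch of an assumed implementation, mirroring the lgb.cv evaluator above:

def _evaluate_with_categorical(self, params, categorical_features):
    """Sketch: CV evaluation with LightGBM's native categorical handling."""
    cv_result = lgb.cv(
        {**params,
         'objective': self.config['objective'],
         'metric': self.config['metric'],
         'verbosity': -1},
        lgb.Dataset(self.X, label=self.y,
                    categorical_feature=categorical_features),
        num_boost_round=1000,
        nfold=self.cv_folds,
        callbacks=[lgb.early_stopping(50)],
    )
    mean_scores = next(v for k, v in cv_result.items() if k.endswith('-mean'))
    best = mean_scores[-1]
    return best if self.config['metric'] == 'auc' else -best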
Advanced Validation Strategies
Time Series Aware Tuning
def time_series_tuning(self, time_column, n_splits=5):
    """Tuning with time-aware cross-validation.

    Assumes self.X is already sorted by time_column, so TimeSeriesSplit
    always trains on earlier rows and validates on later ones."""
    from sklearn.model_selection import TimeSeriesSplit

    def ts_objective(trial):
        params = self._base_params(trial)
        tscv = TimeSeriesSplit(n_splits=n_splits)
        scores = []
        for train_idx, val_idx in tscv.split(self.X):
            # Splits preserve temporal ordering: train precedes validation
            score = self._train_and_evaluate(train_idx, val_idx, params)
            scores.append(score)
        return np.mean(scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(ts_objective, n_trials=self.n_trials)
    return study.best_params
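_train_and_evaluate is the per-split worker referenced above; a sketch of an assumed implementation, using negative RMSE to match the regression scoring in the main template:

def _train_and_evaluate(self, train_idx, val_idx, params):
    """Sketch: train on the earlier window, score on the later one."""
    X_train, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
    y_train, y_val = self.y.iloc[train_idx], self.y.iloc[val_idx]
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    model = lgb.train(
        params, train_data, valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    return -(mean_squared_error(y_val, y_pred) ** 0.5)  # negative RMSE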
Performance Monitoring and Early Stopping
def tune_with_monitoring(self):
    """Add performance monitoring and intelligent early stopping."""
    # Build the CV splitter here; __init__ does not define one
    if self.task_type in ['binary', 'multiclass']:
        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=42)
    else:
        cv = KFold(n_splits=self.cv_folds, shuffle=True, random_state=42)

    def monitored_objective(trial):
        # Prune unpromising trials early
        params = self._base_params(trial)
        scores = []
        for fold, (train_idx, val_idx) in enumerate(cv.split(self.X, self.y)):
            score = self._quick_evaluate(train_idx, val_idx, params)
            scores.append(score)
            # Report intermediate results for pruning
            trial.report(np.mean(scores), fold)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return np.mean(scores)

    # Use pruning for efficiency
    study = optuna.create_study(
        direction='maximize',
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10)
    )
    study.optimize(monitored_objective, n_trials=self.n_trials)
    return study.best_params
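As an alternative to fold-level pruning, Optuna ships a LightGBM callback that prunes within a single training run. A sketch, assuming the optuna-integration package is installed; the function name and signature here are illustrative:

from optuna.integration import LightGBMPruningCallback

def pruning_objective(trial, params, train_data, val_data):
    """Sketch: prune a trial mid-training based on validation AUC."""
    pruning_cb = LightGBMPruningCallback(trial, 'auc')  # watches valid_0 by default
    model = lgb.train(
        params, train_data, valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[pruning_cb, lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    return model.best_score['valid_0']['auc']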
Usage Patterns
# Initialize tuner
tuner = LightGBMTuner(X_train, y_train, task_type='binary', n_trials=200)

# Run optimization
best_params = tuner.tune_with_monitoring()

# Create production model
production_params = tuner.create_production_config(best_params)
train_data = lgb.Dataset(X_train, label=y_train)
final_model = lgb.train(production_params, train_data, num_boost_round=1000)
Always validate final models on held-out test sets and monitor for overfitting during the tuning process. Use feature importance analysis to guide parameter selection and consider ensemble methods for critical applications.
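A closing sketch of that held-out check; X_test and y_test are assumed to come from a split made before tuning started:

from sklearn.metrics import roc_auc_score

# Final sanity check on data the tuner never saw
test_pred = final_model.predict(X_test)
print(f'Held-out AUC: {roc_auc_score(y_test, test_pred):.4f}')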