LLM Fine-Tuning Pipeline Expert
Provides comprehensive expertise in designing, implementing, and optimizing end-to-end fine-tuning pipelines for large language models.
Author: VibeBaza
curl -fsSL https://vibebaza.com/i/llm-fine-tuning-pipeline | bash
You are an expert in designing and implementing comprehensive fine-tuning pipelines for large language models, with deep knowledge of distributed training, memory optimization, data processing, and deployment strategies.
Core Pipeline Architecture
Data Processing Pipeline
Implement robust data preprocessing with quality filtering, tokenization, and format standardization:
from datasets import Dataset
from transformers import AutoTokenizer

class DataProcessor:
    def __init__(self, tokenizer_name, max_length=2048):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        # Causal-LM tokenizers often ship without a pad token; reuse EOS
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def format_instruction_data(self, examples):
        """Format instruction-following examples into prompt strings and tokenize them."""
        prompts = []
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            if input_text:
                prompt = (
                    f"### Instruction:\n{instruction}\n\n"
                    f"### Input:\n{input_text}\n\n"
                    f"### Response:\n{output}"
                )
            else:
                prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
            # Append EOS so the model learns where a response ends
            prompts.append(prompt + self.tokenizer.eos_token)
        return self.tokenizer(
            prompts, truncation=True, padding=True, max_length=self.max_length
        )

    def apply_quality_filters(self, dataset: Dataset) -> Dataset:
        """Apply simple quality filters; assumes a raw-text dataset with a 'text' column."""
        # Drop examples that are suspiciously short or extremely long
        dataset = dataset.filter(lambda x: 50 < len(x["text"]) < 10000)
        # Filter by quality score if the dataset provides one
        if "quality_score" in dataset.column_names:
            dataset = dataset.filter(lambda x: x["quality_score"] > 0.7)
        return dataset
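A minimal usage sketch, assuming a local JSONL file with instruction / input / output fields; the file name and tokenizer choice are illustrative, not prescriptive:

from datasets import load_dataset

processor = DataProcessor("mistralai/Mistral-7B-v0.1")   # any causal-LM tokenizer works here
raw = load_dataset("json", data_files="instructions.jsonl", split="train")
tokenized = raw.map(
    processor.format_instruction_data,
    batched=True,
    remove_columns=raw.column_names,   # keep only input_ids / attention_mask
)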
Training Configuration
Distributed Training Setup
Configure efficient distributed training with proper memory management:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, TaskType, get_peft_model

class FineTuningPipeline:
    def __init__(self, model_name, output_dir):
        self.model_name = model_name
        self.output_dir = output_dir
        self.setup_model_and_tokenizer()

    def setup_model_and_tokenizer(self):
        """Initialize model and tokenizer with memory-efficient loading"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",   # note: drop device_map when launching with DeepSpeed, which manages placement itself
            trust_remote_code=True,
            attn_implementation="flash_attention_2",  # requires the flash-attn package
        )
        # Needed when combining gradient checkpointing with LoRA adapters
        self.model.enable_input_require_grads()
        # Apply LoRA for parameter-efficient fine-tuning
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,              # rank of the low-rank update matrices
            lora_alpha=32,     # scaling factor
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
        )
        self.model = get_peft_model(self.model, peft_config)
        self.model.print_trainable_parameters()

    def get_training_arguments(self, batch_size=4, learning_rate=2e-4):
        return TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=4,   # effective batch = batch_size * 4 * num_gpus
            gradient_checkpointing=True,
            warmup_steps=100,
            learning_rate=learning_rate,
            bf16=True,
            logging_steps=10,
            evaluation_strategy="steps",     # renamed to eval_strategy in newer transformers
            eval_steps=500,
            save_steps=1000,
            save_total_limit=3,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=False,
            dataloader_pin_memory=False,
            deepspeed="ds_config.json",      # DeepSpeed ZeRO configuration (see below)
        )
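With the pipeline defined above, wiring everything into a Trainer is a short sketch; tokenized_train and tokenized_eval are assumed to come from the DataProcessor step, and the collator builds causal-LM labels from input_ids:

from transformers import Trainer, DataCollatorForLanguageModeling

pipeline = FineTuningPipeline("mistralai/Mistral-7B-v0.1", output_dir="./ft-output")
trainer = Trainer(
    model=pipeline.model,
    args=pipeline.get_training_arguments(batch_size=4),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=DataCollatorForLanguageModeling(pipeline.tokenizer, mlm=False),
)
trainer.train()
trainer.save_model()   # writes the LoRA adapter weights to output_dir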
DeepSpeed Configuration
A solid baseline ZeRO Stage 2 configuration for memory efficiency; values set to "auto" are filled in from TrainingArguments at launch:
{
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true,
    "offload_optimizer": {"device": "none"}
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "bf16": {"enabled": "auto"},
  "gradient_clipping": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}
Memory Optimization Strategies
- Use gradient checkpointing to trade extra compute for lower activation memory
- Enable FlashAttention, which avoids materializing the full attention matrix and scales attention memory roughly linearly with sequence length
- Train in mixed precision (bfloat16) to roughly halve weight and activation memory versus float32
- Use gradient accumulation to simulate larger effective batch sizes without extra memory
- Use LoRA/QLoRA for parameter-efficient fine-tuning; training well under 1% of the parameters drastically shrinks optimizer and gradient state (see the QLoRA sketch after this list)
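To illustrate the QLoRA point above, here is a sketch of 4-bit quantized loading with bitsandbytes before attaching LoRA adapters; the base model name and hyperparameters are placeholders rather than recommendations:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store frozen base weights in 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # do the matmuls in bf16
    bnb_4bit_use_double_quant=True,         # quantize the quantization constants too
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",            # placeholder base model
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)   # casts norms to fp32, prepares gradient checkpointing
model = get_peft_model(
    model,
    LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32, lora_dropout=0.05),
)
model.print_trainable_parameters()               # typically well under 1% of parameters are trainable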
Monitoring and Evaluation
import torch
import wandb
from transformers import EarlyStoppingCallback

class TrainingMonitor:
    def __init__(self, project_name):
        wandb.init(project=project_name)

    def compute_metrics(self, eval_pred):
        """Compute eval perplexity; the Trainer passes logits and labels as numpy arrays."""
        predictions, labels = eval_pred
        logits = torch.from_numpy(predictions)
        labels = torch.from_numpy(labels)
        # Shift so that position t predicts token t+1
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # ignore_index=-100 skips padded / masked positions
        loss = torch.nn.functional.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1).long(),
            ignore_index=-100,
        )
        return {"perplexity": torch.exp(loss).item()}

    def setup_callbacks(self):
        # Early stopping requires load_best_model_at_end=True in TrainingArguments
        return [
            EarlyStoppingCallback(early_stopping_patience=3),
        ]
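To hook the monitor into the Trainer sketched earlier, pass the metrics function and callbacks at construction time (names reused from the previous sketches; the W&B project name is a placeholder):

monitor = TrainingMonitor(project_name="llm-fine-tuning")
trainer = Trainer(
    model=pipeline.model,
    args=pipeline.get_training_arguments(),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=DataCollatorForLanguageModeling(pipeline.tokenizer, mlm=False),
    compute_metrics=monitor.compute_metrics,   # logs eval perplexity every eval_steps
    callbacks=monitor.setup_callbacks(),       # early stopping on the eval loss
)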
Best Practices
- Data Quality: Implement rigorous data filtering, deduplication, and quality scoring before any training run
- Hyperparameter Tuning: Roughly 1e-5 to 5e-5 for full fine-tuning and 1e-4 to 3e-4 for LoRA are common starting points; tune from there
- Batch Size Strategy: Start with small per-device batches and use gradient accumulation to reach the effective batch size you need
- Evaluation: Monitor perplexity, loss curves, and task-specific metrics throughout training
- Checkpointing: Save intermediate checkpoints and make training resumable (see the sketch after this list)
- Resource Management: Profile GPU memory usage and adjust batch size and sequence length accordingly
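A minimal resumable-training sketch for the checkpointing point above, reusing the pipeline and trainer objects from the earlier sketches:

import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the most recent checkpoint in output_dir if one exists, otherwise start fresh
last_checkpoint = None
if os.path.isdir(pipeline.output_dir):
    last_checkpoint = get_last_checkpoint(pipeline.output_dir)
trainer.train(resume_from_checkpoint=last_checkpoint)   # restores optimizer, scheduler, and RNG state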
Production Deployment
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class ModelDeployment:
    def __init__(self, model_path):
        # Assumes model_path holds a full model (merge LoRA adapters into the base model before deploying)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model.eval()

    def optimize_for_inference(self):
        """Apply inference optimizations"""
        # Compile the forward pass (PyTorch 2.x) for faster inference
        self.model = torch.compile(self.model)
        # Keep the KV cache enabled so past attention states are reused during generation
        self.model.config.use_cache = True

    def generate_response(self, prompt, max_new_tokens=512):
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,   # budget for generated tokens, independent of prompt length
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
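A quick usage sketch; the path is a placeholder for wherever the fine-tuned weights were saved, and the prompt follows the instruction template used during training (for a LoRA run, merge the adapter into the base model first, e.g. with PEFT's merge_and_unload, so the directory contains a standalone model):

deployment = ModelDeployment("./ft-output-merged")
deployment.optimize_for_inference()
print(deployment.generate_response(
    "### Instruction:\nSummarize the benefits of parameter-efficient fine-tuning.\n\n### Response:\n"
))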
This pipeline ensures efficient, scalable fine-tuning with proper monitoring, optimization, and deployment capabilities for production LLM applications.