Bug description
Using certain models (e.g., Llama 7B) with DeepSpeed Stage 3 Offload results in differing behavior between Trainer and Fabric under the same DeepSpeed settings. The former shards the parameters across processes, whereas the latter attempts to materialize the full model on each process's GPU individually, which then runs out of memory.
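For context, DeepSpeed's stage-3 sharding normally comes from constructing the model inside a ZeRO init context, which partitions parameters across ranks as each submodule is created. Below is a minimal sketch against the DeepSpeed API directly; the `ds_config` dict is illustrative only (not taken from this report), and this is not necessarily the exact code path Trainer takes internally:

```python
import deepspeed
from transformers import LlamaConfig, LlamaForCausalLM

# Illustrative ZeRO stage-3 config with parameter offload (assumed values).
ds_config = {
    "train_micro_batch_size_per_gpu": 2,
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
}

# Parameters are partitioned across ranks (and offloaded to CPU) as each
# submodule is built, so no single GPU ever materializes the full model.
with deepspeed.zero.Init(config_dict_or_path=ds_config):
    model = LlamaForCausalLM(LlamaConfig())
```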
What version are you seeing the problem on?
v2.0.3
How to reproduce the bug
The Fabric version, which OOMs while attempting to construct the full model on each process's GPU:
```python
from transformers import LlamaForCausalLM, LlamaConfig, BertTokenizer, PreTrainedTokenizer
from lightning import Fabric
from lightning.fabric.strategies import DeepSpeedStrategy
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, length: int):
        self.length = length
        self.tokenizer = tokenizer
        self.data = self.tokenizer(
            [f'Hello world {i}!' for i in range(length)],
            padding='longest',
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        return {
            'input_ids': self.data['input_ids'][index],
            'attention_mask': self.data['attention_mask'][index],
        }

    def __len__(self):
        return self.length


if __name__ == '__main__':
    dataset_len = 64

    # Build Fabric. Both of these options OOM.
    # fabric = Fabric(strategy='deepspeed_stage_3_offload', accelerator='gpu', devices=2, precision='16-mixed')
    strategy = DeepSpeedStrategy(stage=3, offload_parameters=True)
    fabric = Fabric(strategy=strategy, accelerator='gpu', devices=2, precision='16-mixed')
    fabric.launch()

    # Build model and tokenizer. Not intended usage, but for bug duplication purposes
    model = LlamaForCausalLM(LlamaConfig())
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build dataset and dataloader
    ds = TextDataset(tokenizer, dataset_len)
    dl = DataLoader(ds, batch_size=2)

    # Set up model and dataloader with Fabric
    model = fabric.setup(model)
    dl = fabric.setup_dataloaders(dl)

    # Predict
    generated = model.generate(**next(iter(dl)))
    print(generated)
    # NOTE Fabric tries to load full model on a GPU, then OOMs
```
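A possible workaround (an assumption on my part, not verified to restore stage-3 sharding) is to stop Fabric from eagerly moving the module to the device during setup via the existing `move_to_device` flag:

```python
# Hedged workaround sketch: keep Fabric from transferring the full module to
# the GPU in setup(); DeepSpeed then decides parameter placement when the
# engine is initialized. Unverified whether this avoids the OOM.
model = fabric.setup(model, move_to_device=False)
```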
The following version with Trainer works without issue:
```python
from transformers import LlamaForCausalLM, LlamaConfig, BertTokenizer, PreTrainedTokenizer, PreTrainedModel
from lightning import Trainer, LightningModule
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, length: int):
        self.length = length
        self.tokenizer = tokenizer
        self.data = self.tokenizer(
            [f'Hello world {i}!' for i in range(length)],
            padding='longest',
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        return {
            'input_ids': self.data['input_ids'][index],
            'attention_mask': self.data['attention_mask'][index],
        }

    def __len__(self):
        return self.length


class BoringModel(LightningModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self.model = model

    def predict_step(self, batch, batch_idx):
        return self.model.generate(**batch)


if __name__ == '__main__':
    dataset_len = 64

    # Build model and tokenizer. Not intended usage, but for bug duplication purposes
    model = BoringModel(LlamaForCausalLM(LlamaConfig()))
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build dataset and dataloader
    ds = TextDataset(tokenizer, dataset_len)
    dl = DataLoader(ds, batch_size=2)

    # Build trainer
    trainer = Trainer(
        strategy='deepspeed_stage_3_offload',
        accelerator='gpu',
        devices=2,
        precision='16-mixed'
    )

    # Predict
    generated = trainer.predict(model, dl)
    print(generated)
```
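For completeness, the `'deepspeed_stage_3_offload'` preset string should be roughly equivalent to passing the strategy object explicitly (a sketch; preset defaults may differ slightly across versions), which makes the two reproductions comparable:

```python
from lightning import Trainer
from lightning.pytorch.strategies import DeepSpeedStrategy

# Assumed expansion of the 'deepspeed_stage_3_offload' preset: ZeRO stage 3
# with both optimizer state and parameters offloaded to CPU.
trainer = Trainer(
    strategy=DeepSpeedStrategy(stage=3, offload_optimizer=True, offload_parameters=True),
    accelerator='gpu',
    devices=2,
    precision='16-mixed',
)
```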
Error messages and logs
For the first (Fabric) example, which OOMs:
```
OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 15.77 GiB total capacity; 15.27 GiB already allocated; 143.38 MiB free; 15.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
Environment
- PyTorch Lightning Version: 2.0.3
- PyTorch Version: 2.0.1
- Python version: 3.9.7
- OS: Linux
- CUDA/cuDNN version: 11.7
- GPU models and configuration: Tesla V100-SXM2-16GB
- How you installed Lightning: pip